Commit c59c1fe8 authored by yihua.huang's avatar yihua.huang

update comments

parent 59aad6a7
package us.codecraft.webmagic; package us.codecraft.webmagic;
import us.codecraft.webmagic.model.annotation.Experimental;
import java.util.Collection; import java.util.Collection;
/** /**
* 实现此接口以进行支持爬虫分页抓取。<br> * Extract an object that spans more than one page, such as news and articles.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* Time: 下午5:18 <br>
*/ */
@Experimental
public interface MultiPageModel { public interface MultiPageModel {
/**
* Page key is the identifier for the object.
*
* @return page key
*/
public String getPageKey(); public String getPageKey();
public Collection<String> getOtherPages(); /**
* Page is the identifier of a single page among the pages of one object.
*
* @return page
*/
public String getPage(); public String getPage();
/**
* other pages to be extracted.<br>
* It is used to judge whether an object contains more than one page, and whether all pages of the object have been extracted.
*
* @return other pages
*/
public Collection<String> getOtherPages();
/**
* Combine multiPageModels into a whole object.
*
* @param multiPageModel
* @return multiPageModel combined
*/
public MultiPageModel combine(MultiPageModel multiPageModel); public MultiPageModel combine(MultiPageModel multiPageModel);
} }
...@@ -6,8 +6,6 @@ import us.codecraft.webmagic.Page; ...@@ -6,8 +6,6 @@ import us.codecraft.webmagic.Page;
* 实现这个接口即可在抽取后进行后处理。<br> * 实现这个接口即可在抽取后进行后处理。<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 上午9:42 <br>
*/ */
public interface AfterExtractor { public interface AfterExtractor {
......
...@@ -5,10 +5,26 @@ import us.codecraft.webmagic.Spider; ...@@ -5,10 +5,26 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* 基于Model的Spider,封装后的入口类。<br> * The spider for the page model extractor.<br>
* In webmagic, we call a POJO containing the extraction result a "page model". <br>
* You can customize a crawler by writing a page model with annotations. <br>
* Such as:
* <pre>
* {@literal @}TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
* public class OschinaBlog{
*
* {@literal @}ExtractBy("//title")
* private String title;
*
* {@literal @}ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
* private String content;
*
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
* private List&lt;String&gt; tags;
* }
</pre>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br> * @since 0.2.0
* Time: 上午9:51 <br>
*/ */
public class OOSpider extends Spider { public class OOSpider extends Spider {
......
<html> <html>
<body> <body>
webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。 Page model and annotations used to customize a crawler.
</body> </body>
</html> </html>
...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.utils; ...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.utils;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* Date Dec 14, 2012
*/ */
import java.util.HashMap; import java.util.HashMap;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment