Commit 5f1f4cbc authored by yihua.huang

update comments

parent 6cc1d62a
@@ -8,30 +8,19 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
- * <pre class="zh">
- * Page stores the result of the last fetch, and can define the links to be crawled next.
- *
- * Main methods:
- * {@link #getUrl()} get the url of the page
- * {@link #getHtml()} get the html content of the page
- * {@link #putField(String, Object)} save an extracted result
- * {@link #getResultItems()} get the extracted results, called in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add links to crawl
- *
- * </pre>
- * <pre class="en">
- * Store extracted result and urls to be crawled.
- *
- * Main method:
- * {@link #getUrl()} get url of current page
- * {@link #getHtml()} get content of current page
- * {@link #putField(String, Object)} save extracted result
- * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
- *
- * </pre>
+ * Object storing the extracted result and the urls to be crawled.<br>
+ *
+ * Main methods: <br>
+ * {@link #getUrl()} get the url of the current page <br>
+ * {@link #getHtml()} get the content of the current page <br>
+ * {@link #putField(String, Object)} save an extracted result <br>
+ * {@link #getResultItems()} get the extracted results, to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
 *
 * @author code4crafter@gmail.com <br>
+ * @since 0.1.0
+ * @see us.codecraft.webmagic.downloader.Downloader
+ * @see us.codecraft.webmagic.processor.PageProcessor
 */
 public class Page {
@@ -55,19 +44,19 @@ public class Page {
     }
 
     /**
+     * store an extracted result
      *
-     * @param key the key of the result
-     * @param field the value of the result
+     * @param key
+     * @param field
      */
     public void putField(String key, Object field) {
         resultItems.put(key, field);
     }
 
     /**
      * get the html content of the page
      *
-     * @return html the html content of the page
+     * @return html
      */
     public Selectable getHtml() {
         return html;
@@ -82,9 +71,9 @@ public class Page {
     }
 
     /**
      * add urls to crawl
      *
-     * @param requests the urls to crawl
+     * @param requests
      */
     public void addTargetRequests(List<String> requests) {
         synchronized (targetRequests) {
@@ -99,9 +88,9 @@ public class Page {
     }
 
     /**
      * add a url to crawl
      *
-     * @param requestString the url to crawl
+     * @param requestString
      */
     public void addTargetRequest(String requestString) {
         if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
@@ -114,9 +103,9 @@ public class Page {
     }
 
     /**
-     * add a page to crawl, used when extra information has to be passed along
+     * add a request to crawl
      *
-     * @param request the page to crawl
+     * @param request
      */
     public void addTargetRequest(Request request) {
         synchronized (targetRequests) {
@@ -125,27 +114,22 @@ public class Page {
     }
 
     /**
      * get the url of the current page
      *
-     * @return url the url of the current page, usable for extraction
+     * @return the url of the current page
      */
     public Selectable getUrl() {
         return url;
     }
 
-    /**
-     * set the url
-     *
-     * @param url
-     */
     public void setUrl(Selectable url) {
         this.url = url;
     }
 
     /**
      * get the request of the current page
      *
-     * @return request the crawl request
+     * @return the request
      */
     public Request getRequest() {
         return request;
......
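Taken together, the Page methods above are the surface a PageProcessor works against. A minimal sketch of such a processor, assuming the 0.x API in this diff (the domain, url pattern and xpath are illustrative):

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class OschinaBlogProcessor implements PageProcessor {

    private final Site site = Site.me().setDomain("my.oschina.net");

    @Override
    public void process(Page page) {
        // follow every blog link found on the current page
        page.addTargetRequests(page.getHtml().links().regex(".*/blog/.*").all());
        // save extracted fields; a Pipeline reads them back from ResultItems
        page.putField("url", page.getUrl().toString());
        page.putField("title", page.getHtml().xpath("//title/text()").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }
}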
 package us.codecraft.webmagic;
 
+import us.codecraft.webmagic.utils.Experimental;
+
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
 
 /**
- * <div class="zh">
- * A Request object encapsulates the url to be crawled.<br/>
- * In a PageProcessor, the Request object can be obtained with {@link us.codecraft.webmagic.Page#getRequest()}.<br/>
- * <br/>
- * A Request contains an extra property which can carry any required context; this is useful in some scenarios.<br/>
- * <pre>
- * Example:
- * While crawling <a href="${link}">${linktext}</a>, we want to follow the link and keep the linktext.
- * In the previous page:
- * public void process(Page page){
- *     Request request = new Request(link, linktext);
- *     page.addTargetRequest(request)
- * }
- * In the next page:
- * public void process(Page page){
- *     String linktext = (String) page.getRequest().getExtra()[0];
- * }
- * </pre>
- * </div>
+ * Object containing a url to crawl.<br>
+ * It can also carry some additional information.<br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 11:37 AM
+ * @since 0.1.0
 */
 public class Request implements Serializable {
@@ -36,20 +20,22 @@ public class Request implements Serializable {
     private String url;
 
     /**
-     * extra parameters, to carry any needed context information
+     * Store additional information in extras.
      */
     private Map<String, Object> extras;
 
+    /**
+     * Priority of the request.<br>
+     * The bigger, the earlier it is processed. <br>
+     * Needs a scheduler supporting priority.<br>
+     * But no scheduler in webmagic supports priority yet (:
+     */
+    @Experimental
     private double priority;
 
     public Request() {
     }
 
-    /**
-     * build a request object
-     *
-     * @param url required parameter, the url to crawl
-     */
     public Request(String url) {
         this.url = url;
     }
@@ -59,12 +45,14 @@ public class Request implements Serializable {
     }
 
     /**
-     * Set the priority, used for sorting the url queue<br>
-     * Requires a Scheduler extension<br>
-     * There is no Scheduler implementation supporting priority yet =.= <br>
-     * @param priority the priority; the bigger, the earlier
+     * Set the priority of the request for sorting.<br>
+     * Needs a scheduler supporting priority.<br>
+     * But no scheduler in webmagic supports priority yet (:
+     *
+     * @param priority
      * @return this
      */
+    @Experimental
     public Request setPriority(double priority) {
         this.priority = priority;
         return this;
@@ -85,11 +73,6 @@ public class Request implements Serializable {
         return this;
     }
 
-    /**
-     * get the url to crawl
-     *
-     * @return url the url to crawl
-     */
    public String getUrl() {
        return url;
    }
......
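The extras map is how context travels from one page to the next, replacing the getExtra() array used by the removed Chinese example. A hedged sketch of that same pattern, assuming the putExtra/getExtra accessors over the extras map (link and linktext are placeholders):

// in the processor of the listing page: keep the anchor text together with the link
Request request = new Request(link).putExtra("linktext", linktext);
page.addTargetRequest(request);

// in the processor of the detail page: read the context back
String linktext = (String) page.getRequest().getExtra("linktext");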
@@ -4,10 +4,13 @@ import java.util.HashMap;
 import java.util.Map;
 
 /**
- * Holds the extraction results, produced by the PageProcessor and passed to {@link us.codecraft.webmagic.pipeline.Pipeline} for persistence.<br>
+ * Object containing the extracted results.<br>
+ * It is contained in Page and will be processed by the pipeline.
+ *
 * @author code4crafter@gmail.com <br>
- * Date: 13-7-25 <br>
- * Time: 12:20 PM <br>
+ * @since 0.1.0
+ * @see Page
+ * @see us.codecraft.webmagic.pipeline.Pipeline
 */
 public class ResultItems {

@@ -25,7 +28,7 @@ public class ResultItems {
         return (T) fields.get(key);
     }
 
     public Map<String, Object> getAll() {
         return fields;
     }
@@ -44,8 +47,10 @@ public class ResultItems {
     }
 
     /**
-     * Whether to skip this page; used by the pipeline to decide whether to process it
-     * @return whether to skip, true means skip
+     * Whether to skip the result.<br>
+     * A result which is skipped will not be processed by the Pipeline.
+     *
+     * @return whether to skip the result
      */
     public boolean isSkip() {
         return skip;
@@ -53,8 +58,10 @@ public class ResultItems {
 
     /**
-     * Set whether to skip this page; used by the pipeline to decide whether to process it
-     * @param skip
+     * Set whether to skip the result.<br>
+     * A result which is skipped will not be processed by the Pipeline.
+     *
+     * @param skip whether to skip the result
      * @return this
      */
     public ResultItems setSkip(boolean skip) {
......
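Skipped results never reach a Pipeline: as the Spider loop later in this commit shows, pipelines run only when !page.getResultItems().isSkip(). A minimal console Pipeline, sketched against the interface used here:

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class ConsolePipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // print every field the PageProcessor stored with putField
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}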
@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
 import java.util.*;
 
 /**
- * Site defines the settings of a site to be crawled.<br>
- * The getters of this class are generally only called inside the crawler framework.<br>
+ * Object containing the settings of the crawler.<br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 12:13 PM
+ * @since 0.1.0
+ * @see us.codecraft.webmagic.processor.PageProcessor
 */
 public class Site {

@@ -22,6 +21,9 @@ public class Site {
     private String charset;
 
+    /**
+     * startUrls are the urls the crawler starts with.
+     */
     private List<String> startUrls = new ArrayList<String>();
 
     private int sleepTime = 3000;
@@ -37,19 +39,19 @@ public class Site {
     }
 
     /**
-     * Create a Site object, equivalent to new Site()
+     * create a new Site
      *
-     * @return the new object
+     * @return the new Site
      */
     public static Site me() {
         return new Site();
     }
 
     /**
-     * Add a cookie for this site, useful for crawling sites that require a login. The cookie's domain is that of {@link #getDomain()}
+     * Add a cookie with the domain {@link #getDomain()}
      *
-     * @param name the name of the cookie
-     * @param value the value of the cookie
+     * @param name
+     * @param value
      * @return this
      */
     public Site addCookie(String name, String value) {
@@ -58,7 +60,7 @@ public class Site {
     }
 
     /**
-     * Set the user-agent for this site; many sites restrict by user-agent, and leaving it unset may give unexpected results.
+     * set the user agent
      *
      * @param userAgent userAgent
     * @return this
@@ -69,27 +71,27 @@ public class Site {
     }
 
     /**
-     * get all the cookies that have been set
+     * get the cookies
      *
-     * @return all the cookies that have been set
+     * @return the cookies
      */
     public Map<String, String> getCookies() {
         return cookies;
     }
 
     /**
-     * get the user-agent that has been set
+     * get the user agent
      *
-     * @return the user-agent that has been set
+     * @return the user agent
      */
     public String getUserAgent() {
         return userAgent;
     }
 
     /**
-     * get the domain that has been set
+     * get the domain
      *
-     * @return the domain that has been set
+     * @return the domain
      */
     public String getDomain() {
         if (domain == null) {
@@ -101,10 +103,9 @@ public class Site {
     }
 
     /**
-     * Set the domain of the site; required.<br>
-     * Crawling several domains is not supported yet; to crawl several domains, create one Spider per domain.
+     * set the domain of the site
      *
-     * @param domain the domain the crawler will crawl
+     * @param domain
      * @return this
      */
     public Site setDomain(String domain) {
@@ -113,10 +114,10 @@ public class Site {
     }
 
     /**
-     * Set the page charset; if unset, it is detected automatically from the html meta information.<br>
-     * Usually the encoding does not have to be set; set it when the downloaded content comes out garbled.<br>
+     * Set the charset of the page manually.<br>
+     * When the charset is not set or set to null, it is auto detected from the Http header.
      *
-     * @param charset the encoding, mostly "utf-8" or "gbk"
+     * @param charset
      * @return this
      */
     public Site setCharset(String charset) {
@@ -125,20 +126,21 @@ public class Site {
     }
 
     /**
-     * get the charset that has been set
+     * get the manually set charset
      *
-     * @return the domain that has been set
+     * @return the charset
      */
     public String getCharset() {
         return charset;
     }
 
     /**
-     * Set the acceptable http status codes; the page content is only read when the status code is in this set.<br>
-     * {200} by default; normally this does not have to be set.<br>
-     * Some sites return wrong status codes, in which case this option can be set.<br>
+     * Set acceptStatCode.<br>
+     * When the status code of the http response is in acceptStatCodes, the page will be processed.<br>
+     * {200} by default.<br>
+     * It does not have to be set in most cases.<br>
      *
-     * @param acceptStatCode the acceptable status codes
+     * @param acceptStatCode
      * @return this
      */
     public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
@@ -147,27 +149,27 @@ public class Site {
     }
 
     /**
-     * get the acceptable status codes
+     * get acceptStatCode
      *
-     * @return the acceptable status codes
+     * @return acceptStatCode
      */
     public Set<Integer> getAcceptStatCode() {
         return acceptStatCode;
     }
 
     /**
      * get the start urls
      *
-     * @return the list of start urls
+     * @return the start urls
      */
     public List<String> getStartUrls() {
         return startUrls;
     }
 
     /**
-     * Add a start url; call this method repeatedly to add several start urls.
+     * Add a url to the start urls.
      *
-     * @param startUrl the url of a start page
+     * @param startUrl
      * @return this
      */
     public Site addStartUrl(String startUrl) {
@@ -176,9 +178,10 @@ public class Site {
     }
 
     /**
-     * Set the interval between two fetches, to avoid putting too much pressure on the target site (or being blocked by a firewall...).
+     * Set the interval between the processing of two pages.<br>
+     * The time unit is milliseconds.<br>
      *
-     * @param sleepTime in milliseconds
+     * @param sleepTime
      * @return this
      */
     public Site setSleepTime(int sleepTime) {
@@ -187,25 +190,26 @@ public class Site {
     }
 
     /**
-     * get the interval between two fetches
+     * Get the interval between the processing of two pages.<br>
+     * The time unit is milliseconds.<br>
      *
-     * @return the interval between two fetches, in milliseconds
+     * @return the interval between the processing of two pages
      */
     public int getSleepTime() {
         return sleepTime;
     }
 
     /**
-     * get the retry count after a failed download, 0 by default
+     * Get the retry times when a download fails, 0 by default.<br>
      *
-     * @return the retry count
+     * @return the retry times when a download fails
      */
     public int getRetryTimes() {
         return retryTimes;
     }
 
     /**
-     * set the retry count after a failed download, 0 by default
+     * Set the retry times when a download fails, 0 by default.<br>
      *
      * @return this
     */
......
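All the Site setters above return this, so a configuration usually reads as one chained expression. A sketch with illustrative values:

Site site = Site.me()
        .setDomain("my.oschina.net")
        .addStartUrl("http://my.oschina.net/")
        .setCharset("utf-8")
        .setUserAgent("Mozilla/5.0 (compatible; webmagic)")
        .addCookie("uid", "12345")
        .setSleepTime(500)    // milliseconds between two pages
        .setRetryTimes(3);    // retries when a download fails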
@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
- * <pre>
- * The entry class of a webmagic crawler.
- *
- * Examples:
- * Define the simplest crawler:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
- *
- * Use FilePipeline to save results to files:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *        .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
- *
- * Use FileCacheQueueScheduler to cache urls, so that after a shutdown the crawler resumes from where it stopped:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *        .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * </pre>
+ * Entrance of a crawler.<br>
+ * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
+ * Every module is a field of Spider. <br>
+ * The modules are defined as interfaces. <br>
+ * You can customize a spider with various implementations of them. <br>
+ * Examples: <br>
+ * <br>
+ * A simple crawler: <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
+ * <br>
+ * Store results to files with FilePipeline: <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
+ *        .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
+ * <br>
+ * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume after a shutdown. <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
+ *        .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 6:53 AM
+ * @see Downloader
+ * @see Scheduler
+ * @see PageProcessor
+ * @see Pipeline
+ * @since 0.1.0
 */
 public class Spider implements Runnable, Task {
@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
 
     /**
      * Run a crawler test with some specific urls.
+     *
      * @param urls the urls to crawl
      */
-    public void test(String... urls){
+    public void test(String... urls) {
         checkComponent();
-        if (urls.length>0){
+        if (urls.length > 0) {
             for (String url : urls) {
                 processRequest(new Request(url));
             }
@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
         }
         pageProcessor.process(page);
         addRequest(page);
-        if (!page.getResultItems().isSkip()){
+        if (!page.getResultItems().isSkip()) {
             for (Pipeline pipeline : pipelines) {
                 pipeline.process(page.getResultItems(), this);
             }
@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
         return this;
     }
 
-    public Spider clearPipeline(){
-        pipelines=new ArrayList<Pipeline>();
+    public Spider clearPipeline() {
+        pipelines = new ArrayList<Pipeline>();
         return this;
     }
......
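Wired together, the four modules give the one-liner crawlers from the class comment. Combining the Javadoc examples into one runnable sketch:

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;

public class Demo {
    public static void main(String[] args) {
        // crawl my.oschina.net blogs, persist results to files, and keep the url queue on disk
        Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
                .pipeline(new FilePipeline("/data/temp/webmagic/"))
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                .run();
    }
}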
@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
      * a convenience method to download a page directly
      *
      * @param url
-     * @return
+     * @return html
      */
     public Html download(String url) {
         Page page = download(new Request(url), null);
......
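This convenience method makes the downloader usable on its own for one-off fetches. A sketch, assuming HttpClientDownloader has a no-arg constructor:

// fetch a single page outside a Spider and query it directly
Html html = new HttpClientDownloader().download("http://my.oschina.net/");
System.out.println(html.xpath("//title/text()"));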
@@ -2,9 +2,6 @@
 <body>
 <div class="en">
     Main class "Spider" and models.
 </div>
-<div class="zh">
-    Includes the webmagic entry class Spider and some data transfer entity classes.
-</div>
 </body>
 </html>
-package us.codecraft.webmagic.model.annotation;
+package us.codecraft.webmagic.utils;
 
 /**
  * @author code4crafter@gmail.com <br>
......
 package us.codecraft.webmagic;
 
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 
 import java.util.Collection;
......
@@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.*;
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
 import us.codecraft.webmagic.processor.SimplePageProcessor;
......
 package us.codecraft.webmagic.model;
 
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 
 /**
  * Interface to be implemented by page models.<br>
......
@@ -21,7 +21,7 @@ public @interface ComboExtract {
      */
     ExtractBy[] value();
 
-    enum Op {
+    public static enum Op {
         /**
          * All extractors will be arranged as a pipeline. <br>
          * The next extractor uses the result of the previous one as its source.

@@ -49,7 +49,10 @@ public @interface ComboExtract {
      */
     boolean notNull() default false;
 
-    public enum Source {
+    /**
+     * types of sources for extracting.
+     */
+    public static enum Source {
         /**
          * extract from the content extracted by the class extractor
         */
......
@@ -21,7 +21,10 @@ public @interface ExtractBy {
      */
     String value();
 
-    public enum Type {XPath, Regex, Css}
+    /**
+     * types of extractor expressions
+     */
+    public static enum Type {XPath, Regex, Css}
 
     /**
      * Extractor type, supporting XPath, CSS Selector and regex.

@@ -38,7 +41,10 @@ public @interface ExtractBy {
      */
     boolean notNull() default false;
 
-    public enum Source {
+    /**
+     * types of sources for extracting.
+     */
+    public static enum Source {
         /**
          * extract from the content extracted by the class extractor
         */
......
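These annotations drive webmagic's annotation-based extraction: fields of a page model class are filled by the declared extractors. A hedged sketch of ExtractBy usage (field names and expressions are illustrative; the type attribute is assumed to default to XPath):

public class BlogPost {

    // XPath extractor (assumed default type)
    @ExtractBy("//h1[@class='title']/text()")
    private String title;

    // CSS selector extractor; notNull makes the page be skipped when nothing matches
    @ExtractBy(value = "div.content", type = ExtractBy.Type.Css, notNull = true)
    private String content;
}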
@@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
 import us.codecraft.webmagic.MultiPageModel;
 import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 import us.codecraft.webmagic.utils.DoubleKeyMap;
 
 import java.util.*;
......
@@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
- * Store urls and cursor in files so that a Spider can resume the status when shutdown<br>
+ * Store urls and cursor in files so that a Spider can resume its status after a shutdown.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0
......