Commit 5f1f4cbc authored by yihua.huang

update comments

parent 6cc1d62a
......@@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List;
/**
* <pre class="zh">
* Page stores the result of the last fetch and can define the links to be crawled next.
*
* Main methods:
* {@link #getUrl()} get the url of the page
* {@link #getHtml()} get the html content of the page
* {@link #putField(String, Object)} save an extracted result
* {@link #getResultItems()} get the extracted results, used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add links to be crawled
*
* </pre>
* <pre class="en">
* Store extracted result and urls to be crawled.
*
* Main methods:
* {@link #getUrl()} get the url of the current page
* {@link #getHtml()} get the html content of the current page
* {@link #putField(String, Object)} save an extracted result
* {@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
*
* </pre>
* Object storing extracted results and urls to be crawled.<br>
* Main methods: <br>
* {@link #getUrl()} get the url of the current page <br>
* {@link #getHtml()} get the html content of the current page <br>
* {@link #putField(String, Object)} save an extracted result <br>
* {@link #getResultItems()} get the extracted results, to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Page {
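For context, a minimal PageProcessor sketch that exercises this Page API. It assumes the standard PageProcessor interface (process(Page) plus getSite()); the class name, XPath and regex expressions are illustrative, and the Selectable chain (links()/regex()/xpath()/all()) is assumed from the selector API rather than shown in this diff:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class MyBlogProcessor implements PageProcessor {

    private final Site site = Site.me().setDomain("my.oschina.net").setSleepTime(1000);

    @Override
    public void process(Page page) {
        // queue further links found on the current page
        page.addTargetRequests(page.getHtml().links().regex("http://my\\.oschina\\.net/.*/blog/.*").all());
        // store an extracted field for the pipelines
        page.putField("title", page.getHtml().xpath("//title/text()").toString());
        // skip pages that produced nothing useful
        if (page.getResultItems().get("title") == null) {
            page.getResultItems().setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}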
......@@ -55,19 +44,19 @@ public class Page {
}
/**
* store an extracted result
*
*
* @param key the key of the result
* @param field the value of the result
* @param key
* @param field
*/
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
* get the html content of the page
* get html content of page
*
* @return html the html content of the page
* @return html
*/
public Selectable getHtml() {
return html;
......@@ -82,9 +71,9 @@ public class Page {
}
/**
* add urls to be crawled
* add urls to crawl
*
* @param requests the urls to be crawled
* @param requests
*/
public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) {
......@@ -99,9 +88,9 @@ public class Page {
}
/**
* add a url to be crawled
* add url to crawl
*
* @param requestString the url to be crawled
* @param requestString
*/
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
......@@ -114,9 +103,9 @@ public class Page {
}
/**
* add a request to be crawled; use this when additional information needs to be passed along
* add a request to crawl
*
* @param request the request to be crawled
* @param request
*/
public void addTargetRequest(Request request) {
synchronized (targetRequests) {
......@@ -125,27 +114,22 @@ public class Page {
}
/**
* get the url of the page
* get url of current page
*
* @return url the url of the current page, usable for extraction
* @return url of current page
*/
public Selectable getUrl() {
return url;
}
/**
* set the url
*
* @param url
*/
public void setUrl(Selectable url) {
this.url = url;
}
/**
* get the request of this page
* get request of current page
*
* @return request the crawl request
* @return request
*/
public Request getRequest() {
return request;
......
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* <div class="zh">
* A Request object wraps the information of a url to be crawled.<br/>
* In a PageProcessor, the Request object can be obtained via {@link us.codecraft.webmagic.Page#getRequest()}.<br/>
* <br/>
* A Request object also carries an extra property, which can hold any required context; this is useful in some scenarios.<br/>
* <pre>
* Example:
* When crawling <a href="${link}">${linktext}</a>, we want to extract the link and also keep the linktext.
* In the previous page:
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request)
* }
* In the next page:
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* </div>
* Object containing the url to crawl.<br>
* It can also carry some additional information.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
Time: 11:37 AM
* @since 0.1.0
*/
public class Request implements Serializable {
......@@ -36,20 +20,22 @@ public class Request implements Serializable {
private String url;
/**
* Extra parameters; can hold any context information that is needed
* Store additional information in extras.
*/
private Map<String, Object> extras;
/**
* Priority of the request.<br>
* Requests with a bigger priority value will be processed earlier. <br>
* Needs a scheduler that supports priority.<br>
* No scheduler bundled with webmagic supports priority yet (:
*/
@Experimental
private double priority;
public Request() {
}
/**
* build a request object
*
* @param url required; the url to be crawled
*/
public Request(String url) {
this.url = url;
}
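As a small illustration of the constructor and the experimental priority setter shown in this file (the surrounding method and the link value are hypothetical):

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;

public class RequestUsageSketch {

    // called from inside a PageProcessor with the current page
    void queueWithPriority(Page page, String link) {
        Request request = new Request(link); // the url to be crawled
        request.setPriority(100);            // experimental: only useful with a priority-aware scheduler
        page.addTargetRequest(request);      // hand the request (with any extras) to the scheduler
    }
}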
......@@ -59,12 +45,14 @@ public class Request implements Serializable {
}
/**
* Set the priority, used for sorting the URL queue<br>
* Requires a Scheduler extension<br>
* There is no Scheduler implementation supporting priority yet =。= <br>
* @param priority the priority; bigger values are scheduled earlier
* Set the priority of the request for sorting.<br>
* Needs a scheduler that supports priority.<br>
* No scheduler bundled with webmagic supports priority yet (:
*
* @param priority
* @return this
*/
@Experimental
public Request setPriority(double priority) {
this.priority = priority;
return this;
......@@ -85,11 +73,6 @@ public class Request implements Serializable {
return this;
}
/**
* get the url to be crawled
*
* @return url the url to be crawled
*/
public String getUrl() {
return url;
}
......
......@@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map;
/**
* Class storing extracted results; produced by the PageProcessor and passed to {@link us.codecraft.webmagic.pipeline.Pipeline} for persistence.<br>
* Object containing extracted results.<br>
* It is contained in Page and will be processed by the Pipeline.
*
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
Time: 12:20 PM <br>
* @since 0.1.0
* @see Page
* @see us.codecraft.webmagic.pipeline.Pipeline
*/
public class ResultItems {
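For context, a minimal custom Pipeline sketch consuming ResultItems. The Pipeline signature is inferred from the pipeline.process(page.getResultItems(), this) call elsewhere in this diff, so treat it as an assumption:

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class ConsoleSketchPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Spider already filters skipped results, but a defensive check costs nothing
        if (resultItems.isSkip()) {
            return;
        }
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}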
......@@ -25,7 +28,7 @@ public class ResultItems {
return (T) fields.get(key);
}
public Map<String, Object> getAll() {
return fields;
}
......@@ -44,8 +47,10 @@ public class ResultItems {
}
/**
* Whether to ignore this page; used by the pipeline to decide whether to process it
* @return whether to ignore; true means the page is ignored
* Whether to skip the result.<br>
* A result that is skipped will not be processed by the Pipeline.
*
* @return whether to skip the result
*/
public boolean isSkip() {
return skip;
......@@ -53,8 +58,10 @@ public class ResultItems {
/**
* Set whether to ignore this page; used by the pipeline to decide whether to process it
* @param skip
* Set whether to skip the result.<br>
* A result that is skipped will not be processed by the Pipeline.
*
* @param skip whether to skip the result
* @return this
*/
public ResultItems setSkip(boolean skip) {
......
......@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
/**
* Site defines the information of a site to be crawled.<br>
* The getters of this class are normally only called internally by the crawler framework.<br>
* Object containing the settings of the crawler.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
Time: 12:13 PM
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Site {
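For reference, the setters documented below all return this, so a Site can be configured fluently. The values are placeholders, and the setUserAgent/setCharset names are assumed from the getters and javadoc visible in this diff:

import us.codecraft.webmagic.Site;

public class SiteConfigSketch {

    static final Site SITE = Site.me()
            .setDomain("my.oschina.net")            // required: the single domain to crawl
            .setCharset("utf-8")                    // optional: only if auto detection yields garbled text
            .setUserAgent("Mozilla/5.0 (webmagic)") // some sites reject requests without a user-agent
            .addCookie("uid", "42")                 // cookie bound to the domain above
            .addStartUrl("http://my.oschina.net/")  // entry page(s)
            .setSleepTime(1000);                    // pause between fetches, in milliseconds
}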
......@@ -22,6 +21,9 @@ public class Site {
private String charset;
/**
* startUrls are the urls the crawler starts with.
*/
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
......@@ -37,19 +39,19 @@ public class Site {
}
/**
* Create a Site object, equivalent to new Site()
* create a new Site
*
* @return the newly created Site
* @return new site
*/
public static Site me() {
return new Site();
}
/**
* Add a cookie for this site; useful for crawling sites that require login. The domain of the cookie is the same as {@link #getDomain()}
* Add a cookie with domain {@link #getDomain()}
*
* @param name the name of the cookie
* @param value the value of the cookie
* @param name
* @param value
* @return this
*/
public Site addCookie(String name, String value) {
......@@ -58,7 +60,7 @@ public class Site {
}
/**
* Set the user-agent for this site. Many sites restrict requests by user-agent; leaving this unset may produce unexpected results.
* set user agent
*
* @param userAgent userAgent
* @return this
......@@ -69,27 +71,27 @@ public class Site {
}
/**
* get all cookies that have been set
* get cookies
*
* @return all cookies that have been set
* @return cookies
*/
public Map<String, String> getCookies() {
return cookies;
}
/**
* get the user-agent that has been set
* get user agent
*
* @return the user-agent that has been set
* @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
* get the domain that has been set
* get domain
*
* @return the domain that has been set
* @return domain
*/
public String getDomain() {
if (domain == null) {
......@@ -101,10 +103,9 @@ public class Site {
}
/**
* Set the domain of the site. Required.<br>
* Crawling multiple domains is not supported yet; to crawl another domain, create a new Spider.
* set the domain of site.
*
* @param domain the domain the crawler will crawl
* @param domain
* @return this
*/
public Site setDomain(String domain) {
......@@ -113,10 +114,10 @@ public class Site {
}
/**
* Set the charset of the page; if not set, it is detected automatically from the Html meta information.<br>
* Usually there is no need to set the encoding; set it if the downloaded content turns out to be garbled.<br>
* Set the charset of the page manually.<br>
* When the charset is not set or set to null, it will be auto detected from the Http header.
*
* @param charset the encoding, typically "utf-8" or "gbk"
* @param charset
* @return this
*/
public Site setCharset(String charset) {
......@@ -125,20 +126,21 @@ public class Site {
}
/**
* get the charset that has been set
* get charset set manually
*
* @return the charset that has been set
* @return charset
*/
public String getCharset() {
return charset;
}
/**
* Set the acceptable http status codes; the page content is only read when the status code is in this set.<br>
* {200} by default; normally there is no need to set this.<br>
* Some sites return wrong status codes, in which case this option can be set.<br>
* Set acceptStatCode.<br>
* When the status code of the http response is in acceptStatCodes, the response will be processed.<br>
* {200} by default.<br>
* It does not normally need to be set.<br>
*
* @param acceptStatCode the acceptable status codes
* @param acceptStatCode
* @return this
*/
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
......@@ -147,27 +149,27 @@ public class Site {
}
/**
* get the acceptable status codes
* get acceptStatCode
*
* @return the acceptable status codes
* @return acceptStatCode
*/
public Set<Integer> getAcceptStatCode() {
return acceptStatCode;
}
/**
* get the list of start urls
* get start urls
*
* @return the list of start urls
* @return start urls
*/
public List<String> getStartUrls() {
return startUrls;
}
/**
* Add a start url; call this method repeatedly to add multiple start urls.
* Add a url to the start urls.<br>
*
* @param startUrl the url of a start page
* @param startUrl
* @return this
*/
public Site addStartUrl(String startUrl) {
......@@ -176,9 +178,10 @@ public class Site {
}
/**
* Set the interval between two fetches, to avoid putting too much pressure on the target site (or getting blocked by a firewall...).
* Set the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
*
* @param sleepTime the interval in milliseconds
* @param sleepTime
* @return this
*/
public Site setSleepTime(int sleepTime) {
......@@ -187,25 +190,26 @@ public class Site {
}
/**
* Get the interval between two fetches
* Get the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
*
* @return the interval between two fetches, in milliseconds
* @return the interval between the processing of two pages
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Get the number of retries when a download fails, 0 by default
* Get retry times when a download fails, 0 by default.<br>
*
* @return the number of retries when a download fails
* @return retry times when a download fails
*/
public int getRetryTimes() {
return retryTimes;
}
/**
* Set the number of retries when a download fails, 0 by default
* Set retry times when a download fails, 0 by default.<br>
*
* @return this
*/
......
......@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
/**
* <pre>
* Entrance class of a webmagic crawler.
*
* Examples:
* The simplest possible crawler:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
*
* Store results to files with FilePipeline:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
* Use FileCacheQueueScheduler to cache URLs, so that after a shutdown the crawler resumes from where it stopped:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* </pre>
* Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
* Every module is a field of Spider. <br>
* The modules are defined as interfaces. <br>
* You can customize a spider with various implementations of them. <br>
* Examples: <br>
* <br>
* A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
* <br>
* Store results to files by FilePipeline: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* <br>
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume from where it stopped after a shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
Time: 6:53 AM
* @see Downloader
* @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/
public class Spider implements Runnable, Task {
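Wiring the four modules together, reusing the hypothetical MyBlogProcessor sketched earlier. FilePipeline and FileCacheQueueScheduler are the implementations named in the javadoc above; their package names are assumed here:

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;

public class SpiderSketch {

    public static void main(String[] args) {
        Spider.create(new MyBlogProcessor())                                          // hypothetical PageProcessor from the earlier sketch
                .pipeline(new FilePipeline("/data/temp/webmagic/"))                   // persist results to files
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")) // resume after a shutdown
                .run();                                                               // blocking; Spider also implements Runnable
    }
}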
......@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
/**
* Test the crawler with some specific URLs
*
* @param urls the urls to crawl
*/
public void test(String... urls){
public void test(String... urls) {
checkComponent();
if (urls.length>0){
if (urls.length > 0) {
for (String url : urls) {
processRequest(new Request(url));
}
......@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
}
pageProcessor.process(page);
addRequest(page);
if (!page.getResultItems().isSkip()){
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
......@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
return this;
}
public Spider clearPipeline(){
pipelines=new ArrayList<Pipeline>();
public Spider clearPipeline() {
pipelines = new ArrayList<Pipeline>();
return this;
}
......
......@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
* Convenience method for downloading a page directly
*
* @param url
* @return
* @return html
*/
public Html download(String url) {
Page page = download(new Request(url), null);
......
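A one-off fetch through the convenience method above, assuming HttpClientDownloader has a public no-arg constructor and that Html lives in the selector package; the url is a placeholder:

import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

public class DownloadSketch {

    public static void main(String[] args) {
        // fetch a single page without setting up a full Spider
        Html html = new HttpClientDownloader().download("http://my.oschina.net/");
        System.out.println(html.xpath("//title/text()").toString());
    }
}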
......@@ -2,9 +2,6 @@
<body>
<div class="en">
Main class "Spider" and models.
</div>
<div class="zh">
Contains the webmagic entrance class Spider and some entity classes for data transfer.
</div>
</body>
</html>
package us.codecraft.webmagic.model.annotation;
package us.codecraft.webmagic.utils;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import java.util.Collection;
......
......@@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
......
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
/**
* Interface to be implemented by page models.<br>
......
......@@ -21,7 +21,7 @@ public @interface ComboExtract {
*/
ExtractBy[] value();
enum Op {
public static enum Op {
/**
* All extractors will be arranged as a pipeline. <br>
* The next extractor uses the result of the previous as source.
......@@ -49,7 +49,10 @@ public @interface ComboExtract {
*/
boolean notNull() default false;
public enum Source {
/**
* Types of sources to extract from.
*/
public static enum Source {
/**
* extract from the content extracted by class extractor
*/
......
......@@ -21,7 +21,10 @@ public @interface ExtractBy {
*/
String value();
public enum Type {XPath, Regex, Css}
/**
* Types of extractor expressions.
*/
public static enum Type {XPath, Regex, Css}
/**
* Extractor type, support XPath, CSS Selector and regex.
......@@ -38,7 +41,10 @@ public @interface ExtractBy {
*/
boolean notNull() default false;
public enum Source {
/**
* Types of sources to extract from.
*/
public static enum Source {
/**
* extract from the content extracted by class extractor
*/
......
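A sketch of how these annotations might decorate a page-model field. The class, field names and expressions are invented, the annotation package is taken from the file paths in this diff, and the exact name of the type element ("type") plus XPath being the default are assumptions:

import us.codecraft.webmagic.model.annotation.ExtractBy;

// fields are filled by the annotation-based extraction machinery
public class BlogPost {

    // value() holds the expression; XPath is assumed to be the default type
    @ExtractBy("//div[@class='BlogTitle']/h1/text()")
    private String title;

    // notNull = false (the default) allows the field to stay empty
    @ExtractBy(value = "\\d{4}-\\d{2}-\\d{2}", type = ExtractBy.Type.Regex, notNull = false)
    private String publishDate;
}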
......@@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
......
......@@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Store urls and cursor in files so that a Spider can resume the status when shutdown<br>
* Store urls and cursor in files so that a Spider can resume from where it stopped after a shutdown.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.0
......