Commit 90bbe9b9 authored by yihua.huang

webmagic-core

parent 17f8ead2
us/codecraft/webmagic/Page.java

@@ -9,13 +9,13 @@ import java.util.List;
 /**
  *
- * Object storing extracted result and urls to be crawled.<br>
+ * Object storing extracted result and urls to fetch.<br>
  * Main method: <br>
 * {@link #getUrl()} get url of current page <br>
 * {@link #getHtml()} get content of current page <br>
 * {@link #putField(String, Object)} save extracted result <br>
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
  *
  * @author code4crafter@gmail.com <br>
  * @since 0.1.0
@@ -71,7 +71,7 @@ public class Page {
     }
     /**
-     * add urls to crawl
+     * add urls to fetch
      *
      * @param requests
      */
@@ -88,7 +88,7 @@ public class Page {
     }
     /**
-     * add url to crawl
+     * add url to fetch
      *
      * @param requestString
      */
@@ -103,7 +103,7 @@ public class Page {
     }
     /**
-     * add requests to crawl
+     * add requests to fetch
      *
      * @param request
      */
...
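Taken together, the javadoc above describes the typical call sequence inside a `PageProcessor.process(Page)` method. A minimal sketch of how the methods combine (the URL pattern and field names are illustrative, not part of this commit):

```java
import java.util.List;

import us.codecraft.webmagic.Page;

public class PageUsageSketch {

    public void process(Page page) {
        // queue follow-up urls extracted from the current page
        List<String> links = page.getHtml().links()
                .regex("(http://example\\.com/post/\\d+)").all();
        page.addTargetRequests(links);
        // save extracted fields; a Pipeline later reads them from ResultItems
        page.putField("title", page.getHtml().xpath("//title"));
        page.putField("url", page.getUrl());
    }
}
```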
us/codecraft/webmagic/downloader/Downloader.java

@@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 /**
- * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
+ * Downloader is the part that downloads web pages and store in Page object. <br>
+ * Downloader has {@link #setThread(int)} method because downloader is always the bottleneck of a crawler,
+ * there are always some mechanisms such as pooling in downloader, and pool size is related to thread numbers.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:14
+ * @since 0.1.0
  */
 public interface Downloader {
     /**
-     * 下载页面,并保存信息到Page对象中。
+     * Downloads web pages and store in Page object.
      *
      * @param request
      * @param task
@@ -23,10 +24,8 @@ public interface Downloader {
     public Page download(Request request, Task task);
     /**
-     * 设置线程数,多线程程序一般需要Downloader支持<br>
-     * 如果不考虑多线程的可以不实现这个方法<br>
-     *
-     * @param thread 线程数量
+     * Tell the downloader how many threads the spider used.
+     * @param threadNum number of threads
      */
-    public void setThread(int thread);
+    public void setThread(int threadNum);
 }
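Because `setThread(int)` exists so that implementations can size their internal pools, a delegating Downloader only has to forward the thread count. A hedged sketch that wraps the bundled HttpClientDownloader (its no-arg constructor is an assumption based on this diff, not a verified API):

```java
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;

public class LoggingDownloader implements Downloader {

    private final HttpClientDownloader delegate = new HttpClientDownloader();

    @Override
    public Page download(Request request, Task task) {
        // log every fetch, then let the default downloader do the work
        System.out.println("downloading " + request.getUrl());
        return delegate.download(request, task);
    }

    @Override
    public void setThread(int threadNum) {
        // forward the spider's thread count so the delegate can size its pool
        delegate.setThread(threadNum);
    }
}
```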
us/codecraft/webmagic/downloader/HttpClientDownloader.java

@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.http.Header;
 import org.apache.http.HeaderElement;
 import org.apache.http.HttpResponse;
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.http.client.HttpClient;
 import org.apache.http.client.entity.GzipDecompressingEntity;
 import org.apache.http.client.methods.HttpGet;
@@ -22,12 +23,12 @@ import java.util.Set;
 /**
- * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
+ * The http downloader based on HttpClient.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:15
+ * @since 0.1.0
  */
+@ThreadSafe
 public class HttpClientDownloader implements Downloader {
     private Logger logger = Logger.getLogger(getClass());
@@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader {
     private int poolSize = 1;
     /**
-     * 直接下载页面的简便方法
+     * A simple method to download a url.
      *
      * @param url
      * @return html
      */
     public Html download(String url) {
         Page page = download(new Request(url), null);
-        return (Html)page.getHtml();
+        return (Html) page.getHtml();
     }
     @Override
...
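The `download(String)` convenience method above makes a single synchronous fetch, which is handy when trying selectors outside a full crawl. A sketch, assuming `Html` lives in `us.codecraft.webmagic.selector` (the target URL and xpath are illustrative):

```java
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

public class DownloadOnce {

    public static void main(String[] args) {
        HttpClientDownloader downloader = new HttpClientDownloader();
        // one-off fetch: wraps the url in a Request and unwraps the Html
        Html html = downloader.download("http://www.oschina.net/");
        System.out.println(html.xpath("//title"));
    }
}
```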
us/codecraft/webmagic/downloader/HttpClientPool.java

@@ -20,8 +20,7 @@ import java.util.Map;
 /**
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:29
+ * @since 0.1.0
  */
 public class HttpClientPool {
...
us/codecraft/webmagic/downloader/package.html

 <html>
 <body>
-包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。
+Downloader is the part that downloads web pages and store in Page object.
 </body>
 </html>
us/codecraft/webmagic/pipeline/ConsolePipeline.java

@@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task;
 import java.util.Map;
 /**
- * 命令行输出抽取结果。可用于测试。<br>
+ * Write results in console.<br>
+ * Usually used in test.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:45
+ * @since 0.1.0
  */
 public class ConsolePipeline implements Pipeline {
...
us/codecraft/webmagic/pipeline/FilePipeline.java

 package us.codecraft.webmagic.pipeline;
 import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
@@ -12,28 +13,23 @@ import java.io.PrintWriter;
 import java.util.Map;
 /**
- * 持久化到文件的接口。
+ * Store results in files.<br>
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午6:28
+ * @since 0.1.0
  */
+@ThreadSafe
 public class FilePipeline extends FilePersistentBase implements Pipeline {
     private Logger logger = Logger.getLogger(getClass());
     /**
-     * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
+     * create a FilePipeline with default path"/data/webmagic/"
      */
     public FilePipeline() {
         setPath("/data/webmagic/");
     }
-    /**
-     * 新建一个FilePipeline
-     *
-     * @param path 文件保存路径
-     */
     public FilePipeline(String path) {
         setPath(path);
     }
...
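The two constructors above differ only in the storage path. A hedged wiring sketch; the `Spider.create(...).pipeline(...).run()` builder chain reflects the API of this era and should be treated as an assumption:

```java
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;

public class FilePipelineDemo {

    public static void main(String[] args) {
        Spider.create(new SimplePageProcessor("http://my.oschina.net/flashsword/blog",
                        "http://my.oschina.net/flashsword/blog/*"))
                // new FilePipeline() would use the same "/data/webmagic/" default
                .pipeline(new FilePipeline("/data/webmagic/"))
                .run();
    }
}
```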
us/codecraft/webmagic/pipeline/Pipeline.java

@@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
 /**
- * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
+ * Pipeline is the persistent and offline process part of crawler.<br>
+ * The interface Pipeline can be implemented to customize ways of persistent.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:39
+ * @since 0.1.0
+ * @see ConsolePipeline
+ * @see FilePipeline
  */
 public interface Pipeline {
-    public void process(ResultItems resultItems,Task task);
+    /**
+     * Process extracted results.
+     *
+     * @param resultItems
+     * @param task
+     */
+    public void process(ResultItems resultItems, Task task);
 }
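A custom Pipeline only has to implement the single `process` method above. A minimal console sketch; `ResultItems.getAll()` returning the saved field map is an assumption about this version's API:

```java
import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class StdOutPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // each entry is a field stored earlier via page.putField(key, value)
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}
```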
us/codecraft/webmagic/pipeline/package.html

 <html>
 <body>
-包含了处理页面抽取结果的接口Pipeline和它的几个实现类。
+Pipeline is the persistent and offline process part of crawler.
 </body>
 </html>
us/codecraft/webmagic/processor/PageProcessor.java

@@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 /**
- * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
- * extends the class to implements various spiders.<br>
+ * Interface to be implemented to customize a crawler.<br>
+ * <br>
+ * In PageProcessor, you can customize:
+ * <p/>
+ * start urls and other settings in {@link Site}<br>
+ * how the urls to fetch are detected <br>
+ * how the data are extracted and stored <br>
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 上午11:42
+ * @see Site
+ * @see Page
+ * @since 0.1.0
  */
 public interface PageProcessor {
     /**
-     * 定义如何处理页面,包括链接提取、内容抽取等。
+     * process the page, extract urls to fetch, extract the data and store
+     *
      * @param page
      */
     public void process(Page page);
     /**
-     * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
+     * get the site settings
+     *
      * @return site
+     * @see Site
      */
     public Site getSite();
 }
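The customization points listed in the javadoc map directly onto an implementation. A sketch built only from methods visible in this diff (the target site, regex and xpath are illustrative):

```java
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.UrlUtils;

public class BlogPageProcessor implements PageProcessor {

    // start url and other settings live in Site
    private final Site site = Site.me()
            .addStartUrl("http://my.oschina.net/flashsword/blog")
            .setDomain(UrlUtils.getDomain("http://my.oschina.net/flashsword/blog"));

    @Override
    public void process(Page page) {
        // detect more urls to fetch
        page.addTargetRequests(page.getHtml().links()
                .regex("(http://my\\.oschina\\.net/flashsword/blog/\\d+)").all());
        // extract and store data
        page.putField("title", page.getHtml().xpath("//title"));
    }

    @Override
    public Site getSite() {
        return site;
    }
}
```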
us/codecraft/webmagic/processor/SimplePageProcessor.java

@@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
 import java.util.List;
 /**
- * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
+ * A simple PageProcessor.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-22
- * Time: 下午9:15
+ * @since 0.1.0
  */
 public class SimplePageProcessor implements PageProcessor {
@@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor {
         this.site = Site.me().addStartUrl(startUrl).
                 setDomain(UrlUtils.getDomain(startUrl));
         //compile "*" expression to regex
-        this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
+        this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
     }
     @Override
     public void process(Page page) {
         List<String> requests = page.getHtml().links().regex(urlPattern).all();
-        //调用page.addTargetRequests()方法添加待抓取链接
+        //add urls to fetch
         page.addTargetRequests(requests);
-        //xpath方式抽取
+        //extract by XPath
         page.putField("title", page.getHtml().xpath("//title"));
-        //sc表示使用Readability技术抽取正文
         page.putField("html", page.getHtml().toString());
+        //extract by Readability
        page.putField("content", page.getHtml().smartContent());
     }
     @Override
     public Site getSite() {
-        //定义抽取站点的相关参数
+        //settings
         return site;
     }
 }
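The constructor's wildcard compilation is plain string rewriting. What it produces for a concrete (illustrative) pattern:

```java
public class WildcardToRegex {

    public static void main(String[] args) {
        String urlPattern = "http://my.oschina.net/*/blog/*";
        // same rewriting as the SimplePageProcessor constructor above
        String regex = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
        // prints: (http://my\.oschina\.net/[^"'#]*/blog/[^"'#]*)
        // dots are escaped; each "*" matches any run of characters that cannot
        // end a url inside an href attribute (quote, apostrophe or "#")
        System.out.println(regex);
    }
}
```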
us/codecraft/webmagic/processor/package.html

 <html>
 <body>
-包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。
+PageProcessor custom part of a crawler for specific site.
 </body>
 </html>
us/codecraft/webmagic/scheduler/QueueScheduler.java

 package us.codecraft.webmagic.scheduler;
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
@@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 /**
- * 内存队列实现的线程安全Scheduler。<br>
+ * Basic Scheduler implementation.<br>
+ * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:13
+ * @since 0.1.0
  */
+@ThreadSafe
 public class QueueScheduler implements Scheduler {
     private Logger logger = Logger.getLogger(getClass());
@@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler {
     private Set<String> urls = new HashSet<String>();
     @Override
-    public synchronized void push(Request request,Task task) {
-        if (logger.isDebugEnabled()){
-            logger.debug("push to queue "+request.getUrl());
+    public synchronized void push(Request request, Task task) {
+        if (logger.isDebugEnabled()) {
+            logger.debug("push to queue " + request.getUrl());
         }
-        if (urls.add(request.getUrl())){
+        if (urls.add(request.getUrl())) {
             queue.add(request);
         }
...
us/codecraft/webmagic/scheduler/Scheduler.java

@@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 /**
- * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。<br>
- * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。<br>
+ * Scheduler is the part of url management.<br>
+ * You can implement interface Scheduler to do:
+ * manage urls to fetch
+ * remove duplicate urls
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:12
+ * @since 0.1.0
  */
 public interface Scheduler {
     /**
-     * 加入一个待抓取的链接
-     * @param request 待抓取的链接
-     * @param task 定义的任务,以满足单Scheduler多Task的情况
+     * add a url to fetch
+     *
+     * @param request
+     * @param task
      */
-    public void push(Request request,Task task);
+    public void push(Request request, Task task);
     /**
      * 返回下一个要抓取的链接
+     *
      * @param task 定义的任务,以满足单Scheduler多Task的情况
      * @return 下一个要抓取的链接
      */
...
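A custom Scheduler implements the two halves described above: accept urls and hand back the next one. A hedged in-memory sketch; the `poll(Task)` name and signature sit in the collapsed part of this diff and are assumed here:

```java
import java.util.LinkedList;
import java.util.Queue;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.Scheduler;

public class FifoScheduler implements Scheduler {

    private final Queue<Request> queue = new LinkedList<Request>();

    @Override
    public synchronized void push(Request request, Task task) {
        // no dedup here; QueueScheduler adds a Set<String> for that
        queue.add(request);
    }

    @Override
    public synchronized Request poll(Task task) {
        // next url to fetch, or null when the queue is idle
        return queue.poll();
    }
}
```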
us/codecraft/webmagic/scheduler/package.html

 <html>
 <body>
-包含url管理和调度的接口Scheduler及它的几个实现类。
+Scheduler is the part of url management.
 </body>
 </html>
@@ -4,6 +4,8 @@ import java.util.ArrayList;
 import java.util.List;
 /**
+ * All selectors will be arranged as a pipeline. <br>
+ * The next selector uses the result of the previous as source.
  * @author code4crafter@gmail.com <br>
  * @since 0.2.0
  */
...
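The pipeline arrangement described in that javadoc is the same chaining the fluent calls in SimplePageProcessor perform: each selector consumes the previous one's output. Spelled out step by step (the url pattern is illustrative):

```java
import java.util.List;

import us.codecraft.webmagic.Page;

public class ChainedSelection {

    static List<String> extractLinks(Page page) {
        return page.getHtml()                          // source: raw html of the page
                .links()                               // stage 1: extract all hrefs
                .regex("(http://example\\.com/\\d+)")  // stage 2: keep matching urls
                .all();                                // materialize the final results
    }
}
```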
us/codecraft/webmagic/selector/CssSelector.java

@@ -10,10 +10,10 @@ import java.util.ArrayList;
 import java.util.List;
 /**
- * css风格的选择器。包装了Jsoup。<br>
+ * CSS selector. Based on Jsoup.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 上午9:39
+ * @since 0.1.0
  */
 public class CssSelector implements Selector {
...
@@ -4,6 +4,8 @@ import java.util.ArrayList;
 import java.util.List;
 /**
+ * All extractors will do extracting separately, <br>
+ * and the results of extractors will combined as the final result.
  * @author code4crafter@gmail.com <br>
  * @since 0.2.0
  */
...
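That separate-then-combine behavior can be written directly against the Selector interface from this commit. A sketch only: the class name `OrCombiner` is hypothetical, and the `selectList(String)` signature is assumed from the Selector javadoc below (its declaration is collapsed in this diff):

```java
import java.util.ArrayList;
import java.util.List;

import us.codecraft.webmagic.selector.Selector;

public class OrCombiner {

    // run every extractor on the same source text separately...
    public List<String> selectAll(List<Selector> selectors, String text) {
        List<String> results = new ArrayList<String>();
        for (Selector selector : selectors) {
            // ...and append its results to the combined final result
            results.addAll(selector.selectList(text));
        }
        return results;
    }
}
```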
us/codecraft/webmagic/selector/Selector.java

@@ -4,13 +4,16 @@ import java.util.List;
 /**
  * Selector(extractor) for text.<br>
+ *
  * @author code4crafter@gmail.com <br>
+ * @since 0.1.0
  */
 public interface Selector {
     /**
      * Extract single result in text.<br>
      * If there are more than one result, only the first will be chosen.
+     *
      * @param text
      * @return result
      */
@@ -18,6 +21,7 @@ public interface Selector {
     /**
      * Extract all results in text.<br>
+     *
      * @param text
      * @return results
      */
...
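A custom Selector implements the two extraction forms documented above: single result and all results. A regex-based sketch; the `select`/`selectList` method names are assumed from the surrounding javadoc, since the declarations themselves are collapsed in this diff:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import us.codecraft.webmagic.selector.Selector;

public class TitleSelector implements Selector {

    private static final Pattern TITLE =
            Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE);

    @Override
    public String select(String text) {
        // single-result form: only the first match is chosen
        Matcher matcher = TITLE.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    @Override
    public List<String> selectList(String text) {
        // all-results form
        List<String> results = new ArrayList<String>();
        Matcher matcher = TITLE.matcher(text);
        while (matcher.find()) {
            results.add(matcher.group(1));
        }
        return results;
    }
}
```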
@@ -24,7 +24,7 @@
 {@link #getHtml()} get content of current page
 {@link #putField(String, Object)} save extracted result
 {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
-{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
+{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
 </pre>
...