Commit 23f6bb8d authored by yihua.huang's avatar yihua.huang

Merge branch 'annotation'

parents 9338e13c 21eca688
......@@ -3,14 +3,14 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.1.0</version>
<version>0.2.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>webmagic</artifactId>
<modules>
<module>webmagic-core</module>
<module>webmagic-plugin/</module>
<module>webmagic-extension/</module>
<module>webmagic-samples/</module>
</modules>
......@@ -27,6 +27,11 @@
<artifactId>httpclient</artifactId>
<version>4.2.4</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
......@@ -45,7 +50,7 @@
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.4</version>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
......@@ -75,6 +80,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.8</version>
<executions>
<execution>
<id>copy-dependencies</id>
......@@ -94,6 +100,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
......@@ -101,6 +108,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
......@@ -113,6 +121,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
<executions>
<execution>
<id>attach-javadocs</id>
......@@ -125,11 +137,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.0-beta-7</version>
<version>2.4.1</version>
</plugin>
</plugins>
</build>
</project>
webmagic使用手册
------
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。
>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。
>webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
>python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
>Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
>webmagic遵循[Apache 2.0协议](http://www.apache.org/licenses/LICENSE-2.0.html),你可以自由进行使用和修改。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。
<div style="page-break-after:always"></div>
## 快速开始
### 使用maven
webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译:
git clone https://github.com/code4craft/webmagic.git
mvn clean install
安装后,在项目中添加对应的依赖即可使用webmagic:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.2.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.2.0</version>
</dependency>
#### 项目结构
webmagic主要包括两个包:
* **webmagic-core**
webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
* **webmagic-extension**
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来:
* **webmagic-saxon**
webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。
* **webmagic-selenium**
webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。
在项目中,你可以根据需要依赖不同的包。
### 不使用maven
不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)):
git clone http://git.oschina.net/flashsword20/webmagic-bin.git
**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。
### 第一个爬虫
#### 定制PageProcessor
PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码:
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net")
.addStartUrl("http://my.oschina.net/flashsword/blog");
@Override
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser())
.pipeline(new ConsolePipeline()).run();
}
}
这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。
Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。
执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。
#### 使用注解
webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同:
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {
@ExtractBy("//title")
private String title;
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
public static void main(String[] args) {
OOSpider.create(
Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),
new ConsolePageModelPipeline(), OschinaBlog.class).run();
}
}
这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。
注解的详细使用方式见后文中的webmagic-extension注解模块。
<div style="page-break-after:always"></div>
## webmagic-core
webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
此节部分内容摘自作者的博文
[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)
### webmagic-core的模块划分
webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。
![image](http://code4craft.github.io/images/posts/webmagic.png)
<div style="page-break-after:always"></div>
#### Spider类(核心调度)
**Spider**是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。
Spider.create(sinaBlogProcessor)
.scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
.pipeline(new FilePipeline())
.thread(10).run();
Spider的核心处理流程非常简单,代码如下:
<!-- lang: java -->
private void processRequest(Request request) {
Page page = downloader.download(request, this);
if (page == null) {
sleep(site.getSleepTime());
return;
}
pageProcessor.process(page);
addRequest(page);
for (Pipeline pipeline : pipelines) {
pipeline.process(page, this);
}
sleep(site.getSleepTime());
}
#### PageProcessor(页面分析及链接抽取)
页面分析是垂直爬虫中需要定制的部分。在webmagic-core里,通过实现**PageProcessor**接口来实现定制爬虫。PageProcessor有两个核心方法:public void process(Page page)和public Site getSite() 。
* public void process(Page page)
通过对**Page**对象的操作,实现爬虫逻辑。Page对象包括两个最重要的方法:addTargetRequests()可以添加URL到待抓取队列,put()可以将结果保存供后续处理。
Page的数据可以通过Page.getHtml()和Page.getUrl()获取。
* public Site getSite()
**Site**对象定义了爬虫的域名、起始地址、抓取间隔、编码等信息。
**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic-core的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取。
<!-- lang: java -->
//content是用别的爬虫工具抽取到的正文
List<String> links = page.getHtml()
.$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的
.xpath("//@href") //提取链接
.regex(".*blog.*") //正则匹配过滤
.all(); //转换为string列表
webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。
基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉XPath2.0语法,倒是不妨一试(需要引入**webmagic-saxon**包)。
**webmagic-samples**包里有一些为某个站点定制的PageProcessor,供学习之用。
#### Downloader(页面下载)
**Downloader**是webmagic中下载页面的接口,主要方法:
* public Page download(Request request, Task task)
**Request**对象封装了待抓取的URL及其他信息,而Page则包含了页面下载后的Html及其他信息。Task是一个包装了任务对应的Site信息的抽象接口。
* public void setThread(int thread)
因为Downloader一般会涉及连接池等功能,而这些功能与多线程密切相关,所以定义了此方法。
目前有几个Downloader的实现:
* HttpClientDownloader
集成了**Apache HttpClient**的Downloader。Apache HttpClient(4.0后整合到HttpComponents项目中)是强大的Java http下载器,它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。
* SeleniumDownloader
对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它;另一种就是:内置一个浏览器,直接获取最后加载完的页面。**webmagic-selenium**包中整合了Selenium到SeleniumDownloader,可以直接进行动态加载页面的抓取。
#### Scheduler(URL管理)
**Scheduler**是webmagic的管理模块,通过实现Scheduler可以定制自己的URL管理器。Scheduler包括两个主要方法:
* public void push(Request request,Task task)
将待抓取URL加入Scheduler。Request对象是对URL的一个封装,还包括优先级、以及一个供存储数据的Map。Task仍然用于区分不同任务,在多个任务公用一个Scheduler时可以此进行区分。
* public Request poll(Task task)
从Scheduler里取出一条请求,并进行后续执行。
webmagic目前有三个Scheduler的实现:
* QueueScheduler
一个简单的内存队列,速度较快,并且是线程安全的。
* FileCacheQueueScheduler
使用文件保存队列,它可以用于耗时较长的下载任务,在任务中途停止后(手动停止或者程序崩溃),下次执行仍然从中止的URL开始继续爬取。
* RedisScheduler
使用redis存储URL队列。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。
#### Pipeline(后续处理和持久化)
**Pipeline**是最终抽取结果进行输出和持久化的接口。它只包括一个方法:
* public void process(ResultItems resultItems,Task task)
**ResultItems**是集成了抽取结果的对象。通过ResultItems.get(key)可以获取抽取结果。Task同样是用于区分不同任务的对象。
webmagic包括以下几个Pipeline的实现:
* ConsolePipeline
直接输出结果到控制台,测试时使用。
* FilePipeline
输出结果到文件,每个URL单独保存到一个页面,以URL的MD5结果作为文件名。通过构造函数`public FilePipeline(String path)`定义存储路径,**以下使用文件持久化的类,多数都使用此方法指定路径**
* JsonFilePipeline
以JSON输出结果到文件(.json后缀),其他与FilePipeline相同。
webmagic目前不支持持久化到数据库,但是结合其他工具,持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以这段代码并没有放到webmagic-samples里来。
<div style="page-break-after:always"></div>
## webmagic-extension
webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。
### 注解模块
webmagic-extension包括注解模块。为什么会有注解方式?
因为PageProcessor的方式灵活、强大,但是没有解决两个问题:
* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcesser中写判断逻辑,代码难以管理。
* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。
注解的核心是Model类,本身是一个POJO,这个Model类用于传递、保存页面最终抓取结果数据。注解方式直接将抽取与数据绑定,以便于编写和维护。
注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。
注解部分包括以下内容:
* #### TargetUrl
"TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以**sourceRegion**指定提取URL的区域(仅支持XPath)。
TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
与TargetUrl相似的还有**HelpUrl**,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。
* #### ExtractBy
* ##### 用于字段
"ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。
ExtractBy还有几个扩展属性。**multi**表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。**notnull**则表示,此字段不允许为null,若为null则放弃整个对象。
* ##### 用于类
"ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。
* ##### ExtractByRaw & ExtractByUrl
在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL中抽取信息。ExtractByUrl只支持正则表达式。
* ##### ExtractBy2 ExtractBy3
"ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。
* #### AfterExtractor
AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用**afterProcess()**方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。
* #### OOSpider
OOSpider是注解式爬虫的入口,这里调用**create()**方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,例如:
OOSpider.create(
Site.me().addStartUrl("http://www.oschina.net"),
new ConsolePageModelPipeline(),
OschinaBlog.class,OschinaAnswer.class).run();
OOSpider会根据TargetUrl调用不同的Model进行解析。
* #### PageModelPipeline
可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。
* #### 分页
处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic目前对于分页的解决方案是:在注解模式下,Model通过实现**PagedModel**接口,并引入PagedPipeline作为第一个Pipeline来实现。具体可以参考webmagic-samples中抓取网易新闻的代码:**us.codecraft.webmagic.model.samples.News163**。
关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。
目前分页功能还没有分布式实现,如果实现RedisScheduler进行分布式爬取,请不要使用分页功能。
### 分布式
webmagic-extension中,通过redis来管理URL,达到分布式的效果。但是对于分布式爬虫,仅仅程序能够分布式运行,还满足不了大规模抓取的需要,webmagic可能后期会加入一些任务管理和监控的功能,也欢迎各位用户为webmagic提交代码,做出贡献。
webmagic-core
-------
webmagic核心部分。
\ No newline at end of file
webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
\ No newline at end of file
......@@ -5,7 +5,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -9,7 +9,7 @@ import java.util.List;
/**
* <pre>
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
* Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
* 主要方法:
* {@link #getUrl()} 获取页面的Url
......@@ -19,6 +19,7 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
*
* </pre>
*
* @author code4crafter@gmail.com <br>
*/
public class Page {
......@@ -36,8 +37,15 @@ public class Page {
public Page() {
}
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
}
/**
* 保存抽取的结果
*
* @param key 结果的key
* @param field 结果的value
*/
......@@ -47,6 +55,7 @@ public class Page {
/**
* 获取页面的html内容
*
* @return html 页面的html内容
*/
public Selectable getHtml() {
......@@ -63,6 +72,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requests 待抓取的链接
*/
public void addTargetRequests(List<String> requests) {
......@@ -79,6 +89,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requestString 待抓取的链接
*/
public void addTargetRequest(String requestString) {
......@@ -93,6 +104,7 @@ public class Page {
/**
* 添加待抓取的页面,在需要传递附加信息时使用
*
* @param request 待抓取的页面
*/
public void addTargetRequest(Request request) {
......@@ -103,6 +115,7 @@ public class Page {
/**
* 获取页面的Url
*
* @return url 当前页面的url,可用于抽取
*/
public Selectable getUrl() {
......@@ -111,6 +124,7 @@ public class Page {
/**
* 设置url
*
* @param url
*/
public void setUrl(Selectable url) {
......@@ -119,6 +133,7 @@ public class Page {
/**
* 获取抓取请求
*
* @return request 抓取请求
*/
public Request getRequest() {
......
package us.codecraft.webmagic;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
......@@ -18,40 +22,95 @@ package us.codecraft.webmagic;
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午11:37
*/
public class Request {
public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
private String url;
private Object[] extra;
/**
* 额外参数,可以保存一些需要的上下文信息
*/
private Map<String, Object> extras;
private double priority;
public Request() {
}
/**
* 构建一个request对象
*
* @param url 必须参数,待抓取的url
* @param extra 额外参数,可以保存一些需要的上下文信息
*/
public Request(String url, Object... extra) {
public Request(String url) {
this.url = url;
this.extra = extra;
}
/**
* 获取预存的对象
* @return object[] 预存的对象数组
*/
public Object[] getExtra() {
return extra;
public double getPriority() {
return priority;
}
public Request setPriority(double priority) {
this.priority = priority;
return this;
}
public Object getExtra(String key) {
if (extras == null) {
return null;
}
return extras.get(key);
}
public Request putExtra(String key, Object value) {
if (extras == null) {
extras = new HashMap<String, Object>();
}
extras.put(key, value);
return this;
}
/**
* 获取待抓取的url
*
* @return url 待抓取的url
*/
public String getUrl() {
return url;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Request request = (Request) o;
if (!url.equals(request.url)) return false;
return true;
}
public Map<String, Object> getExtras() {
return extras;
}
@Override
public int hashCode() {
return url.hashCode();
}
public void setExtras(Map<String, Object> extras) {
this.extras = extras;
}
public void setUrl(String url) {
this.url = url;
}
}
......@@ -5,8 +5,8 @@ import java.util.Map;
/**
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 下午12:20 <br>
*/
public class ResultItems {
......
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
/**
......@@ -90,6 +92,11 @@ public class Site {
* @return 已设置的domain
*/
public String getDomain() {
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
}
}
return domain;
}
......@@ -150,6 +157,7 @@ public class Site {
/**
* 获取初始页面的地址列表
*
* @return 初始页面的地址列表
*/
public List<String> getStartUrls() {
......@@ -158,6 +166,7 @@ public class Site {
/**
* 增加初始页面的地址,可反复调用此方法增加多个初始地址。
*
* @param startUrl 初始页面的地址
* @return this
*/
......@@ -179,6 +188,7 @@ public class Site {
/**
* 获取两次抓取之间的间隔
*
* @return 两次抓取之间的间隔,单位毫秒
*/
public int getSleepTime() {
......@@ -187,6 +197,7 @@ public class Site {
/**
* 获取重新下载的次数,默认为0
*
* @return 重新下载的次数
*/
public int getRetryTimes() {
......@@ -195,6 +206,7 @@ public class Site {
/**
* 设置获取重新下载的次数,默认为0
*
* @return this
*/
public Site setRetryTimes(int retryTimes) {
......@@ -219,7 +231,7 @@ public class Site {
return true;
}
public Task toTask(){
public Task toTask() {
return new Task() {
@Override
public String getUUID() {
......
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.QueueScheduler;
import us.codecraft.webmagic.schedular.Scheduler;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.ThreadUtils;
import java.util.ArrayList;
......@@ -228,9 +228,11 @@ public class Spider implements Runnable, Task {
}
pageProcessor.process(page);
addRequest(page);
if (!page.getResultItems().isSkip()){
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
sleep(site.getSleepTime());
}
......@@ -283,6 +285,11 @@ public class Spider implements Runnable, Task {
return this;
}
public Spider clearPipeline(){
pipelines=new ArrayList<Pipeline>();
return this;
}
@Override
public String getUUID() {
if (uuid != null) {
......
......@@ -2,8 +2,8 @@ package us.codecraft.webmagic.downloader;
/**
* 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午3:10 <br>
*/
public interface Destroyable {
......
......@@ -7,29 +7,18 @@ import java.util.Map;
/**
* 命令行输出抽取结果。可用于测试。<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:45
*/
public class ConsolePipeline implements Pipeline{
public class ConsolePipeline implements Pipeline {
@Override
public void process(ResultItems resultItems,Task task) {
if (resultItems.isSkip()){
return;
}
System.out.println("get page: "+resultItems.getRequest().getUrl());
public void process(ResultItems resultItems, Task task) {
System.out.println("get page: " + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {
Iterable value = (Iterable) entry.getValue();
System.out.println(entry.getKey() + ":");
for (Object o : value) {
System.out.println(o);
}
} else {
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
System.out.println(entry.getKey()+":\t"+entry.getValue());
}
}
}
......@@ -20,12 +20,12 @@ import java.util.Map;
*/
public class FilePipeline implements Pipeline {
private String path = "/data/temp/webmagic/";
private String path = "/data/webmagic/";
private Logger logger = Logger.getLogger(getClass());
/**
* 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
*/
public FilePipeline() {
......@@ -37,6 +37,9 @@ public class FilePipeline implements Pipeline {
* @param path 文件保存路径
*/
public FilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
}
......@@ -47,9 +50,6 @@ public class FilePipeline implements Pipeline {
if (!file.exists()) {
file.mkdirs();
}
if (resultItems.isSkip()) {
return;
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
......
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
......
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Chains several selectors: each selector consumes the output of the one
 * before it, so the final result must match every selector in turn.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 下午5:29 <br>
 */
public class AndSelector implements Selector {

    // Selectors applied in order; each runs on the previous one's output.
    private List<Selector> selectors = new ArrayList<Selector>();

    public AndSelector(Selector... selectors) {
        for (Selector candidate : selectors) {
            this.selectors.add(candidate);
        }
    }

    @Override
    public String select(String text) {
        String current = text;
        for (Selector selector : selectors) {
            // A null intermediate result short-circuits the whole chain.
            if (current == null) {
                return null;
            }
            current = selector.select(current);
        }
        return current;
    }

    @Override
    public List<String> selectList(String text) {
        List<String> current = new ArrayList<String>();
        boolean isFirst = true;
        for (Selector selector : selectors) {
            if (isFirst) {
                // First selector runs on the raw input.
                current = selector.selectList(text);
                isFirst = false;
                continue;
            }
            // Subsequent selectors fan out over every intermediate result.
            List<String> next = new ArrayList<String>();
            for (String item : current) {
                next.addAll(selector.selectList(item));
            }
            current = next;
            // Nothing left to refine — stop early.
            if (current.isEmpty()) {
                return current;
            }
        }
        return current;
    }
}
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Tries several selectors as alternatives: for single-result selection the
 * first selector producing a non-null result wins; for list selection the
 * results of all selectors are concatenated.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 下午5:29 <br>
 */
public class OrSelector implements Selector {

    private List<Selector> selectors = new ArrayList<Selector>();

    public OrSelector(Selector... selectors) {
        for (Selector selector : selectors) {
            this.selectors.add(selector);
        }
    }

    /**
     * Returns the first non-null result of applying each selector to the
     * ORIGINAL input text.
     *
     * @param text the text to select from
     * @return first non-null selector result, or null if none matched
     */
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            // Bug fix: the original reassigned `text = selector.select(text)`,
            // so once one selector returned null, every later selector was
            // handed null instead of the original input. Use a local result.
            String result = selector.select(text);
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Concatenates the list results of all selectors applied to the input.
     *
     * @param text the text to select from
     * @return combined results of all selectors (possibly empty, never null)
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        for (Selector selector : selectors) {
            List<String> partial = selector.selectList(text);
            // Guard against selector implementations that return null
            // instead of an empty list (the original NPE'd on addAll).
            if (partial != null) {
                results.addAll(partial);
            }
        }
        return results;
    }
}
......@@ -8,7 +8,7 @@ import java.util.List;
* Date: 13-4-20
* Time: 下午8:02
*/
interface Selector {
public interface Selector {
public String select(String text);
......
webmagic-extension
-------
webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。
\ No newline at end of file
......@@ -4,24 +4,33 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-plugin</artifactId>
<version>0.1.0</version>
<artifactId>webmagic</artifactId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-misc</artifactId>
<artifactId>webmagic-extension</artifactId>
<dependencies>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.15</version>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.35</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic;
import java.util.Collection;
/**
 * Implemented by annotation-style page models whose single logical item is
 * spread across several pages (e.g. one news article split over multiple
 * URLs); PagedPipeline uses these hooks to collect and merge the pieces.
 * NOTE(review): semantics inferred from the README's pagination section —
 * confirm against PagedPipeline.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 下午5:18 <br>
 */
public interface PagedModel {

    // Key shared by all pages that belong to the same logical item.
    public String getPageKey();

    // Identifiers of the sibling pages still needed to complete this item.
    public Collection<String> getOtherPages();

    // This page's own identifier within the item.
    public String getPage();

    // Merge another page of the same item into this one and return the result.
    public PagedModel combine(PagedModel pagedModel);
}
......@@ -34,6 +34,9 @@ public class FileDownloader implements Downloader {
}
public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
}
......
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
/**
 * Implement this interface on a page model to run custom post-processing
 * after the annotation-driven extraction has filled in all fields.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 上午9:42 <br>
 */
public interface AfterExtractor {

    // Invoked once per page after all annotated fields have been populated;
    // the extracted values are already set on this model instance.
    public void afterProcess(Page page);
}
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
/**
 * PageModelPipeline that dumps every extracted model object to standard
 * output via reflection; intended for testing and quick inspection.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 下午3:41 <br>
 */
public class ConsolePageModelPipeline implements PageModelPipeline {

    @Override
    public void process(Object o, Task task) {
        // Reflection-based dump of all fields, regardless of the model type.
        String dump = ToStringBuilder.reflectionToString(o);
        System.out.println(dump);
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
/**
 * Describes how a single value is extracted from a page: which selector to
 * run, which part of the page it reads from, and whether the result is a
 * list and/or required to be non-null.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 下午9:48 <br>
 */
class Extractor {

    // Which input the selector is applied to.
    static enum Source {Html, Url, RawHtml}

    // Mutable: may be replaced after construction via setSelector().
    protected Selector selector;

    protected final Source source;

    // When true, a null extraction invalidates the whole model object.
    protected final boolean notNull;

    // When true, the extraction produces a list of values.
    protected final boolean multi;

    public Extractor(Selector extractionSelector, Source extractionSource, boolean mandatory, boolean isList) {
        this.selector = extractionSelector;
        this.source = extractionSource;
        this.notNull = mandatory;
        this.multi = isList;
    }

    Selector getSelector() {
        return selector;
    }

    void setSelector(Selector selector) {
        this.selector = selector;
    }

    Source getSource() {
        return source;
    }

    boolean isNotNull() {
        return notNull;
    }

    boolean isMulti() {
        return multi;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
/**
 * Binds an {@link Extractor} to a concrete model field, remembering the
 * setter method used to inject the extracted value into the POJO.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 下午9:48 <br>
 */
class FieldExtractor extends Extractor {

    // The POJO field this extractor fills.
    private final Field field;

    // Optional setter used to write the value; resolved lazily by the caller.
    private Method setterMethod;

    public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
        super(selector, source, notNull, multi);
        this.field = field;
    }

    Field getField() {
        return field;
    }

    // getSelector()/getSource()/isNotNull() were byte-identical
    // re-declarations of the package-private accessors inherited from
    // Extractor; they have been removed as dead duplication. Callers in this
    // package see the inherited accessors unchanged.

    void setSetterMethod(Method setterMethod) {
        this.setterMethod = setterMethod;
    }

    Method getSetterMethod() {
        return setterMethod;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * PageProcessor implementation that drives annotation-based page models.
 * Each registered model class contributes one PageModelExtractor; for every
 * downloaded page this processor queues newly discovered links and runs each
 * extractor, storing its result under the model's canonical class name.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 下午8:46 <br>
 */
class ModelPageProcessor implements PageProcessor {

    // One extractor per registered model class, in registration order.
    private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();

    private Site site;

    // Union of all target/help URL patterns across registered models.
    // NOTE(review): populated in addPageModel but never read within this
    // class — link filtering happens per-extractor in process(); confirm
    // whether this field is still needed.
    private Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();

    /**
     * Creates a processor for the given site and annotated model classes.
     *
     * @param site   crawl configuration (domain, start urls, intervals)
     * @param clazzs annotated POJO model classes
     * @return a configured ModelPageProcessor
     */
    public static ModelPageProcessor create(Site site, Class... clazzs) {
        ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
        for (Class clazz : clazzs) {
            modelPageProcessor.addPageModel(clazz);
        }
        return modelPageProcessor;
    }

    /**
     * Registers one more annotated model class.
     *
     * @param clazz annotated POJO model class
     * @return this, for chaining
     */
    public ModelPageProcessor addPageModel(Class clazz) {
        PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
        targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
        targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
        pageModelExtractorList.add(pageModelExtractor);
        return this;
    }

    private ModelPageProcessor(Site site) {
        this.site = site;
    }

    /**
     * For each registered extractor: queue links matching its help/target
     * patterns, then run the extraction. A null or empty extraction marks the
     * page as skipped; note that skip is only ever set to true here, so one
     * empty extractor suppresses the page for all pipelines.
     */
    @Override
    public void process(Page page) {
        for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
            extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
            extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
            Object process = pageModelExtractor.process(page);
            if (process == null || (process instanceof List && ((List) process).size() == 0)) {
                page.getResultItems().setSkip(true);
            }
            postProcessPageModel(pageModelExtractor.getClazz(), process);
            page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
        }
    }

    /**
     * Collects candidate links — from the whole page, or from the region
     * chosen by urlRegionSelector — and queues every link matching one of
     * urlPatterns.
     * NOTE(review): matcher.group(1) assumes each pattern wraps the URL in a
     * single capturing group — confirm against PageModelExtractor's pattern
     * construction.
     */
    private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
        List<String> links;
        if (urlRegionSelector == null) {
            links = page.getHtml().links().all();
        } else {
            links = urlRegionSelector.selectList(page.getHtml().toString());
        }
        for (String link : links) {
            for (Pattern targetUrlPattern : urlPatterns) {
                Matcher matcher = targetUrlPattern.matcher(link);
                if (matcher.find()) {
                    page.addTargetRequest(new Request(matcher.group(1)));
                }
            }
        }
    }

    // Hook for subclasses to post-process an extracted model; no-op here.
    protected void postProcessPageModel(Class clazz, Object object) {
    }

    @Override
    public Site getSite() {
        return site;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline extension point backing the annotation-style API: routes each
 * extracted model object to the PageModelPipeline registered for its class.
 * One ModelPipeline dispatches to many PageModelPipelines.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 上午10:47 <br>
 */
class ModelPipeline implements Pipeline {

    // Model class -> sink for objects of that class.
    private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();

    public ModelPipeline() {
    }

    public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
        pageModelPipelines.put(clazz, pageModelPipeline);
        return this;
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        for (Map.Entry<Class, PageModelPipeline> entry : pageModelPipelines.entrySet()) {
            // Results are stored under the model's canonical class name.
            Object extracted = resultItems.get(entry.getKey().getCanonicalName());
            if (extracted == null) {
                continue;
            }
            Annotation extractBy = entry.getKey().getAnnotation(ExtractBy.class);
            boolean isMulti = extractBy != null && ((ExtractBy) extractBy).multi();
            if (!isMulti) {
                entry.getValue().process(extracted, task);
            } else {
                // A class-level multi extraction yields a list of models;
                // deliver each one individually.
                for (Object single : (List<Object>) extracted) {
                    entry.getValue().process(single, task);
                }
            }
        }
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/**
 * Spider specialised for annotation-based page models; the wrapped entry
 * class for the annotation style of writing crawlers.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 上午9:51 <br>
 */
public class OOSpider extends Spider {

    private ModelPageProcessor modelPageProcessor;

    private ModelPipeline modelPipeline;

    protected OOSpider(ModelPageProcessor modelPageProcessor) {
        super(modelPageProcessor);
        this.modelPageProcessor = modelPageProcessor;
    }

    /**
     * Builds a spider for the given site and model classes.<br>
     *
     * @param site              crawl configuration
     * @param pageModelPipeline sink for extracted models; may be null
     * @param pageModels        annotated POJO model classes
     */
    public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        this(ModelPageProcessor.create(site, pageModels));
        this.modelPipeline = new ModelPipeline();
        super.pipeline(modelPipeline);
        if (pageModelPipeline != null) {
            for (Class modelClass : pageModels) {
                this.modelPipeline.put(modelClass, pageModelPipeline);
            }
        }
    }

    /** Creates a spider with no model pipeline attached. */
    public static OOSpider create(Site site, Class... pageModels) {
        return new OOSpider(site, null, pageModels);
    }

    /** Creates a spider routing every model through one PageModelPipeline. */
    public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        return new OOSpider(site, pageModelPipeline, pageModels);
    }

    /** Registers additional model classes after construction. */
    public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
        for (Class modelClass : pageModels) {
            modelPageProcessor.addPageModel(modelClass);
            modelPipeline.put(modelClass, pageModelPipeline);
        }
        return this;
    }
}
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Core logic of the annotation model: converts an annotated POJO class into an
 * extractor that matches URLs and populates the POJO's fields from a page.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 9:33 PM <br>
 */
class PageModelExtractor {

    // Compiled from @TargetUrl.value(); a page must match one of these to be processed.
    private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();

    private Selector targetUrlRegionSelector;

    // Compiled from @HelpUrl.value(); used elsewhere for link discovery.
    private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();

    private Selector helpUrlRegionSelector;

    private Class clazz;

    private List<FieldExtractor> fieldExtractors;

    // Optional class-level @ExtractBy: restricts/repeats extraction over page regions.
    private Extractor extractor;

    public static PageModelExtractor create(Class clazz) {
        PageModelExtractor pageModelExtractor = new PageModelExtractor();
        pageModelExtractor.init(clazz);
        return pageModelExtractor;
    }

    /**
     * Reads the class-level annotations, then builds one FieldExtractor per
     * annotated field, validating that field types match single/multi extraction.
     */
    private void init(Class clazz) {
        this.clazz = clazz;
        initClassExtractors();
        fieldExtractors = new ArrayList<FieldExtractor>();
        for (Field field : clazz.getDeclaredFields()) {
            field.setAccessible(true);
            FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
            FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
            // @ExtractBy2/@ExtractBy3 chain extra selectors after the primary one.
            if (fieldExtractor != null) {
                addAnnotationExtractBy2(fieldExtractor);
                addAnnotationExtractBy3(fieldExtractor);
            }
            fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
            if (fieldExtractor != null) {
                // Single extraction requires a String field, multi extraction a List field.
                if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
                    throw new IllegalStateException("Field " + field.getName() + " must be string");
                } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
                    throw new IllegalStateException("Field " + field.getName() + " must be list");
                }
                fieldExtractors.add(fieldExtractor);
            }
        }
    }

    /**
     * Builds a Selector from an extraction-rule type name and expression.
     * The ExtractBy* annotations each declare their own Type enum with identical
     * constants (XPath, Regex, Css), so they are unified here by constant name;
     * anything other than "Css"/"Regex" falls back to XPath, matching the
     * annotations' defaults.
     */
    private static Selector compileSelector(String typeName, String value) {
        if ("Css".equals(typeName)) {
            return new CssSelector(value);
        }
        if ("Regex".equals(typeName)) {
            return new RegexSelector(value);
        }
        return new XpathSelector(value);
    }

    /**
     * Builds a FieldExtractor from @ExtractByUrl (regex against the page URL).
     * An empty pattern means "the whole URL".
     */
    private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
        ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
        if (extractByUrl != null) {
            String regexPattern = extractByUrl.value();
            if (regexPattern.trim().equals("")) {
                regexPattern = ".*";
            }
            fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
            bindSetter(clazz, field, fieldExtractor);
        }
        return fieldExtractor;
    }

    /**
     * Builds a FieldExtractor from @ExtractBy (selector against the extracted region).
     */
    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
        if (extractBy != null) {
            Selector selector = compileSelector(extractBy.type().name(), extractBy.value());
            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
            bindSetter(clazz, field, fieldExtractor);
        }
        return fieldExtractor;
    }

    /**
     * Chains the @ExtractBy2 selector (if present) after the field's current selector.
     */
    private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
        ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
        if (extractBy != null) {
            Selector selector = compileSelector(extractBy.type().name(), extractBy.value());
            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
        }
    }

    /**
     * Chains the @ExtractBy3 selector (if present) after the field's current selector.
     */
    private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
        ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
        if (extractBy != null) {
            Selector selector = compileSelector(extractBy.type().name(), extractBy.value());
            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
        }
    }

    /**
     * Builds a FieldExtractor from @ExtractByRaw (selector against the full raw page,
     * ignoring any class-level region).
     */
    private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
        ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
        if (extractByRaw != null) {
            Selector selector = compileSelector(extractByRaw.type().name(), extractByRaw.value());
            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
            bindSetter(clazz, field, fieldExtractor);
        }
        return fieldExtractor;
    }

    // Attaches the field's setter to the extractor when the class declares one.
    private static void bindSetter(Class clazz, Field field, FieldExtractor fieldExtractor) {
        Method setterMethod = getSetterMethod(clazz, field);
        if (setterMethod != null) {
            fieldExtractor.setSetterMethod(setterMethod);
        }
    }

    /**
     * Finds the conventional setter ("setFoo") declared on the class itself,
     * or returns null when there is none.
     */
    public static Method getSetterMethod(Class clazz, Field field) {
        String name = "set" + StringUtils.capitalize(field.getName());
        try {
            Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
            declaredMethod.setAccessible(true);
            return declaredMethod;
        } catch (NoSuchMethodException e) {
            return null;
        }
    }

    /**
     * Compiles the class-level annotations: @TargetUrl, @HelpUrl and @ExtractBy.
     * URL rules use a restricted regex dialect: "." is literal and "*" means
     * "any run of characters except quote/hash".
     */
    private void initClassExtractors() {
        Annotation annotation = clazz.getAnnotation(TargetUrl.class);
        if (annotation == null) {
            // No @TargetUrl: match every URL.
            targetUrlPatterns.add(Pattern.compile(".*"));
        } else {
            TargetUrl targetUrl = (TargetUrl) annotation;
            String[] value = targetUrl.value();
            for (String s : value) {
                targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
            }
            if (!targetUrl.sourceRegion().equals("")) {
                targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
            }
        }
        annotation = clazz.getAnnotation(HelpUrl.class);
        if (annotation != null) {
            HelpUrl helpUrl = (HelpUrl) annotation;
            String[] value = helpUrl.value();
            for (String s : value) {
                helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
            }
            if (!helpUrl.sourceRegion().equals("")) {
                helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
            }
        }
        annotation = clazz.getAnnotation(ExtractBy.class);
        if (annotation != null) {
            ExtractBy extractBy = (ExtractBy) annotation;
            // Fix: honor the annotation's type() instead of always assuming XPath.
            extractor = new Extractor(compileSelector(extractBy.type().name(), extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
        }
    }

    /**
     * Extracts model object(s) from a page.
     *
     * @return null when the URL does not match @TargetUrl (or a notNull field is
     *         missing); a single object normally; a List of objects when the
     *         class-level @ExtractBy is multi.
     */
    public Object process(Page page) {
        boolean matched = false;
        for (Pattern targetPattern : targetUrlPatterns) {
            if (targetPattern.matcher(page.getUrl().toString()).matches()) {
                matched = true;
            }
        }
        if (!matched) {
            return null;
        }
        if (extractor == null) {
            return processSingle(page, page.getHtml().toString());
        } else {
            if (extractor.multi) {
                List<Object> os = new ArrayList<Object>();
                List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
                for (String s : list) {
                    Object o = processSingle(page, s);
                    if (o != null) {
                        os.add(o);
                    }
                }
                return os;
            } else {
                String select = extractor.getSelector().select(page.getHtml().toString());
                Object o = processSingle(page, select);
                return o;
            }
        }
    }

    /**
     * Populates one instance of the model class from the given html region.
     *
     * @return the populated instance, or null when a notNull field yields nothing
     */
    private Object processSingle(Page page, String html) {
        Object o = null;
        try {
            o = clazz.newInstance();
            for (FieldExtractor fieldExtractor : fieldExtractors) {
                if (fieldExtractor.isMulti()) {
                    List<String> value;
                    switch (fieldExtractor.getSource()) {
                        case RawHtml:
                            value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
                            break;
                        case Html:
                            value = fieldExtractor.getSelector().selectList(html);
                            break;
                        case Url:
                            value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
                            break;
                        default:
                            value = fieldExtractor.getSelector().selectList(html);
                    }
                    if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
                        return null;
                    }
                    setField(o, fieldExtractor, value);
                } else {
                    String value;
                    switch (fieldExtractor.getSource()) {
                        case RawHtml:
                            value = fieldExtractor.getSelector().select(page.getHtml().toString());
                            break;
                        case Html:
                            value = fieldExtractor.getSelector().select(html);
                            break;
                        case Url:
                            value = fieldExtractor.getSelector().select(page.getUrl().toString());
                            break;
                        default:
                            value = fieldExtractor.getSelector().select(html);
                    }
                    if (value == null && fieldExtractor.isNotNull()) {
                        return null;
                    }
                    setField(o, fieldExtractor, value);
                }
            }
            if (AfterExtractor.class.isAssignableFrom(clazz)) {
                ((AfterExtractor) o).afterProcess(page);
            }
        } catch (InstantiationException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        } catch (InvocationTargetException e) {
            e.printStackTrace();
        }
        return o;
    }

    /**
     * Writes an extracted value into the model instance.
     * NOTE(review): when a setter exists it is invoked AND the field is then
     * written directly, so a setter that transforms its argument is clobbered —
     * preserved as-is for behavioral compatibility; confirm whether intended.
     */
    private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
        if (fieldExtractor.getSetterMethod() != null) {
            fieldExtractor.getSetterMethod().invoke(o, value);
        }
        fieldExtractor.getField().set(o, value);
    }

    Class getClazz() {
        return clazz;
    }

    List<Pattern> getTargetUrlPatterns() {
        return targetUrlPatterns;
    }

    List<Pattern> getHelpUrlPatterns() {
        return helpUrlPatterns;
    }

    Selector getTargetUrlRegionSelector() {
        return targetUrlRegionSelector;
    }

    Selector getHelpUrlRegionSelector() {
        return helpUrlRegionSelector;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Task;
/**
 * Pipeline counterpart of the annotation mode: receives fully-populated
 * page-model objects of type T instead of raw ResultItems.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 9:34 AM <br>
 */
public interface PageModelPipeline<T> {
    // Handles one extracted model object for the given crawl task.
    public void process(T t, Task task);
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Defines the extraction rule for a class or a field.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractBy {

    /**
     * The extraction rule (expression).
     *
     * @return the extraction rule
     */
    String value();

    public enum Type {XPath, Regex, Css}

    /**
     * Type of the extraction rule: XPath, CSS selector or regular expression.
     * Defaults to XPath.
     *
     * @return the rule type
     */
    Type type() default Type.XPath;

    /**
     * Whether this field is mandatory: when true and nothing is extracted,
     * the whole object is discarded. Defaults to false.
     *
     * @return whether the field is mandatory
     */
    boolean notNull() default false;

    /**
     * Whether to extract multiple results.<br>
     * On a field, the field must be a {@code List<String>} to hold them.<br>
     * On a class, it means several objects are extracted from a single page.<br>
     *
     * @return whether multiple results are extracted
     */
    boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * A secondary extraction rule chained AFTER a primary @ExtractBy/@ExtractByRaw
 * on the same field; it has no effect on its own.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy2 {

    // The extraction rule (expression).
    String value();

    public enum Type {XPath, Regex, Css}

    // Rule type: XPath, CSS selector or regex; defaults to XPath.
    Type type() default Type.XPath;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * A tertiary extraction rule chained AFTER @ExtractBy/@ExtractByRaw (and after
 * @ExtractBy2) on the same field; it has no effect on its own.<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy3 {

    // The extraction rule (expression).
    String value();

    public enum Type { XPath, Regex, Css}

    // Rule type: XPath, CSS selector or regex; defaults to XPath.
    Type type() default Type.XPath;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Extracts from the FULL raw page. Useful on a field when the class itself
 * carries a class-level @ExtractBy that narrows extraction to a region, but
 * this field still needs access to the whole page.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractByRaw {

    /**
     * The extraction rule (expression).
     *
     * @return the extraction rule
     */
    String value();

    public enum Type {XPath, Regex, Css}

    /**
     * Type of the extraction rule: XPath, CSS selector or regular expression.
     * Defaults to XPath.
     *
     * @return the rule type
     */
    Type type() default Type.XPath;

    /**
     * Whether this field is mandatory: when true and nothing is extracted,
     * the whole object is discarded. Defaults to false.
     *
     * @return whether the field is mandatory
     */
    boolean notNull() default false;

    /**
     * Whether to extract multiple results.<br>
     * The field must be a {@code List<String>} to hold them.<br>
     *
     * @return whether multiple results are extracted
     */
    boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Extraction rule applied to the page URL (regular expressions only).<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractByUrl{

    /**
     * The extraction rule, as a regular expression.
     * An empty value means "the whole URL".
     *
     * @return the extraction rule
     */
    String value() default "";

    /**
     * Whether this field is mandatory: when true and nothing is extracted,
     * the whole object is discarded. Defaults to false.
     *
     * @return whether the field is mandatory
     */
    boolean notNull() default false;

    /**
     * Whether to extract multiple results.<br>
     * On a field, the field must be a {@code List<String>} to hold them.<br>
     *
     * @return whether multiple results are extracted
     */
    boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Declares helper URLs: pages crawled not for extraction themselves but because
 * they link to target pages.<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface HelpUrl {

    /**
     * List of URL rules for this class.<br>
     * webmagic uses a restricted regex dialect: "." matches only a literal "."
     * and "\*" stands for ".\*" — e.g. "http://\*.oschina.net/\*" matches every
     * URL under any oschina subdomain.<br>
     *
     * @return the URL rules
     */
    String[] value();

    /**
     * Region of the page from which URLs are discovered (XPath only).
     * @return the URL discovery region
     */
    String sourceRegion() default "";
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Declares which URLs a model class extracts from; sourceRegion can narrow the
 * page area (XPath) from which further URLs are discovered.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface TargetUrl {

    /**
     * List of URL rules for this class.<br>
     * webmagic uses a restricted regex dialect: "." matches only a literal "."
     * and "\*" stands for ".\*" — e.g. "http://\*.oschina.net/\*" matches every
     * URL under any oschina subdomain.<br>
     *
     * @return the URL rules
     */
    String[] value();

    /**
     * Region of the page from which URLs are discovered (XPath only).
     * @return the URL discovery region
     */
    String sourceRegion() default "";
}
<html>
<body>
webmagic注解抓取方式所定义的注解。
</body>
</html>
<html>
<body>
webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。
</body>
</html>
package us.codecraft.webmagic.pipeline;
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
......@@ -13,48 +12,50 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
* JSON格式持久化到文件的接口。
*
* @author code4crafter@gmail.com <br>
* Date: 13-6-8
* Time: 下午9:00
* Date: 13-4-21
* Time: 下午6:28
*/
public class FreemarkerPipeline implements Pipeline {
public class JsonFilePipeline implements Pipeline {
private Configuration configuration;
private String path = "/data/webmagic/";
private Template template;
private Logger logger = Logger.getLogger(getClass());
private String path = "/data/temp/webmagic/ftl/";
/**
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
*/
public JsonFilePipeline() {
public FreemarkerPipeline(String template, String path) throws IOException {
configuration = new Configuration();
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
this.template = configuration.getTemplate(template);
this.path = path;
new File(path);
}
public FreemarkerPipeline(String template) throws IOException {
this(template, "/data/temp/webmagic/ftl/");
/**
* 新建一个FilePipeline
*
* @param path 文件保存路径
*/
public JsonFilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
}
@Override
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
String path = this.path + "" + task.getUUID() + "/";
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
template.process(resultItems.getAll(), printWriter);
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
printWriter.write(JSON.toJSONString(resultItems.getAll()));
printWriter.close();
} catch (TemplateException e) {
} catch (IOException e) {
e.printStackTrace();
logger.warn("write file error", e);
}
}
}
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline that reassembles paginated results: buffers PagedModel parts until
 * every page of one logical item has been seen, then combines them.<br>
 * Do NOT use this with the redis-based distributed crawler (state is local).<br>
 *
 * NOTE(review): pageMap/objectMap are never cleared after a combine, so memory
 * grows with the number of distinct page keys — confirm acceptable for long runs.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 5:15 PM <br>
 */
public class PagedPipeline implements Pipeline {

    // pageKey -> (page -> seen?) : tracks which pages of an item have arrived.
    private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    // pageKey -> (page -> partial model) : buffers parts awaiting combination.
    private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> resultItemsAll = resultItems.getAll();
        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
        while (iterator.hasNext()) {
            handleObject(iterator);
        }
    }

    /**
     * Handles one result entry. May remove the entry (via iterator.remove) while
     * its page group is incomplete, or replace its value with the combined model
     * once all referenced pages have been seen. Non-PagedModel values pass through.
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (o instanceof PagedModel) {
            PagedModel pagedModel = (PagedModel) o;
            // Mark this page as seen.
            pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
            if (pagedModel.getOtherPages() != null) {
                for (String otherPage : pagedModel.getOtherPages()) {
                    Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
                    if (aBoolean == null) {
                        // Register referenced pages as expected but not yet seen.
                        pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
                    }
                }
            }
            //check if all pages are processed
            Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
            objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
            if (booleanMap == null) {
                return;
            }
            for (Map.Entry<String, Boolean> stringBooleanEntry : booleanMap.entrySet()) {
                if (!stringBooleanEntry.getValue()) {
                    // Some page still missing: withhold this entry from the results.
                    iterator.remove();
                    return;
                }
            }
            // All pages seen: order parts by page id (numeric when parseable,
            // lexicographic otherwise) and fold them into one combined model.
            List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
            entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
            if (entryList.size() != 0) {
                Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
                    @Override
                    public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                        try {
                            int i1 = Integer.parseInt(o1.getKey());
                            int i2 = Integer.parseInt(o2.getKey());
                            return i1 - i2;
                        } catch (NumberFormatException e) {
                            return o1.getKey().compareTo(o2.getKey());
                        }
                    }
                });
                PagedModel value = entryList.get(0).getValue();
                for (int i = 1; i < entryList.size(); i++) {
                    value = value.combine(entryList.get(i).getValue());
                }
                objectEntry.setValue(value);
            }
        }
    }
}
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
......@@ -46,6 +46,9 @@ public class FileCacheQueueScheduler implements Scheduler {
private Set<String> urls;
public FileCacheQueueScheduler(String filePath) {
if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){
filePath+="/";
}
this.filePath = filePath;
}
......
package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.schedular.Scheduler;
/**
* 使用redis管理url,构建一个分布式的爬虫。<br>
*
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 上午7:07 <br>
*/
public class RedisScheduler implements Scheduler {
......@@ -22,6 +23,8 @@ public class RedisScheduler implements Scheduler {
private static final String SET_PREFIX = "set_";
private static final String ITEM_PREFIX = "item_";
public RedisScheduler(String host) {
pool = new JedisPool(new JedisPoolConfig(), host);
}
......@@ -33,7 +36,12 @@ public class RedisScheduler implements Scheduler {
if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
//使用List保存队列
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
if (request.getExtras() != null) {
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
byte[] bytes = JSON.toJSONString(request).getBytes();
jedis.set(key.getBytes(), bytes);
}
}
pool.returnResource(jedis);
}
......@@ -42,10 +50,16 @@ public class RedisScheduler implements Scheduler {
public synchronized Request poll(Task task) {
Jedis jedis = pool.getResource();
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
pool.returnResource(jedis);
if (url==null){
if (url == null) {
return null;
}
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
byte[] bytes = jedis.get(key.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes),Request.class);
return o;
}
pool.returnResource(jedis);
return new Request(url);
}
}
package us.codecraft.webmagic.utils;
import java.util.Map;
/**
 * A two-level map: values are addressed by a pair of keys (K1, K2).
 * Inner maps are instantiated from the prototype map class supplied to
 * MultiKeyMapBase (HashMap by default).
 *
 * @author code4crafter@gmail.com
 * Date Dec 14, 2012
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {

    // Backing map: first key -> (second key -> value).
    private Map<K1, Map<K2, V>> map;

    public DoubleKeyMap() {
        init();
    }

    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        init();
    }

    // Lazily creates the backing map when none was supplied.
    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * init map with protoMapClass
     *
     * @param protoMapClass prototype class for the inner maps
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /**
     * @param key first-level key
     * @return the inner map for key, or null
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * @param key1 first-level key
     * @param key2 second-level key
     * @return value, or null when either level is absent
     */
    public V get(K1 key1, K2 key2) {
        if (get(key1) == null) {
            return null;
        }
        return get(key1).get(key2);
    }

    /**
     * Replaces the whole inner map for key1.
     * Fix: this method previously called itself, causing infinite recursion
     * (StackOverflowError) on any invocation; it now stores the submap.
     *
     * @param key1   first-level key
     * @param submap inner map to install
     * @return null (there is no single previous V to report when replacing a submap)
     */
    public V put(K1 key1, Map<K2, V> submap) {
        map.put(key1, submap);
        return null;
    }

    /**
     * @param key1  first-level key
     * @param key2  second-level key
     * @param value value to store (inner map created on demand)
     * @return the previous value at (key1, key2), or null
     */
    public V put(K1 key1, K2 key2, V value) {
        if (map.get(key1) == null) {
            map.put(key1, this.<K2, V>newMap());
        }
        return get(key1).put(key2, value);
    }

    /**
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the removed value, or null
     */
    public V remove(K1 key1, K2 key2) {
        if (get(key1) == null) {
            return null;
        }
        V remove = get(key1).remove(key2);
        // Also reclaim the inner map once it becomes empty.
        if (get(key1).size() == 0) {
            remove(key1);
        }
        return remove;
    }

    /**
     * @param key1 first-level key
     * @return the removed inner map, or null
     */
    public Map<K2, V> remove(K1 key1) {
        Map<K2, V> remove = map.remove(key1);
        return remove;
    }
}
package us.codecraft.webmagic.utils;
/**
* @author code4crafter@gmail.com
* Date Dec 14, 2012
*/
import java.util.HashMap;
import java.util.Map;
/**
 * Base class for multi-key maps: holds the prototype Map class from which
 * inner maps are instantiated (HashMap by default).
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    @SuppressWarnings("rawtypes")
    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;

    public MultiKeyMapBase() {
    }

    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Instantiates a new map from the prototype class.
     *
     * @throws IllegalArgumentException when the prototype class cannot be
     *                                  instantiated (now carries the reflective
     *                                  failure as its cause — previously lost)
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        }
    }
}
\ No newline at end of file
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 上午7:51 <br>
*/
public class RedisSchedulerTest {
......@@ -35,8 +35,11 @@ public class RedisSchedulerTest {
return null;
}
};
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
request.putExtra("1","2");
redisScheduler.push(request, task);
Request poll = redisScheduler.poll(task);
System.out.println(poll);
}
}
webmagic-lucene
--------
尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic</artifactId>
<groupId>us.codecraft</groupId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-lucene</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.4.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic.pipeline;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Pipeline that indexes every extracted result into an in-memory Lucene index
 * (RAMDirectory) and lets callers search it while the crawl runs.
 *
 * NOTE(review): a new IndexWriter is opened and closed for every processed
 * page, and search() will throw if called before the first commit — confirm
 * this is acceptable for the intended demo usage.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-5 <br>
 * Time: 2:11 PM <br>
 */
public class LucenePipeline implements Pipeline {

    private Directory directory;

    private Analyzer analyzer;

    private IndexWriterConfig config;

    // Sets up the in-memory index and the standard analyzer (Lucene 4.4).
    private void init() throws IOException {
        analyzer = new StandardAnalyzer(Version.LUCENE_44);
        directory = new RAMDirectory();
        config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    }

    public LucenePipeline() {
        try {
            init();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the index for documents whose fieldName matches the query value.
     *
     * @param fieldName field to search
     * @param value     query string (parsed by QueryParser)
     * @return up to 1000 matching documents
     * @throws IOException    on index access failure
     * @throws ParseException when the query string is invalid
     */
    public List<Document> search(String fieldName, String value) throws IOException, ParseException {
        List<Document> documents = new ArrayList<Document>();
        DirectoryReader ireader = DirectoryReader.open(directory);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        // Parse a simple query that searches for "text":
        QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
        Query query = parser.parse(value);
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        // Iterate through the results:
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            documents.add(hitDoc);
        }
        ireader.close();
        return documents;
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()){
            return;
        }
        // One Lucene document per result; every extracted field becomes a stored text field.
        Document doc = new Document();
        Map<String,Object> all = resultItems.getAll();
        if (all==null){
            return;
        }
        for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
            doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
        }
        try {
            IndexWriter indexWriter = new IndexWriter(directory, config);
            indexWriter.addDocument(doc);
            indexWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package us.codecraft.webmagic.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.LucenePipeline;
import java.io.IOException;
import java.util.List;
/**
 * Annotation-mode demo model: extracts title and content from oschina blog
 * posts and feeds them into an in-memory Lucene index, which the main method
 * then polls and searches forever.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    @ExtractBy("//title")
    private String title;

    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("OschinaBlog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }

    public static void main(String[] args) {
        LucenePipeline lucenePipeline = new LucenePipeline();
        // Crawl asynchronously; results are indexed by the pipeline as they arrive.
        OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(lucenePipeline).runAsync();
        // Poll the index every 3 seconds and print any titles matching "webmagic".
        while (true) {
            try {
                List<Document> hits = lucenePipeline.search("title", "webmagic");
                System.out.println(hits);
                Thread.sleep(3000);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
webmagic-plugin
-------
webmagic的插件模块。
目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。
另外有一个使用Selenium来动态渲染页面的模块在开发中。
\ No newline at end of file
<item>
<title>$it.Title</title>
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
<description></description>
<content:encoded><![CDATA[${text}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
$tags
</item>
\ No newline at end of file
package us.codecraft.webmagic;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import java.io.IOException;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-6-9
* Time: 上午7:14
*/
public class FreemarkerPipelineTest {

    /**
     * Verifies that the "wordpress.ftl" template can be located on the
     * classpath and parsed; constructing the pipeline throws IOException
     * otherwise, failing the test.
     */
    @Test
    public void testTemplateLoad() throws IOException {
        new FreemarkerPipeline("wordpress.ftl");
    }
}
webmagic-selenium
-------
尝试使用selenium来进行页面动态渲染,开发中。
\ No newline at end of file
webmagic-samples
-------
webmagic的一些示例。包括抓取常见博客、信息类网站等。
\ No newline at end of file
......@@ -5,7 +5,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......@@ -19,12 +19,7 @@
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-misc</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
......@@ -33,4 +28,23 @@
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>./lib/</classpathPrefix>
<mainClass>us.codecraft.webmagic.main.QuickStarter</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package us.codecraft.webmagic.main;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.samples.IteyeBlog;
import us.codecraft.webmagic.model.samples.News163;
import us.codecraft.webmagic.model.samples.OschinaBlog;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-7 <br>
* Time: 下午9:24 <br>
*/
/**
 * Interactive demo entry point: lets the user pick one of the bundled page
 * models from stdin, crawls its site for 20 seconds, then exits.
 */
public class QuickStarter {

    // Menu key -> annotation-driven page-model class.
    private static Map<String, Class<?>> clazzMap;
    // Menu key -> start URL matching the class in clazzMap.
    private static Map<String, String> urlMap;

    private static void init() {
        // LinkedHashMap keeps the menu in insertion order.
        clazzMap = new LinkedHashMap<String, Class<?>>();
        clazzMap.put("1", OschinaBlog.class);
        clazzMap.put("2", IteyeBlog.class);
        clazzMap.put("3", News163.class);
        urlMap = new LinkedHashMap<String, String>();
        urlMap.put("1", "http://my.oschina.net/flashsword/blog");
        urlMap.put("2", "http://flashsword20.iteye.com/");
        urlMap.put("3", "http://news.163.com/");
    }

    public static void main(String[] args) {
        init();
        String key = readKey();
        System.out.println("The demo started and will last 20 seconds...");
        //Start spider
        OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync();
        try {
            Thread.sleep(20000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println("The demo stopped!");
        System.out.println("To more usage, try to customize your own Spider!");
        System.exit(0);
    }

    /**
     * Prints the menu and blocks until the user enters a valid key.
     *
     * @return a key present in {@link #clazzMap}
     */
    private static String readKey() {
        Scanner stdin = new Scanner(System.in);
        System.out.println("Choose a Spider demo:");
        for (Map.Entry<String, Class<?>> classEntry : clazzMap.entrySet()) {
            System.out.println(classEntry.getKey() + "\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
        }
        String key = null;
        while (key == null) {
            key = stdin.nextLine();
            if (clazzMap.get(key) == null) {
                System.out.println("Invalid choice!");
                key = null;
            }
        }
        return key;
    }
}
package us.codecraft.webmagic.model.samples;
/**
 * Common read-only view of a crawled blog entry, implemented by the
 * annotation-driven page models (e.g. IteyeBlog).
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 8:10 AM <br>
 */
public interface Blog {

    /** Returns the blog title extracted from the page. */
    public String getTitle();

    /** Returns the blog content extracted from the page. */
    public String getContent();
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-2 <br>
* Time: 上午7:52 <br>
*/
/**
 * Annotation-driven page model for iteye.com blog posts.
 */
@TargetUrl("http://*.iteye.com/blog/*")
public class IteyeBlog implements Blog {

    @ExtractBy("//title")
    private String title;

    @ExtractBy(value = "div#blog_content", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "IteyeBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    public static void main(String[] args) {
        // "http://*.iteye.com/blog" is not a fetchable URL — the "*" wildcard
        // is only meaningful inside @TargetUrl patterns. Start from a concrete
        // blog home page instead (the same one QuickStarter uses for this model).
        OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/"), IteyeBlog.class).run();
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy2;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.Collection;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* Time: 下午8:17 <br>
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel {

    // Article id shared by all pages of one multi-page article.
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    // Page number suffix; absent on the first page of an article.
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true, notNull = false)
    @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    @Override
    public String getPageKey() {
        return pageKey;
    }

    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    @Override
    public String getPage() {
        // The first page of an article carries no "_n" suffix in its URL.
        return page == null ? "1" : page;
    }

    /**
     * Merges two pages of the same article: keeps this page's title and
     * concatenates the contents in order.
     */
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 other = (News163) pagedModel;
        News163 merged = new News163();
        merged.title = this.title;
        merged.content = this.content + other.content;
        return merged;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("News163{").append("content='").append(content).append('\'');
        sb.append(", title='").append(title).append('\'');
        sb.append(", otherPage=").append(otherPage).append('}');
        return sb.toString();
    }

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
                .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 下午8:25 <br>
*/
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor {

    @ExtractBy("//img/@title")
    private String user;

    @ExtractBy("//div[@class='detail']")
    private String content;

    @Override
    public void afterProcess(Page page) {
        // Nothing to do: extraction is fully annotation-driven for this model.
    }

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-2 <br>
* Time: 上午7:52 <br>
*/
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    @ExtractBy("//title")
    private String title;

    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
    private List<String> tags;

    /**
     * Demo: crawl the blog and print each extracted model to the console.
     */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://my.oschina.net/flashsword/blog");
        OOSpider.create(site, new ConsolePageModelPipeline(), OschinaBlog.class).run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
* Author yihua.huang@dianping.com
* Date: 13-6-24
* Time: 下午2:12
*/
/**
 * Stress-test processor: follows every link it finds, starting from a few
 * portal pages.
 */
public class GlobalProcessor implements PageProcessor {

    // Built eagerly: the original lazily initialized this in getSite(), which
    // races when 10 worker threads call it concurrently and could create two
    // Site instances.
    private final Site site = Site.me().setDomain("www.2345.com").setSleepTime(0)
            .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
            .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    @Override
    public void process(Page page) {
        // Queue every link on the page; no field extraction is done here.
        final List<String> requests = page.getHtml().links().all();
        page.addTargetRequests(requests);
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Run one spider asynchronously and a second one in the foreground,
        // both against the same Redis queue, to exercise concurrent crawling.
        Spider.create(new GlobalProcessor()).thread(10)
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/webmagic/test/"))
                .runAsync();
        Spider.create(new GlobalProcessor()).thread(10)
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/webmagic/test/"))
                .run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-14 <br>
* Time: 上午8:33 <br>
*/
/**
 * Demo crawler for guoxue123.cn using the generic SimplePageProcessor,
 * with file-based result storage and a resumable URL queue.
 */
public class GuoxueProcessor {

    public static void main(String[] args) {
        // NOTE(review): this file imports FileCacheQueueScheduler from the
        // legacy "schedular" package while other modules use "scheduler" —
        // confirm which package exists in this version.
        SimplePageProcessor processor =
                new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
        // The site is GBK-encoded; throttle requests to 500 ms.
        processor.getSite().setCharset("GBK").setSleepTime(500);
        Spider spider = Spider.create(processor);
        spider.pipeline(new FilePipeline("/data/webmagic/"))
                .scheduler(new FileCacheQueueScheduler("/data/webmagic/"))
                .run();
    }
}
......@@ -7,8 +7,8 @@ import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 上午7:31 <br>
*/
public class IteyeBlogProcessor implements PageProcessor {
......
......@@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -13,18 +15,24 @@ import java.util.List;
*/
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
@Override
public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent());
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
}
}
......@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-6-9
* Time: 上午8:02
*/
public class DiandianProcessorTest {

    @Ignore
    @Test
    public void test() throws IOException {
        DiandianBlogProcessor processor = new DiandianBlogProcessor();
        // FreemarkerPipeline renders results with an .ftl template found on
        // the classpath under ftl/; output goes to
        // /data/temp/webmagic/ftl/[domain] by default.
        FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
        // ConsolePipeline echoes results to stdout; FileCacheQueueScheduler
        // persists the URL queue under /data/temp/webmagic/cache so the crawl
        // can resume after a restart.
        Spider.create(processor)
                .pipeline(new ConsolePipeline())
                .pipeline(pipeline)
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                .run();
    }
}
......@@ -4,9 +4,9 @@ import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......@@ -21,7 +21,7 @@ public class DiaoyuwengProcessorTest {
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
......
......@@ -4,9 +4,9 @@ import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......@@ -22,9 +22,8 @@ public class SinablogProcessorTest {
public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
//pipeline是抓取结束后的处理
//ftl文件放到classpath:ftl/文件夹下
//默认放到/data/temp/webmagic/ftl/[domain]目录下
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
//默认放到/data/webmagic/ftl/[domain]目录下
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
//Spider.me()是简化写法,其实就是new一个啦
//Spider.pipeline()设定一个pipeline,支持链式调用
//ConsolePipeline输出结果到控制台
......
webmagic-saxon
-------
webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。
\ No newline at end of file
......@@ -5,16 +5,11 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<packaging>pom</packaging>
<modelVersion>4.0.0</modelVersion>
<modules>
<module>webmagic-misc</module>
<module>webmagic-selenium</module>
</modules>
<artifactId>webmagic-plugin</artifactId>
<artifactId>webmagic-saxon</artifactId>
<dependencies>
<dependency>
......@@ -22,6 +17,10 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
......
package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.apache.log4j.Logger;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Selector with XPath 2.0 support, wrapping HtmlCleaner (HTML -> DOM) and
 * Saxon HE (XPath evaluation).
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 9:39 AM
 */
public class Xpath2Selector implements Selector {

    private String xpathStr;

    private XPathExpression xPathExpression;

    private Logger logger = Logger.getLogger(getClass());

    /**
     * @param xpathStr the XPath expression to compile
     * @throws IllegalArgumentException if the expression cannot be compiled
     */
    public Xpath2Selector(String xpathStr) {
        this.xpathStr = xpathStr;
        try {
            init();
        } catch (XPathExpressionException e) {
            throw new IllegalArgumentException("XPath error!", e);
        }
    }

    /**
     * Singleton namespace context exposing the "fn" and "xslt" prefixes so
     * XPath 2.0 function calls resolve.
     */
    enum XPath2NamespaceContext implements NamespaceContext {

        INSTANCE;

        private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();

        private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();

        private void put(String prefix, String namespaceURI) {
            prefix2NamespaceMap.put(prefix, namespaceURI);
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null) {
                prefixes = new ArrayList<String>();
                namespace2PrefixMap.put(namespaceURI, prefixes);
            }
            prefixes.add(prefix);
        }

        private XPath2NamespaceContext() {
            put("fn", NamespaceConstant.FN);
            put("xslt", NamespaceConstant.XSLT);
        }

        @Override
        public String getNamespaceURI(String prefix) {
            return prefix2NamespaceMap.get(prefix);
        }

        @Override
        public String getPrefix(String namespaceURI) {
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null || prefixes.size() < 1) {
                return null;
            }
            return prefixes.get(0);
        }

        @Override
        public Iterator getPrefixes(String namespaceURI) {
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null || prefixes.size() < 1) {
                return null;
            }
            return prefixes.iterator();
        }
    }

    private void init() throws XPathExpressionException {
        // Saxon's XPathEvaluator provides XPath 2.0 semantics.
        XPathEvaluator xPathEvaluator = new XPathEvaluator();
        xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
        xPathExpression = xPathEvaluator.compile(xpathStr);
    }

    /**
     * Cleans the HTML into a DOM and evaluates the compiled expression.
     * Tries NODESET first and falls back to STRING for expressions whose
     * result is not a node-set. Shared by select() and selectList(), which
     * previously duplicated this logic.
     */
    private Object evaluate(String text) throws Exception {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(text);
        Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        try {
            return xPathExpression.evaluate(document, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            return xPathExpression.evaluate(document, XPathConstants.STRING);
        }
    }

    /** Serializes a DOM node back to markup, omitting the XML declaration. */
    private String serialize(Node item, Transformer transformer) throws Exception {
        StreamResult xmlOutput = new StreamResult(new StringWriter());
        transformer.transform(new DOMSource(item), xmlOutput);
        return xmlOutput.getWriter().toString();
    }

    /**
     * Returns the first match: text content for attribute/text nodes, the
     * serialized markup for element nodes, or the string value for non-node
     * results. Returns null when nothing matches or evaluation fails
     * (failures are logged).
     */
    @Override
    public String select(String text) {
        try {
            Object result = evaluate(text);
            if (result instanceof NodeList) {
                NodeList nodeList = (NodeList) result;
                if (nodeList.getLength() == 0) {
                    return null;
                }
                Node item = nodeList.item(0);
                if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
                    return item.getTextContent();
                } else {
                    Transformer transformer = TransformerFactory.newInstance().newTransformer();
                    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
                    return serialize(item, transformer);
                }
            }
            return result.toString();
        } catch (Exception e) {
            logger.error("select text error! " + xpathStr, e);
        }
        return null;
    }

    /**
     * Returns all matches, converted with the same rules as {@link #select}.
     * Returns an empty list when nothing matches or evaluation fails.
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            Object result = evaluate(text);
            if (result instanceof NodeList) {
                NodeList nodeList = (NodeList) result;
                // One Transformer reused across all nodes of this call.
                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
                for (int i = 0; i < nodeList.getLength(); i++) {
                    Node item = nodeList.item(i);
                    if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
                        results.add(item.getTextContent());
                    } else {
                        results.add(serialize(item, transformer));
                    }
                }
            } else {
                results.add(result.toString());
            }
        } catch (Exception e) {
            logger.error("select text error! " + xpathStr, e);
        }
        return results;
    }
}
package us.codecraft.webmagic.selector;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
/**
......@@ -1168,7 +1169,7 @@ public class XpathSelectorTest {
+ " var location = window.location;\n"
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
+ " pre.writeAttribute('codeable_id', post_id);\n"
+ " pre.writeAttribute('codeable_type', \"Blog\");\n"
+ " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
+ " pre.writeAttribute('source_url', source_url);\n"
+ " pre.writeAttribute('pre_index', index);\n"
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"
......@@ -1354,4 +1355,41 @@ public class XpathSelectorTest {
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
}
@Test
public void testXPath2() {
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
"<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
System.out.println(xpathSelector.select(text));
}
@Test
public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
String select = xpath2Selector.select(html);
Assert.assertNotNull(select);
}
@Ignore("take long time")
@Test
public void performanceTest() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a");
long time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpath2Selector.selectList(html);
}
System.out.println(System.currentTimeMillis()-time);
XpathSelector xpathSelector = new XpathSelector("//a");
time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpathSelector.selectList(html);
}
System.out.println(System.currentTimeMillis()-time);
time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpath2Selector.selectList(html);
}
System.out.println(System.currentTimeMillis()-time);
}
}
webmagic-selenium
-------
webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。
\ No newline at end of file
......@@ -2,13 +2,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-plugin</artifactId>
<version>0.1.0</version>
<artifactId>webmagic</artifactId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId>
<dependencies>
......@@ -17,7 +17,15 @@
<artifactId>selenium-java</artifactId>
<version>2.33.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger;
import org.openqa.selenium.By;
......@@ -21,8 +21,8 @@ import java.util.Map;
* 使用Selenium调用浏览器进行渲染。目前仅支持chrome。<br>
* 需要下载Selenium driver支持。<br>
*
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:37 <br>
*/
public class SeleniumDownloader implements Downloader, Destroyable {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
......@@ -11,8 +11,8 @@ import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:41 <br>
*/
class WebDriverPool {
......
package us.codecraft.webmagic.selenium;
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
......@@ -13,8 +13,8 @@ import java.util.HashMap;
import java.util.Map;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午12:27 <br>
*/
public class SeleniumTest {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.junit.Ignore;
import org.junit.Test;
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午2:46 <br>
*/
public class SeleniumDownloaderTest {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.WebDriver;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午2:12 <br>
*/
public class WebDriverPoolTest {
......
......@@ -3,16 +3,15 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
/**
* 花瓣网抽取器。<br>
* 使用Selenium做页面动态渲染。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午4:08 <br>
*/
public class HuabanProcessor implements PageProcessor {
......@@ -39,7 +38,6 @@ public class HuabanProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.scheduler(new RedisScheduler("localhost"))
.pipeline(new FilePipeline("/data/webmagic/test/"))
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.runAsync();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment