Commit 23f6bb8d authored by yihua.huang's avatar yihua.huang

Merge branch 'annotation'

parents 9338e13c 21eca688
......@@ -3,15 +3,15 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.1.0</version>
<version>0.2.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>webmagic</artifactId>
<modules>
<module>webmagic-core</module>
<module>webmagic-plugin/</module>
<module>webmagic-samples/</module>
<modules>
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-samples/</module>
</modules>
<dependencyManagement>
......@@ -27,6 +27,11 @@
<artifactId>httpclient</artifactId>
<version>4.2.4</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
......@@ -45,7 +50,7 @@
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.4</version>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
......@@ -75,6 +80,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.8</version>
<executions>
<execution>
<id>copy-dependencies</id>
......@@ -94,6 +100,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
......@@ -101,6 +108,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
......@@ -113,6 +121,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
<executions>
<execution>
<id>attach-javadocs</id>
......@@ -125,11 +137,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.0-beta-7</version>
<version>2.4.1</version>
</plugin>
</plugins>
</build>
</project>
This diff is collapsed.
webmagic-core
-------
webmagic核心部分。
\ No newline at end of file
webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
\ No newline at end of file
......@@ -5,7 +5,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -9,7 +9,7 @@ import java.util.List;
/**
* <pre>
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
* Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
* 主要方法:
* {@link #getUrl()} 获取页面的Url
......@@ -19,6 +19,7 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
*
* </pre>
*
* @author code4crafter@gmail.com <br>
*/
public class Page {
......@@ -36,9 +37,16 @@ public class Page {
public Page() {
}
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
}
/**
* 保存抽取的结果
* @param key 结果的key
*
* @param key 结果的key
* @param field 结果的value
*/
public void putField(String key, Object field) {
......@@ -47,6 +55,7 @@ public class Page {
/**
* 获取页面的html内容
*
* @return html 页面的html内容
*/
public Selectable getHtml() {
......@@ -63,6 +72,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requests 待抓取的链接
*/
public void addTargetRequests(List<String> requests) {
......@@ -79,6 +89,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requestString 待抓取的链接
*/
public void addTargetRequest(String requestString) {
......@@ -93,6 +104,7 @@ public class Page {
/**
* 添加待抓取的页面,在需要传递附加信息时使用
*
* @param request 待抓取的页面
*/
public void addTargetRequest(Request request) {
......@@ -103,6 +115,7 @@ public class Page {
/**
* 获取页面的Url
*
* @return url 当前页面的url,可用于抽取
*/
public Selectable getUrl() {
......@@ -111,6 +124,7 @@ public class Page {
/**
* 设置url
*
* @param url
*/
public void setUrl(Selectable url) {
......@@ -119,6 +133,7 @@ public class Page {
/**
* 获取抓取请求
*
* @return request 抓取请求
*/
public Request getRequest() {
......
package us.codecraft.webmagic;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
......@@ -18,40 +22,95 @@ package us.codecraft.webmagic;
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午11:37
* Date: 13-4-21
* Time: 上午11:37
*/
public class Request {
public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
private String url;
private Object[] extra;
/**
* 额外参数,可以保存一些需要的上下文信息
*/
private Map<String, Object> extras;
private double priority;
public Request() {
}
/**
* 构建一个request对象
*
* @param url 必须参数,待抓取的url
* @param extra 额外参数,可以保存一些需要的上下文信息
*/
public Request(String url, Object... extra) {
public Request(String url) {
this.url = url;
this.extra = extra;
}
/**
* 获取预存的对象
* @return object[] 预存的对象数组
*/
public Object[] getExtra() {
return extra;
public double getPriority() {
return priority;
}
public Request setPriority(double priority) {
this.priority = priority;
return this;
}
public Object getExtra(String key) {
if (extras == null) {
return null;
}
return extras.get(key);
}
public Request putExtra(String key, Object value) {
if (extras == null) {
extras = new HashMap<String, Object>();
}
extras.put(key, value);
return this;
}
/**
* 获取待抓取的url
*
* @return url 待抓取的url
*/
public String getUrl() {
return url;
}
/**
 * Two requests are equal when they are of the same class and target the same url.
 * A request whose url has not been set yet (no-arg constructor) only equals
 * another request whose url is also unset.
 *
 * @param o the object to compare with
 * @return true if {@code o} is a request for the same url
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    Request request = (Request) o;
    // url may still be null when the no-arg constructor was used, so compare null-safely
    return url == null ? request.url == null : url.equals(request.url);
}
public Map<String, Object> getExtras() {
return extras;
}
/**
 * Hash code derived from the url only, matching {@link #equals(Object)}.
 *
 * @return the url's hash code, or 0 when the url has not been set
 */
@Override
public int hashCode() {
    // url may be null when the no-arg constructor was used
    return url == null ? 0 : url.hashCode();
}
public void setExtras(Map<String, Object> extras) {
this.extras = extras;
}
public void setUrl(String url) {
this.url = url;
}
}
......@@ -5,8 +5,8 @@ import java.util.Map;
/**
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 下午12:20 <br>
*/
public class ResultItems {
......
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
/**
......@@ -90,6 +92,11 @@ public class Site {
* @return 已设置的domain
*/
public String getDomain() {
    // Lazily derive the domain from the first start url when none was set explicitly.
    boolean domainMissing = (domain == null);
    if (domainMissing && !startUrls.isEmpty()) {
        String firstStartUrl = startUrls.get(0);
        domain = UrlUtils.getDomain(firstStartUrl);
    }
    return domain;
}
......@@ -150,6 +157,7 @@ public class Site {
/**
* 获取初始页面的地址列表
*
* @return 初始页面的地址列表
*/
public List<String> getStartUrls() {
......@@ -158,6 +166,7 @@ public class Site {
/**
* 增加初始页面的地址,可反复调用此方法增加多个初始地址。
*
* @param startUrl 初始页面的地址
* @return this
*/
......@@ -179,6 +188,7 @@ public class Site {
/**
* 获取两次抓取之间的间隔
*
* @return 两次抓取之间的间隔,单位毫秒
*/
public int getSleepTime() {
......@@ -187,6 +197,7 @@ public class Site {
/**
* 获取重新下载的次数,默认为0
*
* @return 重新下载的次数
*/
public int getRetryTimes() {
......@@ -195,6 +206,7 @@ public class Site {
/**
* 设置获取重新下载的次数,默认为0
*
* @return this
*/
public Site setRetryTimes(int retryTimes) {
......@@ -219,7 +231,7 @@ public class Site {
return true;
}
public Task toTask(){
public Task toTask() {
return new Task() {
@Override
public String getUUID() {
......
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.QueueScheduler;
import us.codecraft.webmagic.schedular.Scheduler;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.ThreadUtils;
import java.util.ArrayList;
......@@ -228,8 +228,10 @@ public class Spider implements Runnable, Task {
}
pageProcessor.process(page);
addRequest(page);
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
if (!page.getResultItems().isSkip()){
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
sleep(site.getSleepTime());
}
......@@ -283,6 +285,11 @@ public class Spider implements Runnable, Task {
return this;
}
public Spider clearPipeline(){
pipelines=new ArrayList<Pipeline>();
return this;
}
@Override
public String getUUID() {
if (uuid != null) {
......
......@@ -2,8 +2,8 @@ package us.codecraft.webmagic.downloader;
/**
* 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午3:10 <br>
*/
public interface Destroyable {
......
......@@ -7,29 +7,18 @@ import java.util.Map;
/**
* 命令行输出抽取结果。可用于测试。<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:45
* Date: 13-4-21
* Time: 下午1:45
*/
public class ConsolePipeline implements Pipeline{
public class ConsolePipeline implements Pipeline {
@Override
public void process(ResultItems resultItems,Task task) {
if (resultItems.isSkip()){
return;
}
System.out.println("get page: "+resultItems.getRequest().getUrl());
public void process(ResultItems resultItems, Task task) {
System.out.println("get page: " + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {
Iterable value = (Iterable) entry.getValue();
System.out.println(entry.getKey() + ":");
for (Object o : value) {
System.out.println(o);
}
} else {
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
System.out.println(entry.getKey()+":\t"+entry.getValue());
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
}
}
......@@ -20,12 +20,12 @@ import java.util.Map;
*/
public class FilePipeline implements Pipeline {
private String path = "/data/temp/webmagic/";
private String path = "/data/webmagic/";
private Logger logger = Logger.getLogger(getClass());
/**
* 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
*/
public FilePipeline() {
......@@ -37,6 +37,9 @@ public class FilePipeline implements Pipeline {
* @param path 文件保存路径
*/
public FilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
}
......@@ -47,9 +50,6 @@ public class FilePipeline implements Pipeline {
if (!file.exists()) {
file.mkdirs();
}
if (resultItems.isSkip()) {
return;
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
......
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
......
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Selector that chains several selectors: the output of each selector is used
 * as the input of the next one.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 5:29 PM <br>
 */
public class AndSelector implements Selector {

    /** Selectors applied in order. */
    private List<Selector> selectors = new ArrayList<Selector>();

    /**
     * @param selectors the selectors to chain, applied in the given order
     */
    public AndSelector(Selector... selectors) {
        for (Selector selector : selectors) {
            this.selectors.add(selector);
        }
    }

    /**
     * Applies every selector in turn to the result of the previous one.
     *
     * @param text the text to select from
     * @return the final selection, or null as soon as the input or any intermediate result is null
     */
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            if (text == null) {
                return null;
            }
            text = selector.select(text);
        }
        return text;
    }

    /**
     * Applies the first selector to {@code text}, then every following selector to each
     * result of the previous step, flattening the results.
     *
     * @param text the text to select from
     * @return the results of the last selector; an empty list (never null) when the input
     *         is null, when there are no selectors, or when any step yields nothing
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        if (text == null) {
            // consistent with select(): nothing can be selected from a null input
            return results;
        }
        boolean first = true;
        for (Selector selector : selectors) {
            List<String> next;
            if (first) {
                next = selector.selectList(text);
                first = false;
            } else {
                next = new ArrayList<String>();
                for (String result : results) {
                    List<String> selected = selector.selectList(result);
                    // a selector may return null for "no match"; treat it as empty
                    if (selected != null) {
                        next.addAll(selected);
                    }
                }
            }
            if (next == null || next.isEmpty()) {
                // nothing left to feed into the remaining selectors
                return new ArrayList<String>();
            }
            results = next;
        }
        return results;
    }
}
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Selector that combines several alternative selectors, all applied to the same input.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 5:29 PM <br>
 */
public class OrSelector implements Selector {

    /** Alternative selectors, tried in order. */
    private List<Selector> selectors = new ArrayList<Selector>();

    /**
     * @param selectors the alternative selectors, tried in the given order
     */
    public OrSelector(Selector... selectors) {
        for (Selector selector : selectors) {
            this.selectors.add(selector);
        }
    }

    /**
     * Returns the result of the first selector that matches.
     *
     * @param text the text to select from
     * @return the first non-null selection, or null when no selector matches
     */
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            // keep the input untouched so every alternative sees the original text,
            // not the (null) result of a previous selector that failed to match
            String result = selector.select(text);
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Collects the results of all selectors applied to the same input.
     *
     * @param text the text to select from
     * @return the concatenated results of every selector, in selector order; never null
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        for (Selector selector : selectors) {
            List<String> strings = selector.selectList(text);
            // a selector may return null for "no match"; skip it instead of failing
            if (strings != null) {
                results.addAll(strings);
            }
        }
        return results;
    }
}
......@@ -8,7 +8,7 @@ import java.util.List;
* Date: 13-4-20
* Time: 下午8:02
*/
interface Selector {
public interface Selector {
public String select(String text);
......
webmagic-extension
-------
webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。
\ No newline at end of file
......@@ -4,24 +4,33 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-plugin</artifactId>
<version>0.1.0</version>
<artifactId>webmagic</artifactId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-misc</artifactId>
<artifactId>webmagic-extension</artifactId>
<dependencies>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.15</version>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.35</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic;
import java.util.Collection;
/**
 * Contract for a model whose content is spread over several pages and can be
 * merged back into one object.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 5:18 PM <br>
 */
public interface PagedModel {

    /**
     * @return the key shared by all pages that belong to the same logical object
     */
    String getPageKey();

    /**
     * @return identifiers of the other pages of the same object; may be null
     */
    Collection<String> getOtherPages();

    /**
     * @return the identifier of the page this instance was extracted from
     */
    String getPage();

    /**
     * Merges another page of the same object into this one.
     *
     * @param pagedModel the page to merge in
     * @return the combined model
     */
    PagedModel combine(PagedModel pagedModel);
}
......@@ -34,6 +34,9 @@ public class FileDownloader implements Downloader {
}
public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
}
......
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
/**
 * Implement this interface to run post-processing after extraction.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 9:42 AM <br>
 */
public interface AfterExtractor {

    /**
     * Post-processing hook.
     *
     * @param page the page being processed
     */
    void afterProcess(Page page);
}
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
/**
 * PageModelPipeline that prints every object it receives to standard output.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 3:41 PM <br>
 */
public class ConsolePageModelPipeline implements PageModelPipeline {

    /**
     * Prints a reflective string representation of the object.
     *
     * @param o    the extracted object
     * @param task the task the object belongs to (unused)
     */
    @Override
    public void process(Object o, Task task) {
        String dump = ToStringBuilder.reflectionToString(o);
        System.out.println(dump);
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
/**
 * Holds one extraction rule: the selector to apply, the source it is applied to,
 * and the notNull / multi flags.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class Extractor {

    /** What the selector is applied to. */
    enum Source {Html, Url, RawHtml}

    protected final Source source;

    protected final boolean notNull;

    protected final boolean multi;

    /** The only mutable part of the rule; see {@link #setSelector(Selector)}. */
    protected Selector selector;

    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
        this.multi = multi;
        this.notNull = notNull;
        this.source = source;
        this.selector = selector;
    }

    /** @return the selector of this rule */
    Selector getSelector() {
        return this.selector;
    }

    /** Replaces the selector of this rule. */
    void setSelector(Selector selector) {
        this.selector = selector;
    }

    /** @return the source the selector is applied to */
    Source getSource() {
        return this.source;
    }

    /** @return the notNull flag passed at construction */
    boolean isNotNull() {
        return this.notNull;
    }

    /** @return the multi flag passed at construction */
    boolean isMulti() {
        return this.multi;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
/**
 * Extraction rule bound to a field, together with an optional setter method for it.
 * Selector, source and the notNull / multi flags are inherited from {@link Extractor}.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class FieldExtractor extends Extractor {

    /** The field this rule is bound to. */
    private final Field field;

    /** Setter for the field; null until {@link #setSetterMethod(Method)} is called. */
    private Method setterMethod;

    public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
        super(selector, source, notNull, multi);
        this.field = field;
    }

    /** @return the field this rule is bound to */
    Field getField() {
        return field;
    }

    // getSelector(), getSource() and isNotNull() are inherited unchanged from Extractor;
    // the former copies here were identical duplicates.

    void setSetterMethod(Method setterMethod) {
        this.setterMethod = setterMethod;
    }

    /** @return the setter method, or null when none has been set */
    Method getSetterMethod() {
        return setterMethod;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extension point built on PageProcessor.<br>
 * Runs every registered {@link PageModelExtractor} against each page, stores the
 * extracted objects in the page's result items and queues matching links.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:46 PM <br>
 */
class ModelPageProcessor implements PageProcessor {

    private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();

    private Site site;

    /** Union of the target and help url patterns of every registered page model. */
    private Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();

    /**
     * Creates a processor for the given site and page model classes.
     *
     * @param site   the site configuration returned by {@link #getSite()}
     * @param clazzs the page model classes to register
     * @return the new processor
     */
    public static ModelPageProcessor create(Site site, Class... clazzs) {
        ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
        for (Class clazz : clazzs) {
            modelPageProcessor.addPageModel(clazz);
        }
        return modelPageProcessor;
    }

    /**
     * Registers one more page model class.
     *
     * @param clazz the page model class
     * @return this
     */
    public ModelPageProcessor addPageModel(Class clazz) {
        PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
        targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
        targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
        pageModelExtractorList.add(pageModelExtractor);
        return this;
    }

    private ModelPageProcessor(Site site) {
        this.site = site;
    }

    /**
     * Extracts links and model objects from the page with every registered extractor.
     * The page is marked as skipped only when no extractor produced a result, so that
     * one non-matching page model does not discard the results of the others.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        boolean extractedAny = false;
        for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
            extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
            extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
            Object process = pageModelExtractor.process(page);
            if (process == null || (process instanceof List && ((List) process).isEmpty())) {
                // this model does not apply to the page; other models may still match
                continue;
            }
            extractedAny = true;
            postProcessPageModel(pageModelExtractor.getClazz(), process);
            page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
        }
        if (!extractedAny) {
            page.getResultItems().setSkip(true);
        }
    }

    /**
     * Queues every link that matches one of the given patterns.
     *
     * @param page              the page to take links from and add requests to
     * @param urlRegionSelector restricts where links are taken from; null means all links of the page
     * @param urlPatterns       patterns a link must match to be queued
     */
    private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
        List<String> links;
        if (urlRegionSelector == null) {
            links = page.getHtml().links().all();
        } else {
            links = urlRegionSelector.selectList(page.getHtml().toString());
        }
        if (links == null) {
            // the selector found nothing in the region
            return;
        }
        for (String link : links) {
            for (Pattern targetUrlPattern : urlPatterns) {
                Matcher matcher = targetUrlPattern.matcher(link);
                if (matcher.find()) {
                    // assumes the patterns supplied by PageModelExtractor capture the url
                    // in group 1 - TODO confirm against PageModelExtractor
                    page.addTargetRequest(new Request(matcher.group(1)));
                }
            }
        }
    }

    /**
     * Hook for subclasses, called for every non-empty extraction result before it is
     * stored in the page. The default implementation does nothing.
     *
     * @param clazz  the page model class
     * @param object the extracted object (or list of objects)
     */
    protected void postProcessPageModel(Class clazz, Object object) {
    }

    @Override
    public Site getSite() {
        return site;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline-based extension point used to implement annotation-style pipelines.<br>
 * It has a one-to-many relationship with PageModelPipeline (the author apologizes
 * for not having a better name).<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 10:47 AM <br>
 */
class ModelPipeline implements Pipeline {

    /** Page model class -> pipeline that receives the objects extracted for that class. */
    private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();

    public ModelPipeline() {
    }

    /**
     * Registers the pipeline that handles objects of the given page model class.
     *
     * @return this
     */
    public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
        pageModelPipelines.put(clazz, pageModelPipeline);
        return this;
    }

    /**
     * Looks up the result stored under each registered class's canonical name and hands
     * it to that class's pipeline. When the class is annotated with a multi ExtractBy,
     * the result is treated as a list and every element is dispatched separately.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
            Object extracted = resultItems.get(registration.getKey().getCanonicalName());
            if (extracted == null) {
                continue;
            }
            Annotation annotation = registration.getKey().getAnnotation(ExtractBy.class);
            boolean multi = annotation != null && ((ExtractBy) annotation).multi();
            if (multi) {
                List<Object> items = (List<Object>) extracted;
                for (Object item : items) {
                    registration.getValue().process(item, task);
                }
            } else {
                registration.getValue().process(extracted, task);
            }
        }
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/**
 * Model-based Spider; the wrapped entry-point class.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 9:51 AM <br>
 */
public class OOSpider extends Spider {

    private ModelPageProcessor modelPageProcessor;

    private ModelPipeline modelPipeline;

    protected OOSpider(ModelPageProcessor modelPageProcessor) {
        super(modelPageProcessor);
        this.modelPageProcessor = modelPageProcessor;
    }

    /**
     * Creates a spider.<br>
     *
     * @param site              the site configuration
     * @param pageModelPipeline pipeline that receives the extracted objects of every page model;
     *                          may be null, in which case no pipeline is registered for them
     * @param pageModels        the page model classes to extract
     */
    public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        this(ModelPageProcessor.create(site, pageModels));
        this.modelPipeline = new ModelPipeline();
        super.pipeline(modelPipeline);
        if (pageModelPipeline != null) {
            for (Class pageModel : pageModels) {
                this.modelPipeline.put(pageModel, pageModelPipeline);
            }
        }
    }

    /**
     * Creates a spider without a page model pipeline.
     */
    public static OOSpider create(Site site, Class... pageModels) {
        return new OOSpider(site, null, pageModels);
    }

    /**
     * Creates a spider whose page models all feed the given pipeline.
     */
    public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        return new OOSpider(site, pageModelPipeline, pageModels);
    }

    /**
     * Registers additional page models.
     *
     * @param pageModelPipeline pipeline for the new page models; may be null, in which case
     *                          the models are extracted but no pipeline is registered for them
     * @param pageModels        the page model classes to add
     * @return this
     */
    public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
        for (Class pageModel : pageModels) {
            modelPageProcessor.addPageModel(pageModel);
            // the pipeline registry rejects null values, so mirror the constructor's null guard
            if (pageModelPipeline != null) {
                modelPipeline.put(pageModel, pageModelPipeline);
            }
        }
        return this;
    }
}
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Task;
/**
 * Receives the objects extracted for a page model.
 *
 * @param <T> the type of object this pipeline handles
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 9:34 AM <br>
 */
public interface PageModelPipeline<T> {

    /**
     * Handles one extracted object.
     *
     * @param t    the extracted object
     * @param task the task the object belongs to
     */
    void process(T t, Task task);
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines the extraction rule of a class or a field.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractBy {
/**
* The extraction rule.
*
* @return the extraction rule
*/
String value();
/**
* Kinds of extraction rule that can be declared.
*/
public enum Type {XPath, Regex, Css}
/**
* Type of the extraction rule. XPath, CSS selector and regular expression are supported; the default is XPath.
*
* @return the type of the extraction rule
*/
Type type() default Type.XPath;
/**
* Whether this is a key field that must not be empty. If notNull is true and the field cannot be extracted,
* the whole object is discarded. The default is false.
*
* @return whether this is a key field that must not be empty
*/
boolean notNull() default false;
/**
* Whether to extract multiple results.<br>
* When used on a field, a List&lt;String&gt; is needed to hold the results.<br>
* When used on a class, it means multiple objects are extracted from a single page.<br>
*
* @return whether to extract multiple results
*/
boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines an extraction rule of a field; per the original author it can only be used after Extract or ExtractByRaw.<br>
* NOTE(review): "Extract" presumably refers to the ExtractBy annotation - confirm.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy2 {
/**
* The extraction rule.
*/
String value();
/**
* Kinds of extraction rule that can be declared.
*/
public enum Type {XPath, Regex, Css}
/**
* Type of the extraction rule; the default is XPath.
*/
Type type() default Type.XPath;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines an extraction rule of a field; per the original author it can only be used after Extract or ExtractByRaw.<br>
* NOTE(review): "Extract" presumably refers to the ExtractBy annotation - confirm.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy3 {
/**
* The extraction rule.
*/
String value();
/**
* Kinds of extraction rule that can be declared.
*/
public enum Type { XPath, Regex, Css}
/**
* Type of the extraction rule; the default is XPath.
*/
Type type() default Type.XPath;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* For classes that already use ExtractBy at class level, this annotation can be used on a field
* that should be extracted from the whole content instead.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractByRaw {
/**
* The extraction rule.
*
* @return the extraction rule
*/
String value();
/**
* Kinds of extraction rule that can be declared.
*/
public enum Type {XPath, Regex, Css}
/**
* Type of the extraction rule. XPath, CSS selector and regular expression are supported; the default is XPath.
*
* @return the type of the extraction rule
*/
Type type() default Type.XPath;
/**
* Whether this is a key field that must not be empty. If notNull is true and the field cannot be extracted,
* the whole object is discarded. The default is false.
*
* @return whether this is a key field that must not be empty
*/
boolean notNull() default false;
/**
* Whether to extract multiple results.<br>
* A List&lt;String&gt; is needed to hold the results.<br>
*
* @return whether to extract multiple results
*/
boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines the extraction rule of a class or a field (extracted from the url; only regular expressions are supported).<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractByUrl{
/**
* The extraction rule, a regular expression. Defaults to the empty string.
*
* @return the extraction rule
*/
String value() default "";
/**
* Whether this is a key field that must not be empty. If notNull is true and the field cannot be extracted,
* the whole object is discarded. The default is false.
*
* @return whether this is a key field that must not be empty
*/
boolean notNull() default false;
/**
* Whether to extract multiple results.<br>
* When used on a field, a List&lt;String&gt; is needed to hold the results.<br>
* When used on a class, it means multiple objects are extracted from a single page.<br>
*
* @return whether to extract multiple results
*/
boolean multi() default false;
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines the urls that help crawling.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface HelpUrl {
/**
* List of url rules for a class.<br>
* webmagic modifies the regular expression syntax: "." only stands for the literal character "." instead of
* any character, and "\*" stands for ".\*". For example, "http://\*.oschina.net/\*" stands for the urls under
* all second-level domains of oschina.<br>
*
* @return the url rules
*/
String[] value();
/**
* Region the urls are extracted from (only XPath is supported). Defaults to the empty string.
*
* @return the region the urls are extracted from
*/
String sourceRegion() default "";
}
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Defines the scope and source of extraction for a class; sourceRegion can restrict the extraction region
* using XPath syntax.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 PM <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface TargetUrl {
/**
* List of url rules for a class.<br>
* webmagic modifies the regular expression syntax: "." only stands for the literal character "." instead of
* any character, and "\*" stands for ".\*". For example, "http://\*.oschina.net/\*" stands for the urls under
* all second-level domains of oschina.<br>
*
* @return the url rules
*/
String[] value();
/**
* Region the urls are extracted from (only XPath is supported). Defaults to the empty string.
*
* @return the region the urls are extracted from
*/
String sourceRegion() default "";
}
<html>
<body>
webmagic注解抓取方式所定义的注解。
</body>
</html>
<html>
<body>
webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。
</body>
</html>
package us.codecraft.webmagic.pipeline;
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
......@@ -13,48 +12,50 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
* JSON格式持久化到文件的接口。
*
* @author code4crafter@gmail.com <br>
* Date: 13-6-8
* Time: 下午9:00
* Date: 13-4-21
* Time: 下午6:28
*/
public class FreemarkerPipeline implements Pipeline {
public class JsonFilePipeline implements Pipeline {
private Configuration configuration;
private String path = "/data/webmagic/";
private Template template;
private Logger logger = Logger.getLogger(getClass());
private String path = "/data/temp/webmagic/ftl/";
/**
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
*/
public JsonFilePipeline() {
public FreemarkerPipeline(String template, String path) throws IOException {
configuration = new Configuration();
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
this.template = configuration.getTemplate(template);
this.path = path;
new File(path);
}
public FreemarkerPipeline(String template) throws IOException {
this(template, "/data/temp/webmagic/ftl/");
/**
* 新建一个FilePipeline
*
* @param path 文件保存路径
*/
public JsonFilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
}
@Override
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
String path = this.path + "" + task.getUUID() + "/";
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
template.process(resultItems.getAll(), printWriter);
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
printWriter.write(JSON.toJSONString(resultItems.getAll()));
printWriter.close();
} catch (TemplateException e) {
} catch (IOException e) {
e.printStackTrace();
logger.warn("write file error", e);
}
}
}
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline that implements pagination: it holds back {@link PagedModel} results until all
 * pages of the same object have been seen, then replaces the result with the combined model.<br>
 * Do not use this feature when running a distributed crawler with redis.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 5:15 PM <br>
 */
public class PagedPipeline implements Pipeline {

    /** page key -> page -> whether that page has been processed already. */
    private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    /** page key -> page -> the model extracted from that page. */
    private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    /**
     * Inspects every result item; paged models are either removed (pages still missing)
     * or replaced by the combination of all their pages.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> resultItemsAll = resultItems.getAll();
        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
        while (iterator.hasNext()) {
            handleObject(iterator);
        }
    }

    /**
     * Handles the next entry of the iterator. Entries that are not a {@link PagedModel}
     * are left untouched.
     *
     * @param iterator iterator over the result items, positioned before the entry to handle
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (o instanceof PagedModel) {
            PagedModel pagedModel = (PagedModel) o;
            // mark the current page as processed
            pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
            if (pagedModel.getOtherPages() != null) {
                for (String otherPage : pagedModel.getOtherPages()) {
                    Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
                    // register pages not seen yet as pending, without resetting processed ones
                    if (aBoolean == null) {
                        pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
                    }
                }
            }
            //check if all pages are processed
            Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
            objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
            if (booleanMap == null) {
                return;
            }
            for (Map.Entry<String, Boolean> stringBooleanEntry : booleanMap.entrySet()) {
                if (!stringBooleanEntry.getValue()) {
                    // at least one page is still pending: hold this result back
                    iterator.remove();
                    return;
                }
            }
            List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
            entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
            if (entryList.size() != 0) {
                // order pages numerically when both page ids are integers, lexicographically otherwise
                Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
                    @Override
                    public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                        try {
                            int i1 = Integer.parseInt(o1.getKey());
                            int i2 = Integer.parseInt(o2.getKey());
                            // explicit comparison instead of i1 - i2, which can overflow
                            return i1 < i2 ? -1 : (i1 == i2 ? 0 : 1);
                        } catch (NumberFormatException e) {
                            return o1.getKey().compareTo(o2.getKey());
                        }
                    }
                });
                PagedModel value = entryList.get(0).getValue();
                for (int i = 1; i < entryList.size(); i++) {
                    value = value.combine(entryList.get(i).getValue());
                }
                objectEntry.setValue(value);
            }
        }
    }
}
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
......@@ -46,6 +46,9 @@ public class FileCacheQueueScheduler implements Scheduler {
private Set<String> urls;
/**
 * Stores the cache directory, appending "/" when the given path ends with neither
 * "/" nor "\".
 *
 * @param filePath directory path used by this scheduler
 */
public FileCacheQueueScheduler(String filePath) {
    boolean endsWithSeparator = filePath.endsWith("/") || filePath.endsWith("\\");
    this.filePath = endsWithSeparator ? filePath : filePath + "/";
}
......
package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.schedular.Scheduler;
/**
* 使用redis管理url,构建一个分布式的爬虫。<br>
*
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 上午7:07 <br>
*/
public class RedisScheduler implements Scheduler {
......@@ -22,6 +23,8 @@ public class RedisScheduler implements Scheduler {
private static final String SET_PREFIX = "set_";
private static final String ITEM_PREFIX = "item_";
/**
 * Creates the scheduler and its Jedis connection pool, built from a default
 * {@link JedisPoolConfig} and the given host.
 *
 * @param host host argument handed to the {@link JedisPool} constructor
 */
public RedisScheduler(String host) {
pool = new JedisPool(new JedisPoolConfig(), host);
}
......@@ -33,7 +36,12 @@ public class RedisScheduler implements Scheduler {
if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
//使用List保存队列
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
if (request.getExtras() != null) {
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
byte[] bytes = JSON.toJSONString(request).getBytes();
jedis.set(key.getBytes(), bytes);
}
}
pool.returnResource(jedis);
}
......@@ -42,10 +50,16 @@ public class RedisScheduler implements Scheduler {
public synchronized Request poll(Task task) {
Jedis jedis = pool.getResource();
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
pool.returnResource(jedis);
if (url==null){
if (url == null) {
return null;
}
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
byte[] bytes = jedis.get(key.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes),Request.class);
return o;
}
pool.returnResource(jedis);
return new Request(url);
}
}
package us.codecraft.webmagic.utils;
import java.util.Map;
/**
 * Map addressed by two keys, stored as a map of sub-maps ({@code K1 -> (K2 -> V)}).
 * Sub-maps are created through {@link MultiKeyMapBase#newMap()}.
 *
 * @author code4crafter@gmail.com
 * Date Dec 14, 2012
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {

    private Map<K1, Map<K2, V>> map;

    /** Creates an empty map using the default sub-map implementation. */
    public DoubleKeyMap() {
        init();
    }

    /**
     * Wraps an existing two-level map, using the default sub-map implementation for new levels.
     *
     * @param map backing map; a fresh one is created when this is {@code null}
     */
    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    /**
     * Creates an empty map whose levels are instances of the given map class.
     *
     * @param protoMapClass map implementation to instantiate for every level
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        init();
    }

    /** Creates the backing map when none has been supplied. */
    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * Wraps an existing two-level map and uses the given map class for new levels.
     *
     * @param map           backing map; a fresh one is created when this is {@code null}
     * @param protoMapClass map implementation to instantiate for new levels
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /**
     * @param key first-level key
     * @return the sub-map stored under the key, or {@code null} when there is none
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the value stored under both keys, or {@code null} when absent
     */
    public V get(K1 key1, K2 key2) {
        // look the sub-map up once so it cannot disappear between check and use
        Map<K2, V> submap = map.get(key1);
        if (submap == null) {
            return null;
        }
        return submap.get(key2);
    }

    /**
     * Stores a whole sub-map under the first-level key, replacing any existing one.
     * (This method used to call itself and never terminated.)
     *
     * @param key1   first-level key
     * @param submap sub-map to store
     * @return always {@code null}; the declared return type cannot carry a replaced sub-map
     */
    public V put(K1 key1, Map<K2, V> submap) {
        map.put(key1, submap);
        return null;
    }

    /**
     * Stores a value under both keys, creating the sub-map on demand.
     *
     * @param key1  first-level key
     * @param key2  second-level key
     * @param value value to store
     * @return the value previously stored under both keys, as reported by the sub-map
     */
    public V put(K1 key1, K2 key2, V value) {
        Map<K2, V> submap = map.get(key1);
        if (submap == null) {
            submap = this.<K2, V>newMap();
            map.put(key1, submap);
        }
        return submap.put(key2, value);
    }

    /**
     * Removes the value stored under both keys.
     *
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the removed value, or {@code null} when nothing was stored
     */
    public V remove(K1 key1, K2 key2) {
        Map<K2, V> submap = map.get(key1);
        if (submap == null) {
            return null;
        }
        V removed = submap.remove(key2);
        // once the sub-map is empty, drop it from the first level as well
        if (submap.size() == 0) {
            remove(key1);
        }
        return removed;
    }

    /**
     * Removes the whole sub-map stored under the first-level key.
     *
     * @param key1 first-level key
     * @return the removed sub-map, or {@code null} when there was none
     */
    public Map<K2, V> remove(K1 key1) {
        return map.remove(key1);
    }
}
package us.codecraft.webmagic.utils;
/**
* @author code4crafter@gmail.com
* Date Dec 14, 2012
*/
import java.util.HashMap;
import java.util.Map;
/**
 * Base class for multi-key maps: remembers which {@link Map} implementation to
 * instantiate for every level of the nested structure.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    /** Map implementation used when no other class is supplied. */
    @SuppressWarnings("rawtypes")
    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    @SuppressWarnings("rawtypes")
    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;

    /** Uses {@link #DEFAULT_CLAZZ} for new maps. */
    public MultiKeyMapBase() {
    }

    /**
     * @param protoMapClass map implementation to instantiate in {@link #newMap()}
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Creates a new, empty map of the configured implementation through its no-arg constructor.
     *
     * @return a fresh map instance
     * @throws IllegalArgumentException when the configured class cannot be instantiated;
     *                                  the reflective failure is attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        }
    }
}
\ No newline at end of file
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-25 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 上午7:51 <br>
*/
public class RedisSchedulerTest {
......@@ -35,8 +35,11 @@ public class RedisSchedulerTest {
return null;
}
};
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
request.putExtra("1","2");
redisScheduler.push(request, task);
Request poll = redisScheduler.poll(task);
System.out.println(poll);
}
}
webmagic-lucene
--------
尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic</artifactId>
<groupId>us.codecraft</groupId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-lucene</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.4.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic.pipeline;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Pipeline that indexes every result into an in-memory Lucene index and offers a
 * simple query method on top of it.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-5 <br>
 * Time: 2:11 PM <br>
 */
public class LucenePipeline implements Pipeline {

    private Directory directory;

    private Analyzer analyzer;

    /** Sets up the analyzer and the in-memory directory shared by indexing and searching. */
    private void init() throws IOException {
        analyzer = new StandardAnalyzer(Version.LUCENE_44);
        directory = new RAMDirectory();
    }

    public LucenePipeline() {
        try {
            init();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a query-parser query against one field and returns the stored documents of
     * the hits (at most 1000).
     *
     * @param fieldName default field for the query parser
     * @param value     query string handed to the query parser
     * @return documents of all hits, possibly empty
     * @throws IOException    when the index cannot be opened or read
     * @throws ParseException when the query string cannot be parsed
     */
    public List<Document> search(String fieldName, String value) throws IOException, ParseException {
        List<Document> documents = new ArrayList<Document>();
        DirectoryReader ireader = DirectoryReader.open(directory);
        try {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
            Query query = parser.parse(value);
            ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                documents.add(isearcher.doc(hits[i].doc));
            }
        } finally {
            // close the reader even when parsing or searching fails
            ireader.close();
        }
        return documents;
    }

    /**
     * Adds one document to the index, with a stored text field per result item.
     * Skipped results and results without items are ignored, as are items with a null value.
     *
     * @param resultItems extracted items of one page
     * @param task        the running task (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return;
        }
        Map<String, Object> all = resultItems.getAll();
        if (all == null) {
            return;
        }
        Document doc = new Document();
        for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
            Object fieldValue = objectEntry.getValue();
            if (fieldValue == null) {
                // nothing to index for this key; calling toString() would throw
                continue;
            }
            doc.add(new Field(objectEntry.getKey(), fieldValue.toString(), TextField.TYPE_STORED));
        }
        try {
            // a fresh config per writer: an IndexWriterConfig instance must not be shared
            // between IndexWriter instances
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_44, analyzer));
            try {
                indexWriter.addDocument(doc);
            } finally {
                // releases the write lock even when adding the document fails
                indexWriter.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package us.codecraft.webmagic.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.LucenePipeline;
import java.io.IOException;
import java.util.List;
/**
 * Demo page model for oschina blog posts that feeds a {@link LucenePipeline} and
 * periodically queries the resulting index.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    @ExtractBy("//title")
    private String title;

    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "OschinaBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    /**
     * Starts the spider asynchronously, then searches the index for "webmagic" in the
     * title field every three seconds and prints the hits, until the thread is interrupted.
     */
    public static void main(String[] args) {
        LucenePipeline pipeline = new LucenePipeline();
        OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
        while (true) {
            try {
                List<Document> search = pipeline.search("title", "webmagic");
                System.out.println(search);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            }
            // sleep outside the search try-block so a failing search cannot turn into a busy loop
            try {
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                // keep the interrupt flag and stop polling
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
webmagic-plugin
-------
webmagic的插件模块。
目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。
另外有一个使用Selenium来动态渲染页面的模块在开发中。
\ No newline at end of file
<item>
<title>$it.Title</title>
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
<description></description>
<content:encoded><![CDATA[${text}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
$tags
</item>
\ No newline at end of file
package us.codecraft.webmagic;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import java.io.IOException;
/**
 * Smoke test for constructing a {@link FreemarkerPipeline}.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-9
 * Time: 7:14 AM
 */
public class FreemarkerPipelineTest {
/**
 * Constructs a pipeline for the "wordpress.ftl" template; the test only fails when the
 * constructor throws.
 */
@Test
public void testTemplateLoad() throws IOException {
new FreemarkerPipeline("wordpress.ftl");
}
}
webmagic-selenium
-------
尝试使用selenium来进行页面动态渲染,开发中。
\ No newline at end of file
webmagic-samples
-------
webmagic的一些示例。包括抓取常见博客、信息类网站等。
\ No newline at end of file
......@@ -5,7 +5,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......@@ -19,12 +19,7 @@
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-misc</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
......@@ -33,4 +28,23 @@
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>./lib/</classpathPrefix>
<mainClass>us.codecraft.webmagic.main.QuickStarter</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package us.codecraft.webmagic.main;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.samples.IteyeBlog;
import us.codecraft.webmagic.model.samples.News163;
import us.codecraft.webmagic.model.samples.OschinaBlog;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
/**
 * Console launcher for the bundled demos: asks which sample to run, starts the matching
 * spider and stops the JVM after 20 seconds.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-7 <br>
 * Time: 9:24 PM <br>
 */
public class QuickStarter {

    /** menu key -> page model class of the demo. */
    private static Map<String, Class> clazzMap;

    /** menu key -> start url of the demo. */
    private static Map<String, String> urlMap;

    /** Fills the menu; both maps share the same keys. */
    private static void init() {
        clazzMap = new LinkedHashMap<String, Class>();
        clazzMap.put("1", OschinaBlog.class);
        clazzMap.put("2", IteyeBlog.class);
        clazzMap.put("3", News163.class);
        urlMap = new LinkedHashMap<String, String>();
        urlMap.put("1", "http://my.oschina.net/flashsword/blog");
        urlMap.put("2", "http://flashsword20.iteye.com/");
        urlMap.put("3", "http://news.163.com/");
    }

    public static void main(String[] args) {
        init();
        String key = readKey(null);
        System.out.println("The demo started and will last 20 seconds...");
        //Start spider
        OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync();

        try {
            Thread.sleep(20000);
        } catch (InterruptedException e) {
            // interrupted early: keep the flag and fall through to the shutdown below
            Thread.currentThread().interrupt();
        }
        System.out.println("The demo stopped!");
        System.out.println("To more usage, try to customize your own Spider!");
        System.exit(0);
    }

    /**
     * Prints the menu and reads lines from standard input until one of them is a valid menu key.
     * Surrounding whitespace of the typed line is ignored.
     *
     * @param key preselected key; when non-null it is returned without prompting for input
     * @return a key present in the menu (or the preselected key)
     */
    private static String readKey(String key) {
        Scanner stdin = new Scanner(System.in);
        System.out.println("Choose a Spider demo:");
        for (Map.Entry<String, Class> classEntry : clazzMap.entrySet()) {
            System.out.println(classEntry.getKey() + "\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
        }
        while (key == null) {
            key = stdin.nextLine().trim();
            if (clazzMap.get(key) == null) {
                System.out.println("Invalid choice!");
                key = null;
            }
        }
        return key;
    }
}
package us.codecraft.webmagic.model.samples;
/**
 * Read-only view shared by the blog sample models: a title and a content string.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 8:10 AM <br>
 */
public interface Blog {

    /**
     * @return the title
     */
    String getTitle();

    /**
     * @return the content
     */
    String getContent();

}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Sample page model for iteye blog posts; title and content are filled from the
 * annotated expressions.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://*.iteye.com/blog/*")
public class IteyeBlog implements Blog {

    @ExtractBy("//title")
    private String title;

    @ExtractBy(value = "div#blog_content", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "IteyeBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    public static void main(String[] args) {
        // the start url has to be a concrete address; the wildcard form is only meaningful
        // as a @TargetUrl pattern and cannot be downloaded
        OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy2;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.Collection;
import java.util.List;
/**
 * Sample paged model for news.163.com articles. Page key and page number are taken from
 * the url, the remaining fields from the annotated expressions.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 8:17 PM <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel {

    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false)
    @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    @Override
    public String getPageKey() {
        return pageKey;
    }

    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /**
     * @return the extracted page, or "1" when no page was extracted
     */
    @Override
    public String getPage() {
        return page != null ? page : "1";
    }

    /**
     * Builds a new model carrying this model's title and the content of this model
     * followed by the content of the given one.
     *
     * @param pagedModel the following page; must be a {@code News163}
     * @return the merged model
     */
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 next = (News163) pagedModel;
        News163 merged = new News163();
        merged.title = title;
        merged.content = content + next.content;
        return merged;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News163{");
        text.append("content='").append(content).append('\'');
        text.append(", title='").append(title).append('\'');
        text.append(", otherPage=").append(otherPage);
        text.append('}');
        return text.toString();
    }

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
                .scheduler(new RedisScheduler("localhost"))
                .clearPipeline()
                .pipeline(new PagedPipeline())
                .pipeline(new ConsolePipeline())
                .run();
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Sample page model for the answers listed on an oschina question page.
 * <p>
 * The class-level {@code @ExtractBy} selects the {@code li} elements with class 'Answer'
 * inside the {@code ul} with class 'list' ({@code multi = true}); the field expressions
 * below name the user and content parts. How the annotations are evaluated is defined by
 * the webmagic model classes, which are not part of this file.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 8:25 PM <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor{
// filled from the title attribute of an img element
@ExtractBy("//img/@title")
private String user;
// filled from the div with class 'detail'
@ExtractBy("//div[@class='detail']")
private String content;
/** Runs the sample against a single question page. */
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
}
/** Hook required by {@link AfterExtractor}; this sample does nothing in it. */
@Override
public void afterProcess(Page page) {
}
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
 * Sample page model for oschina blog posts with title, content and tags, each described
 * by the extraction expression on the field.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {
@ExtractBy("//title")
private String title;
// CSS selector instead of the default expression type
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content;
// multi = true: one list element per matched tag link text
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
/** Runs the sample with a {@link ConsolePageModelPipeline} passed to {@code OOSpider.create}. */
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new ConsolePageModelPipeline(), OschinaBlog.class).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
 * Sample processor that follows every link it finds, started from a few portal pages.
 *
 * Author yihua.huang@dianping.com
 * Date: 13-6-24
 * Time: 2:12 PM
 */
public class GlobalProcessor implements PageProcessor {

    private Site site;

    /** Queues every link of the page as a target request. */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().all());
    }

    /** Builds the site configuration on first use and returns the same instance afterwards. */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = Site.me().setDomain("www.2345.com").setSleepTime(0)
                .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
                .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
        return site;
    }

    /**
     * Starts two spiders with identical settings: the first asynchronously, the second
     * on the calling thread.
     */
    public static void main(String[] args) {
        // first spider: started in the background
        Spider.create(new GlobalProcessor())
                .thread(10)
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/webmagic/test/"))
                .runAsync();
        // second spider: runs on this thread
        Spider.create(new GlobalProcessor())
                .thread(10)
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/webmagic/test/"))
                .run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/**
 * Sample launcher that crawls www.guoxue123.cn with a {@link SimplePageProcessor}.
 *
 * @author yihua.huang@dianping.com <br>
 * Date: 13-7-14 <br>
 * Time: 8:33 AM <br>
 */
public class GuoxueProcessor {
/**
 * Configures the processor (GBK charset, sleep time 500) and runs a spider that uses a
 * {@link FilePipeline} and a {@link FileCacheQueueScheduler}, both given "/data/webmagic/".
 */
public static void main(String[] args) {
SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
}
}
......@@ -7,8 +7,8 @@ import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 上午7:31 <br>
*/
public class IteyeBlogProcessor implements PageProcessor {
......
......@@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -13,18 +15,24 @@ import java.util.List;
*/
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
@Override
public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent());
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
}
}
......@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
/**
 * Manual (ignored) test that runs the {@link DiandianBlogProcessor} with a console and a
 * freemarker pipeline.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-9
 * Time: 8:02 AM
 */
public class DiandianProcessorTest {
@Ignore
@Test
public void test() throws IOException {
DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor();
// a pipeline handles the results after a page has been crawled
// ftl files go under the classpath:ftl/ folder
// output goes to /data/temp/webmagic/ftl/[domain] by default
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
// Spider.me() is shorthand that simply creates a new instance
// (note: the call below uses Spider.create)
// Spider.pipeline() sets a pipeline and supports chained calls
// ConsolePipeline prints the results to the console
// FileCacheQueueSchedular stores urls and supports resuming an interrupted crawl;
// its temp files are written to /data/temp/webmagic/cache
// Spider.run() starts the crawl
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
}
......@@ -4,9 +4,9 @@ import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......@@ -21,7 +21,7 @@ public class DiaoyuwengProcessorTest {
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
......
......@@ -4,9 +4,9 @@ import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......@@ -22,9 +22,8 @@ public class SinablogProcessorTest {
public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
//pipeline是抓取结束后的处理
//ftl文件放到classpath:ftl/文件夹下
//默认放到/data/temp/webmagic/ftl/[domain]目录下
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
//默认放到/data/webmagic/ftl/[domain]目录下
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
//Spider.me()是简化写法,其实就是new一个啦
//Spider.pipeline()设定一个pipeline,支持链式调用
//ConsolePipeline输出结果到控制台
......
webmagic-extension
-------
webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。
\ No newline at end of file
......@@ -5,16 +5,11 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</parent>
<packaging>pom</packaging>
<modelVersion>4.0.0</modelVersion>
<modules>
<module>webmagic-misc</module>
<module>webmagic-selenium</module>
</modules>
<artifactId>webmagic-plugin</artifactId>
<artifactId>webmagic-saxon</artifactId>
<dependencies>
<dependency>
......@@ -22,6 +17,10 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
......
package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.apache.log4j.Logger;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午9:39
*/
public class Xpath2Selector implements Selector {
private String xpathStr;
private XPathExpression xPathExpression;
private Logger logger = Logger.getLogger(getClass());
/**
 * Stores the expression and compiles it right away.
 *
 * @param xpathStr XPath expression to compile
 * @throws IllegalArgumentException when compiling the expression fails; the
 *                                  XPathExpressionException is attached as the cause
 */
public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;
try {
init();
} catch (XPathExpressionException e) {
throw new IllegalArgumentException("XPath error!", e);
}
}
enum XPath2NamespaceContext implements NamespaceContext {
INSTANCE;
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
private void put(String prefix, String namespaceURI) {
prefix2NamespaceMap.put(prefix, namespaceURI);
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
if (prefixes == null) {
prefixes = new ArrayList<String>();
namespace2PrefixMap.put(namespaceURI, prefixes);
}
prefixes.add(prefix);
}
private XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT);
}
@Override
public String getNamespaceURI(String prefix) {
return prefix2NamespaceMap.get(prefix);
}
@Override
public String getPrefix(String namespaceURI) {
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
if (prefixes == null || prefixes.size() < 1) {
return null;
}
return prefixes.get(0);
}
@Override
public Iterator getPrefixes(String namespaceURI) {
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
if (prefixes == null || prefixes.size() < 1) {
return null;
}
return prefixes.iterator();
}
}
private void init() throws XPathExpressionException {
XPathEvaluator xPathEvaluator = new XPathEvaluator();
xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
xPathExpression = xPathEvaluator.compile(xpathStr);
}
@Override
public String select(String text) {
try {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
Object result;
try {
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
} catch (XPathExpressionException e) {
result = xPathExpression.evaluate(document, XPathConstants.STRING);
}
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
if (nodeList.getLength() == 0) {
return null;
}
Node item = nodeList.item(0);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
return item.getTextContent();
} else {
StreamResult xmlOutput = new StreamResult(new StringWriter());
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.transform(new DOMSource(item), xmlOutput);
return xmlOutput.getWriter().toString();
}
}
return result.toString();
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
@Override
public List<String> selectList(String text) {
List<String> results = new ArrayList<String>();
try {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
Object result;
try {
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
} catch (XPathExpressionException e) {
result = xPathExpression.evaluate(document, XPathConstants.STRING);
}
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
Transformer transformer = TransformerFactory.newInstance().newTransformer();
StreamResult xmlOutput = new StreamResult();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
for (int i = 0; i < nodeList.getLength(); i++) {
Node item = nodeList.item(i);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
results.add(item.getTextContent());
} else {
xmlOutput.setWriter(new StringWriter());
transformer.transform(new DOMSource(item), xmlOutput);
results.add(xmlOutput.getWriter().toString());
}
}
} else {
results.add(result.toString());
}
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return results;
}
}
package us.codecraft.webmagic.selector;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
/**
......@@ -1168,7 +1169,7 @@ public class XpathSelectorTest {
+ " var location = window.location;\n"
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
+ " pre.writeAttribute('codeable_id', post_id);\n"
+ " pre.writeAttribute('codeable_type', \"Blog\");\n"
+ " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
+ " pre.writeAttribute('source_url', source_url);\n"
+ " pre.writeAttribute('pre_index', index);\n"
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"
......@@ -1354,4 +1355,41 @@ public class XpathSelectorTest {
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
}
@Test
public void testXPath2() {
    // Headline fragment: an <h1> whose text is followed by a <br> and a nested <span>.
    String headlineHtml = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
            "<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
    // Print what the XPath 1.0 selector extracts for the direct text of the <h1>.
    String headlineText = new XpathSelector("//h1/text()").select(headlineHtml);
    System.out.println(headlineText);
}
@Test
public void testXpath2Selector() {
    // The XPath 2.0 selector must find at least one href attribute in the sample page.
    String firstHref = new Xpath2Selector("//a/@href").select(html);
    Assert.assertNotNull(firstHref);
}
@Ignore("take long time")
@Test
public void performanceTest() {
    // Timing run: XPath 2.0 selector, then the XPath 1.0 selector, then the
    // XPath 2.0 selector again; each pass prints its elapsed milliseconds.
    final int rounds = 1000;

    Xpath2Selector saxonSelector = new Xpath2Selector("//a");
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        saxonSelector.selectList(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    XpathSelector plainSelector = new XpathSelector("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        plainSelector.selectList(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        saxonSelector.selectList(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}
}
webmagic-extension
-------
webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。
\ No newline at end of file
......@@ -2,13 +2,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-plugin</artifactId>
<version>0.1.0</version>
<artifactId>webmagic</artifactId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId>
<dependencies>
......@@ -17,7 +17,15 @@
<artifactId>selenium-java</artifactId>
<version>2.33.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger;
import org.openqa.selenium.By;
......@@ -21,8 +21,8 @@ import java.util.Map;
* 使用Selenium调用浏览器进行渲染。目前仅支持chrome。<br>
* 需要下载Selenium driver支持。<br>
*
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:37 <br>
*/
public class SeleniumDownloader implements Downloader, Destroyable {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
......@@ -11,8 +11,8 @@ import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:41 <br>
*/
class WebDriverPool {
......
package us.codecraft.webmagic.selenium;
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
......@@ -13,8 +13,8 @@ import java.util.HashMap;
import java.util.Map;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午12:27 <br>
*/
public class SeleniumTest {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.junit.Ignore;
import org.junit.Test;
......@@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午2:46 <br>
*/
public class SeleniumDownloaderTest {
......
package us.codecraft.webmagic.selenium.downloader;
package us.codecraft.webmagic.downloader.selenium;
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.WebDriver;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午2:12 <br>
*/
public class WebDriverPoolTest {
......
......@@ -3,16 +3,15 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
/**
* 花瓣网抽取器。<br>
* 使用Selenium做页面动态渲染。<br>
* @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午4:08 <br>
*/
public class HuabanProcessor implements PageProcessor {
......@@ -39,7 +38,6 @@ public class HuabanProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.scheduler(new RedisScheduler("localhost"))
.pipeline(new FilePipeline("/data/webmagic/test/"))
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.runAsync();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment