Commit 7bed01c9 authored by yihua.huang

update Spider api

parent 69ff524d
...@@ -3,10 +3,12 @@ package us.codecraft.webmagic; ...@@ -3,10 +3,12 @@ package us.codecraft.webmagic;
import java.util.*; import java.util.*;
/** /**
* Site定义一个待抓取的站点的各种信息。 * Site定义一个待抓取的站点的各种信息。<br>
* 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:13 * Time: 下午12:13
*/ */
public class Site { public class Site {
...@@ -30,73 +32,157 @@ public class Site { ...@@ -30,73 +32,157 @@ public class Site {
DEFAULT_STATUS_CODE_SET.add(200); DEFAULT_STATUS_CODE_SET.add(200);
} }
/**
* 创建一个Site对象,等价于new Site()
*
* @return 新建的对象
*/
public static Site me() { public static Site me() {
return new Site(); return new Site();
} }
public Site setCookie(String name, String value) { /**
* 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
*
* @param name cookie的名称
* @param value cookie的值
* @return this
*/
public Site addCookie(String name, String value) {
cookies.put(name, value); cookies.put(name, value);
return this; return this;
} }
/**
* 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
*
* @param userAgent userAgent
* @return this
*/
public Site setUserAgent(String userAgent) { public Site setUserAgent(String userAgent) {
this.userAgent = userAgent; this.userAgent = userAgent;
return this; return this;
} }
/**
* 获取已经设置的所有cookie
*
* @return 已经设置的所有cookie
*/
public Map<String, String> getCookies() { public Map<String, String> getCookies() {
return cookies; return cookies;
} }
/**
* 获取已设置的user-agent
*
* @return 已设置的user-agent
*/
public String getUserAgent() { public String getUserAgent() {
return userAgent; return userAgent;
} }
/**
* 获取已设置的domain
*
* @return 已设置的domain
*/
public String getDomain() { public String getDomain() {
return domain; return domain;
} }
/**
* 设置这个站点所在域名,必须项。<br>
* 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
*
* @param domain 爬虫会抓取的域名
* @return this
*/
public Site setDomain(String domain) { public Site setDomain(String domain) {
this.domain = domain; this.domain = domain;
return this; return this;
} }
public String getEncoding() { /**
return encoding; * 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
} * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
*
* @param encoding 编码格式,主要是"utf-8"、"gbk"两种
* @return this
*/
public Site setEncoding(String encoding) { public Site setEncoding(String encoding) {
this.encoding = encoding; this.encoding = encoding;
return this; return this;
} }
public Set<Integer> getAcceptStatCode() { /**
return acceptStatCode; * 获取已设置的编码
*
* @return 已设置的编码
*/
public String getEncoding() {
return encoding;
} }
/**
* 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。<br>
* 默认为200,正常情况下,无须设置此项。<br>
* 某些站点会错误地返回状态码,此时可以对这个选项进行设置。<br>
*
* @param acceptStatCode 可接受的状态码
* @return this
*/
public Site setAcceptStatCode(Set<Integer> acceptStatCode) { public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
this.acceptStatCode = acceptStatCode; this.acceptStatCode = acceptStatCode;
return this; return this;
} }
/**
* 获取可接受的状态码
*
* @return 可接受的状态码
*/
public Set<Integer> getAcceptStatCode() {
return acceptStatCode;
}
/**
* 获取初始页面的地址列表
* @return 初始页面的地址列表
*/
public List<String> getStartUrls() { public List<String> getStartUrls() {
return startUrls; return startUrls;
} }
/**
* 增加初始页面的地址,可反复调用此方法增加多个初始地址。
* @param startUrl 初始页面的地址
* @return this
*/
public Site addStartUrl(String startUrl) { public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl); this.startUrls.add(startUrl);
return this; return this;
} }
public int getSleepTime() { /**
return sleepTime; * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
} *
* @param sleepTime 单位毫秒
* @return this
*/
public Site setSleepTime(int sleepTime) { public Site setSleepTime(int sleepTime) {
this.sleepTime = sleepTime; this.sleepTime = sleepTime;
return this; return this;
} }
/**
* 获取两次抓取之间的间隔
* @return 两次抓取之间的间隔,单位毫秒
*/
public int getSleepTime() {
return sleepTime;
}
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) return true;
......
...@@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader; ...@@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.QueueSchedular; import us.codecraft.webmagic.schedular.QueueScheduler;
import us.codecraft.webmagic.schedular.Schedular; import us.codecraft.webmagic.schedular.Scheduler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* <pre>
* webmagic爬虫的入口类。
* 示例:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
* </pre>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午6:53 * Time: 上午6:53
...@@ -32,18 +37,17 @@ public class Spider implements Runnable, Task { ...@@ -32,18 +37,17 @@ public class Spider implements Runnable, Task {
private String uuid; private String uuid;
private Schedular schedular = new QueueSchedular(); private Scheduler scheduler = new QueueScheduler();
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
public static Spider me() { public Spider(PageProcessor pageProcessor){
return new Spider();
}
public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite(); this.site = pageProcessor.getSite();
return this; }
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
} }
public Spider startUrls(List<String> startUrls) { public Spider startUrls(List<String> startUrls) {
...@@ -57,8 +61,13 @@ public class Spider implements Runnable, Task { ...@@ -57,8 +61,13 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
public Spider schedular(Schedular schedular) { public Spider setUUID(String uuid) {
this.schedular = schedular; this.uuid = uuid;
return this;
}
public Spider schedular(Scheduler scheduler) {
this.scheduler = scheduler;
return this; return this;
} }
...@@ -71,9 +80,9 @@ public class Spider implements Runnable, Task { ...@@ -71,9 +80,9 @@ public class Spider implements Runnable, Task {
@Override @Override
public void run() { public void run() {
for (String startUrl : startUrls) { for (String startUrl : startUrls) {
schedular.push(new Request(startUrl), this); scheduler.push(new Request(startUrl), this);
} }
Request request = schedular.poll(this); Request request = scheduler.poll(this);
if (pipelines.isEmpty()) { if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline()); pipelines.add(new ConsolePipeline());
} }
...@@ -89,16 +98,10 @@ public class Spider implements Runnable, Task { ...@@ -89,16 +98,10 @@ public class Spider implements Runnable, Task {
pipeline.process(page, this); pipeline.process(page, this);
} }
sleep(site.getSleepTime()); sleep(site.getSleepTime());
request = schedular.poll(this); request = scheduler.poll(this);
} }
} }
public Spider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
private void sleep(int time) { private void sleep(int time) {
try { try {
Thread.sleep(time); Thread.sleep(time);
...@@ -110,7 +113,7 @@ public class Spider implements Runnable, Task { ...@@ -110,7 +113,7 @@ public class Spider implements Runnable, Task {
private void addRequest(Page page) { private void addRequest(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) { for (Request request : page.getTargetRequests()) {
schedular.push(request, this); scheduler.push(request, this);
} }
} }
} }
......
...@@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger; ...@@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:13 * Time: 下午1:13
*/ */
public class FileCacheQueueSchedular implements Schedular { public class FileCacheQueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
...@@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular {
private Set<String> urls; private Set<String> urls;
public FileCacheQueueSchedular(String filePath) { public FileCacheQueueScheduler(String filePath) {
this.filePath = filePath; this.filePath = filePath;
} }
......
...@@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue; ...@@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:13 * Time: 下午1:13
*/ */
public class QueueSchedular implements Schedular { public class QueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
......
...@@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task; ...@@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task;
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:12 * Time: 下午1:12
*/ */
public interface Schedular { public interface Scheduler {
public void push(Request request,Task task); public void push(Request request,Task task);
......
...@@ -5,8 +5,8 @@ import java.util.List; ...@@ -5,8 +5,8 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午7:54 * Time: 上午7:54
*/ */
public class Html extends PlainText { public class Html extends PlainText {
...@@ -18,12 +18,16 @@ public class Html extends PlainText { ...@@ -18,12 +18,16 @@ public class Html extends PlainText {
super(text); super(text);
} }
public static Html create(String text) {
return new Html(text);
}
@Override @Override
protected Selectable select(Selector selector, List<String> strings) { protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
for (String string : strings) { for (String string : strings) {
String result = selector.select(string); String result = selector.select(string);
if (result!=null){ if (result != null) {
results.add(result); results.add(result);
} }
} }
...@@ -43,13 +47,13 @@ public class Html extends PlainText { ...@@ -43,13 +47,13 @@ public class Html extends PlainText {
@Override @Override
public Selectable smartContent() { public Selectable smartContent() {
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
return select(smartContentSelector,strings); return select(smartContentSelector, strings);
} }
@Override @Override
public Selectable links() { public Selectable links() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
return selectList(xpathSelector,strings); return selectList(xpathSelector, strings);
} }
@Override @Override
......
...@@ -24,6 +24,10 @@ public class PlainText implements Selectable { ...@@ -24,6 +24,10 @@ public class PlainText implements Selectable {
this.strings = results; this.strings = results;
} }
public static PlainText create(String text) {
return new PlainText(text);
}
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
......
...@@ -15,7 +15,7 @@ public class HttpClientDownloaderTest { ...@@ -15,7 +15,7 @@ public class HttpClientDownloaderTest {
@Test @Test
public void testCookie() { public void testCookie() {
Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
......
...@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor { ...@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public static void main(String[] args) { public static void main(String[] args) {
DianpingProcessor dianpingProcessor = new DianpingProcessor(); DianpingProcessor dianpingProcessor = new DianpingProcessor();
Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
} }
} }
...@@ -5,7 +5,7 @@ import org.junit.Test; ...@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
...@@ -18,7 +18,7 @@ public class SpiderTest { ...@@ -18,7 +18,7 @@ public class SpiderTest {
@Ignore @Ignore
@Test @Test
public void testSpider() throws InterruptedException { public void testSpider() throws InterruptedException {
Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
me.run(); me.run();
} }
...@@ -26,13 +26,13 @@ public class SpiderTest { ...@@ -26,13 +26,13 @@ public class SpiderTest {
@Test @Test
public void testGlobalSpider(){ public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor(); // PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run(); // processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getEncoding()); System.out.println(pageProcessor2.getSite().getEncoding());
pageProcessor2.getSite().setSleepTime(500); pageProcessor2.getSite().setSleepTime(500);
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
processor(pageProcessor2).run(); run();
} }
......
...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; ...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor; import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
...@@ -30,7 +30,7 @@ public class DiandianProcessorTest { ...@@ -30,7 +30,7 @@ public class DiandianProcessorTest {
//ConsolePipeline输出结果到控制台 //ConsolePipeline输出结果到控制台
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行 //Spider.run()执行
Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
processor(diaoyuwengProcessor).run(); run();
} }
} }
...@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider; ...@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor; import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-6-9 * Date: 13-6-9
* Time: 上午8:02 * Time: 上午8:02
*/ */
public class DiaoyuwengProcessorTest { public class DiaoyuwengProcessorTest {
...@@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest { ...@@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest {
public void test() throws IOException { public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
processor(diaoyuwengProcessor).run(); run();
} }
} }
...@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider; ...@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser; import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-6-9 * Date: 13-6-9
* Time: 上午8:02 * Time: 上午8:02
*/ */
public class SinablogProcessorTest { public class SinablogProcessorTest {
...@@ -30,7 +30,7 @@ public class SinablogProcessorTest { ...@@ -30,7 +30,7 @@ public class SinablogProcessorTest {
//ConsolePipeline输出结果到控制台 //ConsolePipeline输出结果到控制台
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行 //Spider.run()执行
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
processor(sinaBlogProcesser).run(); run();
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment