Commit 65dc3721 authored by yihua.huang

update pipeline api

parent 55d80129
...@@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils; ...@@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/** /**
* <pre> * <pre>
...@@ -27,7 +25,7 @@ public class Page { ...@@ -27,7 +25,7 @@ public class Page {
private Request request; private Request request;
private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>(); private ResultItems resultItems = new ResultItems();
private Selectable html; private Selectable html;
...@@ -35,44 +33,16 @@ public class Page { ...@@ -35,44 +33,16 @@ public class Page {
private List<Request> targetRequests = new ArrayList<Request>(); private List<Request> targetRequests = new ArrayList<Request>();
private boolean skip;
private Object extra;
/**
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
* @return 是否忽略 true 忽略
*/
public boolean isSkip() {
return skip;
}
/**
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
* @param skip 是否忽略 true 忽略
*/
public void setSkip(boolean skip) {
this.skip = skip;
}
public Page() { public Page() {
} }
/**
* 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
* @return fields 抽取的结果
*/
public Map<String, Selectable> getFields() {
return fields;
}
/** /**
* 保存抽取的结果 * 保存抽取的结果
* @param key 结果的key * @param key 结果的key
* @param field 结果的value * @param field 结果的value
*/ */
public void putField(String key, Selectable field) { public void putField(String key, Object field) {
fields.put(key, field); resultItems.put(key, field);
} }
/** /**
...@@ -157,23 +127,10 @@ public class Page { ...@@ -157,23 +127,10 @@ public class Page {
public void setRequest(Request request) { public void setRequest(Request request) {
this.request = request; this.request = request;
this.resultItems.setRequest(request);
} }
/** public ResultItems getResultItems() {
* 获取附加对象 return resultItems;
* @param <T> 对象类型
* @return 对象内容
*/
public <T> T getExtra() {
return (T)extra;
}
/**
* 设置附加对象
* @param extra 对象内容
* @param <T> 对象类型
*/
public <T> void setExtra(T extra) {
this.extra = extra;
} }
} }
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Holds the results extracted from one page. It is produced by a PageProcessor and passed to
 * {@link us.codecraft.webmagic.pipeline.Pipeline} for persistence.<br>
 * Fields are kept in insertion order, so pipelines that iterate {@link #getAll()} emit them in
 * the order they were put. This class performs no synchronization of its own.<br>
 * @author yihua.huang@dianping.com <br>
 * @date: 13-7-25 <br>
 * Time: 12:20 PM <br>
 */
public class ResultItems {

    /** Extracted fields, keyed by name; a LinkedHashMap keeps iteration order deterministic. */
    private final Map<String, Object> fields = new LinkedHashMap<String, Object>();

    /** The request whose page produced these results. */
    private Request request;

    /** When true, pipelines are expected to ignore this result. */
    private boolean skip;

    /**
     * Returns the value stored under {@code key}, cast to the type expected by the caller.
     *
     * @param key name of the field
     * @param <T> type the caller expects; a mismatch surfaces as a ClassCastException at the call site
     * @return the stored value, or {@code null} if nothing is stored under {@code key}
     */
    @SuppressWarnings("unchecked")
    public <T> T get(String key) {
        // One lookup is enough: a missing key already yields null, and casting null is safe.
        return (T) fields.get(key);
    }

    /**
     * Returns all extracted fields.
     *
     * @return the live backing map (not a copy), iterated in insertion order
     */
    public Map<String, Object> getAll() {
        return fields;
    }

    /**
     * Stores an extracted value, replacing any previous value for the same key.
     *
     * @param key   name of the field
     * @param value extracted value
     * @param <T>   type of the value
     * @return this
     */
    public <T> ResultItems put(String key, T value) {
        fields.put(key, value);
        return this;
    }

    /**
     * @return the request associated with these results, or {@code null} if none was set
     */
    public Request getRequest() {
        return request;
    }

    /**
     * Associates these results with the request that produced them.
     *
     * @param request the originating request
     * @return this
     */
    public ResultItems setRequest(Request request) {
        this.request = request;
        return this;
    }

    /**
     * Whether this page should be skipped; pipelines use it to decide whether to process the page.
     *
     * @return true if the page should be skipped
     */
    public boolean isSkip() {
        return skip;
    }

    /**
     * Sets whether this page should be skipped; pipelines use it to decide whether to process
     * the page.
     *
     * @param skip true to skip the page
     * @return this
     */
    public ResultItems setSkip(boolean skip) {
        this.skip = skip;
        return this;
    }
}
...@@ -196,7 +196,7 @@ public class Spider implements Runnable, Task { ...@@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
pageProcessor.process(page); pageProcessor.process(page);
addRequest(page); addRequest(page);
for (Pipeline pipeline : pipelines) { for (Pipeline pipeline : pipelines) {
pipeline.process(page, this); pipeline.process(page.getResultItems(), this);
} }
sleep(site.getSleepTime()); sleep(site.getSleepTime());
} }
......
package us.codecraft.webmagic.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Map; import java.util.Map;
...@@ -15,13 +14,10 @@ import java.util.Map; ...@@ -15,13 +14,10 @@ import java.util.Map;
public class ConsolePipeline implements Pipeline{ public class ConsolePipeline implements Pipeline{
@Override @Override
public void process(Page page,Task task) { public void process(ResultItems resultItems,Task task) {
System.out.println("get page: "+page.getUrl()); System.out.println("get page: "+resultItems.getRequest().getUrl());
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); System.out.println(entry.getKey()+":\t"+entry.getValue());
}
if (page.getExtra()!=null){
System.out.println(page.getExtra());
} }
} }
} }
...@@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline; ...@@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.Map;
/** /**
* 持久化到文件的接口。 * 持久化到文件的接口。
...@@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline { ...@@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
} }
@Override @Override
public void process(Page page, Task task) { public void process(ResultItems resultItems, Task task) {
String path = this.path + "/" + task.getUUID() + "/"; String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path); File file = new File(path);
if (!file.exists()) { if (!file.exists()) {
file.mkdirs(); file.mkdirs();
} }
try { try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()))); PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())));
printWriter.println("url:\t" + page.getUrl()); printWriter.println("url:\t" + resultItems.getRequest().getUrl());
printWriter.println("html:\t" + page.getHtml()); for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
printWriter.println(entry.getKey()+":\t"+entry.getValue());
}
printWriter.close(); printWriter.close();
} catch (IOException e) { } catch (IOException e) {
logger.warn("write file error",e); logger.warn("write file error",e);
......
package us.codecraft.webmagic.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
/** /**
...@@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task; ...@@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
*/ */
public interface Pipeline { public interface Pipeline {
public void process(Page page,Task task); public void process(ResultItems resultItems,Task task);
} }
...@@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor { ...@@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings(); List<String> requests = page.getHtml().links().regex(urlPattern).all();
//调用page.addTargetRequests()方法添加待抓取链接 //调用page.addTargetRequests()方法添加待抓取链接
page.addTargetRequests(requests); page.addTargetRequests(requests);
//xpath方式抽取 //xpath方式抽取
page.putField("title", page.getHtml().xpath("//title")); page.putField("title", page.getHtml().xpath("//title"));
//sc表示使用Readability技术抽取正文 //sc表示使用Readability技术抽取正文
page.putField("html", page.getHtml().toString());
page.putField("content", page.getHtml().smartContent()); page.putField("content", page.getHtml().smartContent());
} }
......
...@@ -82,14 +82,14 @@ public class PlainText implements Selectable { ...@@ -82,14 +82,14 @@ public class PlainText implements Selectable {
} }
@Override @Override
public List<String> toStrings() { public List<String> all() {
return strings; return strings;
} }
@Override @Override
public String toString() { public String toString() {
if (CollectionUtils.isNotEmpty(toStrings())) { if (CollectionUtils.isNotEmpty(all())) {
return toStrings().get(0); return all().get(0);
} else { } else {
return null; return null;
} }
......
...@@ -69,5 +69,5 @@ public interface Selectable { ...@@ -69,5 +69,5 @@ public interface Selectable {
* *
* @return multi string result * @return multi string result
*/ */
public List<String> toStrings(); public List<String> all();
} }
...@@ -1351,7 +1351,7 @@ public class XpathSelectorTest { ...@@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
public void testOschina() { public void testOschina() {
Html html1 = new Html(html); Html html1 = new Html(html);
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
} }
} }
...@@ -4,7 +4,7 @@ import freemarker.template.Configuration; ...@@ -4,7 +4,7 @@ import freemarker.template.Configuration;
import freemarker.template.Template; import freemarker.template.Template;
import freemarker.template.TemplateException; import freemarker.template.TemplateException;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import java.io.File; import java.io.File;
...@@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline { ...@@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
@Override @Override
public void process(Page page, Task task) { public void process(ResultItems resultItems, Task task) {
if (page.isSkip()) { if (resultItems.isSkip()) {
return; return;
} }
String path = this.path + "" + task.getUUID() + "/"; String path = this.path + "" + task.getUUID() + "/";
...@@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline { ...@@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
file.mkdirs(); file.mkdirs();
} }
try { try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
template.process(page.getFields(), printWriter); template.process(resultItems.getAll(), printWriter);
printWriter.close(); printWriter.close();
} catch (TemplateException e) { } catch (TemplateException e) {
} catch (IOException e) { } catch (IOException e) {
......
...@@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{ ...@@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
@Override @Override
public synchronized void push(Request request, Task task) { public synchronized void push(Request request, Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
//使用SortedSet进行url去重
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
//使用List保存队列
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
} }
......
package us.codecraft.webmagic.scheduler; package us.codecraft.webmagic.scheduler;
import org.junit.Before; import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
...@@ -20,6 +21,7 @@ public class RedisSchedulerTest { ...@@ -20,6 +21,7 @@ public class RedisSchedulerTest {
redisScheduler = new RedisScheduler("localhost"); redisScheduler = new RedisScheduler("localhost");
} }
@Ignore("environment depended")
@Test @Test
public void test() { public void test() {
Task task = new Task() { Task task = new Task() {
...@@ -35,7 +37,6 @@ public class RedisSchedulerTest { ...@@ -35,7 +37,6 @@ public class RedisSchedulerTest {
}; };
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
Request poll = redisScheduler.poll(task); Request poll = redisScheduler.poll(task);
System.out.println(poll.getUrl());
} }
} }
...@@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor { ...@@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
//a()表示提取链接,links()表示提取所有链接 //a()表示提取链接,links()表示提取所有链接
//getHtml()返回Html对象,支持链式调用 //getHtml()返回Html对象,支持链式调用
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容
//toString()表示取单条结果,toStrings()表示取多条 //toString()表示取单条结果,all()表示取多条
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings(); List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
//使用page.addTargetRequests()方法将待抓取的链接加入队列 //使用page.addTargetRequests()方法将待抓取的链接加入队列
page.addTargetRequests(requests); page.addTargetRequests(requests);
//page.putField(key,value)将抽取的内容加入结果Map //page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取 //x()和xs()使用xpath进行抽取
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|")); page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page.putField("content", page.getHtml().smartContent()); page.putField("content", page.getHtml().smartContent());
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
......
...@@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor { ...@@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings(); List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
} }
......
...@@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor { ...@@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){ if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
......
...@@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor { ...@@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title",page.getHtml().regex("<title>(.*)</title>")); page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
......
...@@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor { ...@@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
final List<String> requests = page.getHtml().links().toStrings(); final List<String> requests = page.getHtml().links().all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
} }
......
...@@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor { ...@@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("content",page.getHtml().smartContent()); page.putField("content",page.getHtml().smartContent());
......
...@@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor { ...@@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
if (requests.size() > 2) { if (requests.size() > 2) {
requests = requests.subList(0, 2); requests = requests.subList(0, 2);
} }
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings()); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
} }
......
...@@ -14,7 +14,7 @@ import java.util.List; ...@@ -14,7 +14,7 @@ import java.util.List;
public class NjuBBSProcessor implements PageProcessor { public class NjuBBSProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent()); page.putField("content",page.getHtml().smartContent());
......
...@@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings(); List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent()); page.putField("content", page.getHtml().smartContent());
......
...@@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor { ...@@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
......
...@@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor { ...@@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent()); page.putField("content",page.getHtml().smartContent());
......
...@@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor { ...@@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings()); page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
......
...@@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor { ...@@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
page.putField("body",page.getHtml().smartContent()); page.putField("body",page.getHtml().smartContent());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment