Commit 85b7cf15 authored by yihua.huang's avatar yihua.huang

complete test

parent d7cd9e57
......@@ -53,7 +53,7 @@ public class Html extends PlainText {
@Override
public Selectable links() {
XpathSelector xpathSelector = Selectors.xpath("//a/@href");
XsoupSelector xpathSelector = new XsoupSelector("//a/@href");
return selectList(xpathSelector, strings);
}
......
......@@ -15,10 +15,9 @@ import java.util.List;
public class HuxiuProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
List<String> requests = page.getHtml().links().regex(".*article.*").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
page.putField("content",page.getHtml().smartContent());
}
......
......@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
......@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.scheduler(new RedisScheduler("localhost"))
.pipeline(new FilePipeline("/data/temp/webmagic/"))
.thread(5)
.run();
}
......
......@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
......@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run();
Spider.create(new IteyeBlogProcessor()).thread(5).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
......@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new KaichibaProcessor()).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
}
page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
}
@Override
......@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new MeicanProcessor()).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
......@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
Spider.create(new OschinaBlogPageProcesser()).run();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment