Commit 986ae0be authored by yihua.huang

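Update Selectable API: replace the abbreviated method names (x(), xs(), sc(), a(), as(), r(), rs(), rp()) with xpath(), smartContent(), links(), regex() and replace().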

parent 586d23ef
...@@ -33,7 +33,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -33,7 +33,7 @@ public class HttpClientDownloader implements Downloader {
if (site.getAcceptStatCode().contains(statusCode)) { if (site.getAcceptStatCode().contains(statusCode)) {
if (site.getEncoding() == null){ if (site.getEncoding() == null){
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString()); site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
} }
String content = IOUtils.toString(httpResponse.getEntity().getContent(), String content = IOUtils.toString(httpResponse.getEntity().getContent(),
site.getEncoding()); site.getEncoding());
......
...@@ -28,13 +28,13 @@ public class SimplePageProcessor implements PageProcessor { ...@@ -28,13 +28,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().as().rs(urlPattern).toStrings(); List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
//调用page.addTargetRequests()方法添加待抓取链接 //调用page.addTargetRequests()方法添加待抓取链接
page.addTargetRequests(requests); page.addTargetRequests(requests);
//xpath方式抽取 //xpath方式抽取
page.putField("title", page.getHtml().x("//title")); page.putField("title", page.getHtml().xpath("//title"));
//sc表示使用Readability技术抽取正文 //smartContent()表示使用Readability技术抽取正文
page.putField("content", page.getHtml().sc()); page.putField("content", page.getHtml().smartContent());
} }
@Override @Override
......
...@@ -18,12 +18,6 @@ public class Html extends PlainText { ...@@ -18,12 +18,6 @@ public class Html extends PlainText {
super(text); super(text);
} }
@Override
public Selectable x(String xpath) {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath);
return select(xpathSelector,strings);
}
@Override @Override
protected Selectable select(Selector selector, List<String> strings) { protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
...@@ -47,25 +41,19 @@ public class Html extends PlainText { ...@@ -47,25 +41,19 @@ public class Html extends PlainText {
} }
@Override @Override
public Selectable sc() { public Selectable smartContent() {
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
return select(smartContentSelector,strings); return select(smartContentSelector,strings);
} }
@Override @Override
public Selectable a() { public Selectable links() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
return select(xpathSelector,strings);
}
@Override
public Selectable as() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
return selectList(xpathSelector,strings); return selectList(xpathSelector,strings);
} }
@Override @Override
public Selectable xs(String xpath) { public Selectable xpath(String xpath) {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath);
return selectList(xpathSelector, strings); return selectList(xpathSelector, strings);
} }
......
...@@ -25,38 +25,22 @@ public class PlainText implements Selectable { ...@@ -25,38 +25,22 @@ public class PlainText implements Selectable {
} }
@Override @Override
public Selectable x(String xpath) { public Selectable xpath(String xpath) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public Selectable xs(String xpath) { public Selectable smartContent() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public Selectable sc() { public Selectable links() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public Selectable a() { public Selectable regex(String regex) {
throw new UnsupportedOperationException();
}
@Override
public Selectable as() {
throw new UnsupportedOperationException();
}
@Override
public Selectable r(String regex) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex);
return select(regexSelector, strings);
}
@Override
public Selectable rs(String regex) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex);
return selectList(regexSelector, strings); return selectList(regexSelector, strings);
} }
...@@ -82,7 +66,7 @@ public class PlainText implements Selectable { ...@@ -82,7 +66,7 @@ public class PlainText implements Selectable {
} }
@Override @Override
public Selectable rp(String regex, String replacement) { public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement);
return select(replaceSelector, strings); return select(replaceSelector, strings);
} }
......
...@@ -9,51 +9,27 @@ import java.util.List; ...@@ -9,51 +9,27 @@ import java.util.List;
*/ */
public interface Selectable { public interface Selectable {
/**
* select with xpath
*
* @param xpath
* @return new Selectable after extract
*/
public Selectable x(String xpath);
/** /**
* select list with xpath * select list with xpath
* *
* @param xpath * @param xpath
* @return new Selectable after extract * @return new Selectable after extract
*/ */
public Selectable xs(String xpath); public Selectable xpath(String xpath);
/** /**
* select smart content with ReadAbility algorithm * select smart content with ReadAbility algorithm
* *
* @return content * @return content
*/ */
public Selectable sc(); public Selectable smartContent();
/**
* select a link
*
* @return first link
*/
public Selectable a();
/** /**
* select all links * select all links
* *
* @return all links * @return all links
*/ */
public Selectable as(); public Selectable links();
/**
* select with regex
*
* @param regex
* @return new Selectable after extract
*/
public Selectable r(String regex);
/** /**
* select list with regex * select list with regex
...@@ -61,7 +37,7 @@ public interface Selectable { ...@@ -61,7 +37,7 @@ public interface Selectable {
* @param regex * @param regex
* @return new Selectable after extract * @return new Selectable after extract
*/ */
public Selectable rs(String regex); public Selectable regex(String regex);
/** /**
* replace with regex * replace with regex
...@@ -70,7 +46,7 @@ public interface Selectable { ...@@ -70,7 +46,7 @@ public interface Selectable {
* @param replacement * @param replacement
* @return new Selectable after extract * @return new Selectable after extract
*/ */
public Selectable rp(String regex, String replacement); public Selectable replace(String regex, String replacement);
/** /**
* single string result * single string result
......
...@@ -14,7 +14,7 @@ public class HtmlTest { ...@@ -14,7 +14,7 @@ public class HtmlTest {
@Test @Test
public void testRegexSelector() { public void testRegexSelector() {
Html selectable = new Html("aaaaaaab"); Html selectable = new Html("aaaaaaab");
Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
} }
} }
...@@ -1350,7 +1350,7 @@ public class XpathSelectorTest { ...@@ -1350,7 +1350,7 @@ public class XpathSelectorTest {
@Test @Test
public void testOschina() { public void testOschina() {
Html html1 = new Html(html); Html html1 = new Html(html);
Assert.assertEquals("再次吐槽easyui", html1.x(".//*[@class='QTitle']/h1/a").toString()); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
} }
} }
...@@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor { ...@@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//a()表示提取链接,as()表示提取所有链接 //links()表示提取所有链接(原a()/as()已合并为links())
//getHtml()返回Html对象,支持链式调用 //getHtml()返回Html对象,支持链式调用
//r()表示用正则表达式提取一条内容,rs()表示提取多条内容 //regex()表示用正则表达式提取内容(原r()/rs()已合并为regex())
//toString()表示取单条结果,toStrings()表示取多条 //toString()表示取单条结果,toStrings()表示取多条
List<String> requests = page.getHtml().as().rs("(.*/post/.*)").toStrings(); List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings();
//使用page.addTargetRequests()方法将待抓取的链接加入队列 //使用page.addTargetRequests()方法将待抓取的链接加入队列
page.addTargetRequests(requests); page.addTargetRequests(requests);
//page.putField(key,value)将抽取的内容加入结果Map //page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取 //xpath()使用xpath进行抽取(原x()/xs()已合并为xpath())
page.putField("title", page.getHtml().x("//title").r("(.*?)\\|")); page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|"));
//sc()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page.putField("content", page.getHtml().sc()); page.putField("content", page.getHtml().smartContent());
page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/")); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)")); page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
} }
@Override @Override
......
...@@ -15,13 +15,13 @@ import java.util.List; ...@@ -15,13 +15,13 @@ import java.util.List;
public class DianpingProcessor implements PageProcessor { public class DianpingProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().as().rs(".*shop.*").toStrings(); List<String> requests = page.getHtml().links().regex(".*shop.*").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
requests = page.getHtml().rs(".*search/category/.*").toStrings(); requests = page.getHtml().regex(".*search/category/.*").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
if (page.getUrl().toString().contains("shop")) { if (page.getUrl().toString().contains("shop")) {
page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().sc()); page.putField("content", page.getHtml().smartContent());
} }
} }
......
...@@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor { ...@@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){ if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().x("//a[@id='thread_subject']")); page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody")); page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
} }
} }
......
...@@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor { ...@@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title",page.getHtml().r("<title>(.*)</title>")); page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().x("//dd[@class='w133']")); page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
} }
@Override @Override
......
...@@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor { ...@@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().rs("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("content",page.getHtml().sc()); page.putField("content",page.getHtml().smartContent());
} }
@Override @Override
......
...@@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor { ...@@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1;
page.addTargetRequest("http://kaichiba.com/shop/" + i); page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title")); page.putField("title",page.getHtml().xpath("//Title"));
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", "")); page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
} }
@Override @Override
......
...@@ -15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor { ...@@ -15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
if (requests.size() > 2) { if (requests.size() > 2) {
requests = requests.subList(0, 2); requests = requests.subList(0, 2);
} }
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings());
page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
} }
@Override @Override
......
...@@ -14,10 +14,10 @@ import java.util.List; ...@@ -14,10 +14,10 @@ import java.util.List;
public class NjuBBSProcessor implements PageProcessor { public class NjuBBSProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().rs("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().sc()); page.putField("content",page.getHtml().smartContent());
} }
@Override @Override
......
...@@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().sc()); page.putField("content", page.getHtml().smartContent());
page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
} }
@Override @Override
......
...@@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor { ...@@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
} }
@Override @Override
......
...@@ -18,10 +18,10 @@ public class QzoneBlogProcessor implements PageProcessor { ...@@ -18,10 +18,10 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().sc()); page.putField("content",page.getHtml().smartContent());
} }
@Override @Override
......
...@@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor { ...@@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings());
page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); // page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a"));
} }
@Override @Override
......
...@@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor { ...@@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
page.addTargetRequests(strings); page.addTargetRequests(strings);
page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
page.putField("body",page.getHtml().sc()); page.putField("body",page.getHtml().smartContent());
} }
@Override @Override
......
...@@ -51,7 +51,7 @@ public class SpiderTest { ...@@ -51,7 +51,7 @@ public class SpiderTest {
/** /**
* *
* _hrefs = rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}") * _hrefs = regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
* title = r(""<title>(.*)</title>"") * title = r(""<title>(.*)</title>"")
* body = x("//dd[@class='w133']") * body = x("//dd[@class='w133']")
* *
...@@ -72,7 +72,7 @@ public class SpiderTest { ...@@ -72,7 +72,7 @@ public class SpiderTest {
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
* *
* body=body[r(_currentUrl).g(1)] * body=body[r(_currentUrl).g(1)]
* tags[%] = (tags[%] + xs('')) . r('') * tags[%] = (tags[%] + xpath('')) . r('')
* *
* _targetUrls.add('' + x('').r('')) * _targetUrls.add('' + x('').r(''))
* _sourceUrls.add() * _sourceUrls.add()
...@@ -114,7 +114,7 @@ public class SpiderTest { ...@@ -114,7 +114,7 @@ public class SpiderTest {
* content = t(_html) > c() * content = t(_html) > c()
* title = x(_html, 'asd@asd') > r('',1) * title = x(_html, 'asd@asd') > r('',1)
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
* tags[%] = tags + xs('') > r('') * tags[%] = tags + xpath('') > r('')
* model.setTargetUrl(); * model.setTargetUrl();
* *
* _targetUrl = '' + x('') & r('') * _targetUrl = '' + x('') & r('')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment