Commit 986ae0be authored by yihua.huang

update Select api: remove x() s() etc.

parent 586d23ef
......@@ -33,7 +33,7 @@ public class HttpClientDownloader implements Downloader {
if (site.getAcceptStatCode().contains(statusCode)) {
if (site.getEncoding() == null){
String value = httpResponse.getEntity().getContentType().getValue();
site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString());
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
}
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
site.getEncoding());
......
......@@ -28,13 +28,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().as().rs(urlPattern).toStrings();
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
//调用page.addTargetRequests()方法添加待抓取链接
page.addTargetRequests(requests);
//xpath方式抽取
page.putField("title", page.getHtml().x("//title"));
page.putField("title", page.getHtml().xpath("//title"));
//smartContent()表示使用Readability技术抽取正文
page.putField("content", page.getHtml().sc());
page.putField("content", page.getHtml().smartContent());
}
@Override
......
......@@ -18,12 +18,6 @@ public class Html extends PlainText {
super(text);
}
@Override
public Selectable x(String xpath) {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath);
return select(xpathSelector,strings);
}
@Override
protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>();
......@@ -47,25 +41,19 @@ public class Html extends PlainText {
}
@Override
public Selectable sc() {
public Selectable smartContent() {
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
return select(smartContentSelector,strings);
}
@Override
public Selectable a() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
return select(xpathSelector,strings);
}
@Override
public Selectable as() {
public Selectable links() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
return selectList(xpathSelector,strings);
}
@Override
public Selectable xs(String xpath) {
public Selectable xpath(String xpath) {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath);
return selectList(xpathSelector, strings);
}
......
......@@ -25,38 +25,22 @@ public class PlainText implements Selectable {
}
@Override
public Selectable x(String xpath) {
public Selectable xpath(String xpath) {
throw new UnsupportedOperationException();
}
@Override
public Selectable xs(String xpath) {
public Selectable smartContent() {
throw new UnsupportedOperationException();
}
@Override
public Selectable sc() {
public Selectable links() {
throw new UnsupportedOperationException();
}
@Override
public Selectable a() {
throw new UnsupportedOperationException();
}
@Override
public Selectable as() {
throw new UnsupportedOperationException();
}
@Override
public Selectable r(String regex) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex);
return select(regexSelector, strings);
}
@Override
public Selectable rs(String regex) {
public Selectable regex(String regex) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex);
return selectList(regexSelector, strings);
}
......@@ -82,7 +66,7 @@ public class PlainText implements Selectable {
}
@Override
public Selectable rp(String regex, String replacement) {
public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement);
return select(replaceSelector, strings);
}
......
......@@ -9,51 +9,27 @@ import java.util.List;
*/
public interface Selectable {
/**
* select with xpath
*
* @param xpath
* @return new Selectable after extract
*/
public Selectable x(String xpath);
/**
* select list with xpath
*
* @param xpath
* @return new Selectable after extract
*/
public Selectable xs(String xpath);
public Selectable xpath(String xpath);
/**
* select smart content with ReadAbility algorithm
*
* @return content
*/
public Selectable sc();
/**
* select a link
*
* @return first link
*/
public Selectable a();
public Selectable smartContent();
/**
* select all links
*
* @return all links
*/
public Selectable as();
/**
* select with regex
*
* @param regex
* @return new Selectable after extract
*/
public Selectable r(String regex);
public Selectable links();
/**
* select list with regex
......@@ -61,7 +37,7 @@ public interface Selectable {
* @param regex
* @return new Selectable after extract
*/
public Selectable rs(String regex);
public Selectable regex(String regex);
/**
* replace with regex
......@@ -70,7 +46,7 @@ public interface Selectable {
* @param replacement
* @return new Selectable after extract
*/
public Selectable rp(String regex, String replacement);
public Selectable replace(String regex, String replacement);
/**
* single string result
......
......@@ -14,7 +14,7 @@ public class HtmlTest {
@Test
public void testRegexSelector() {
Html selectable = new Html("aaaaaaab");
Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString()));
Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
}
}
......@@ -1350,7 +1350,7 @@ public class XpathSelectorTest {
@Test
public void testOschina() {
Html html1 = new Html(html);
Assert.assertEquals("再次吐槽easyui", html1.x(".//*[@class='QTitle']/h1/a").toString());
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
}
}
......@@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor {
@Override
public void process(Page page) {
//a()表示提取链接,as()表示提取所有链接
//links()表示提取所有链接
//getHtml()返回Html对象,支持链式调用
//r()表示用正则表达式提取一条内容,rs()表示提取多条内容
//regex()表示用正则表达式提取内容(可匹配多条)
//toString()表示取单条结果,toStrings()表示取多条
List<String> requests = page.getHtml().as().rs("(.*/post/.*)").toStrings();
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings();
//使用page.addTargetRequests()方法将待抓取的链接加入队列
page.addTargetRequests(requests);
//page.putField(key,value)将抽取的内容加入结果Map
//xpath()使用xpath进行抽取
page.putField("title", page.getHtml().x("//title").r("(.*?)\\|"));
//sc()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page.putField("content", page.getHtml().sc());
page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/"));
page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)"));
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|"));
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page.putField("content", page.getHtml().smartContent());
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
@Override
......
......@@ -15,13 +15,13 @@ import java.util.List;
public class DianpingProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().as().rs(".*shop.*").toStrings();
List<String> requests = page.getHtml().links().regex(".*shop.*").toStrings();
page.addTargetRequests(requests);
requests = page.getHtml().rs(".*search/category/.*").toStrings();
requests = page.getHtml().regex(".*search/category/.*").toStrings();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("shop")) {
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().sc());
page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().smartContent());
}
}
......
......@@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
page.addTargetRequests(requests);
requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().x("//a[@id='thread_subject']"));
page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody"));
page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
}
}
......
......@@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
page.addTargetRequests(strings);
page.putField("title",page.getHtml().r("<title>(.*)</title>"));
page.putField("body",page.getHtml().x("//dd[@class='w133']"));
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
}
@Override
......
......@@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().rs("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("content",page.getHtml().sc());
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("content",page.getHtml().smartContent());
}
@Override
......
......@@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1;
page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title"));
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
page.putField("title",page.getHtml().xpath("//Title"));
page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
}
@Override
......
......@@ -15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
if (requests.size() > 2) {
requests = requests.subList(0, 2);
}
page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings());
page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
}
@Override
......
......@@ -14,10 +14,10 @@ import java.util.List;
public class NjuBBSProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().rs("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().sc());
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent());
}
@Override
......
......@@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings();
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().sc());
page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent());
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
}
@Override
......
......@@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
}
@Override
......
......@@ -18,10 +18,10 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().sc());
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent());
}
@Override
......
......@@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings());
page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2"));
page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)"));
// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings());
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a"));
}
@Override
......
......@@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b"));
page.putField("body",page.getHtml().sc());
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
page.putField("body",page.getHtml().smartContent());
}
@Override
......
......@@ -51,7 +51,7 @@ public class SpiderTest {
/**
*
* _hrefs = rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
* _hrefs = regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
* title = r(""<title>(.*)</title>"")
* body = x("//dd[@class='w133']")
*
......@@ -72,7 +72,7 @@ public class SpiderTest {
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
*
* body=body[r(_currentUrl).g(1)]
* tags[%] = (tags[%] + xs('')) . r('')
* tags[%] = (tags[%] + xpath('')) . r('')
*
* _targetUrls.add('' + x('').r(''))
* _sourceUrls.add()
......@@ -114,7 +114,7 @@ public class SpiderTest {
* content = t(_html) > c()
* title = x(_html, 'asd@asd') > r('',1)
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
* tags[%] = tags + xs('') > r('')
* tags[%] = tags + xpath('') > r('')
* model.setTargetUrl();
*
* _targetUrl = '' + x('') & r('')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment