Commit 36494bcf authored by yihua.huang

add xpath2.0 api

parent 5c96407a
......@@ -7,25 +7,18 @@ import java.util.Map;
/**
* Prints the extracted results to the console. Useful for testing.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:45
*/
public class ConsolePipeline implements Pipeline{
public class ConsolePipeline implements Pipeline {
@Override
public void process(ResultItems resultItems,Task task) {
System.out.println("get page: "+resultItems.getRequest().getUrl());
public void process(ResultItems resultItems, Task task) {
    // Prints the page URL followed by every extracted field to standard output.
    // Iterable values are printed one element per line under a "key:" header;
    // any other value is printed on the same line as its key, tab-separated.
    // The task argument is not used by this implementation.
    System.out.println("get page: " + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
        Object value = entry.getValue();
        if (value instanceof Iterable) {
            // Unbounded wildcard instead of the raw Iterable type: the elements are
            // only read as Object, so no unchecked conversion is needed.
            Iterable<?> items = (Iterable<?>) value;
            System.out.println(entry.getKey() + ":");
            for (Object item : items) {
                System.out.println(item);
            }
        } else {
            System.out.println(entry.getKey() + ":\t" + value);
        }
    }
}
}
......@@ -63,6 +63,12 @@ public class Html extends PlainText {
return selectList(xpathSelector, strings);
}
/**
 * Applies an XPath 2.0 expression to this object's strings.
 * <p>
 * The selector is obtained from {@code SelectorFactory} and handed to
 * {@code selectList} together with the current {@code strings}.
 *
 * @param xpath the XPath 2.0 expression
 * @return the Selectable produced by {@code selectList}
 */
@Override
public Selectable xpath2(String xpath) {
    return selectList(SelectorFactory.getInstatnce().newXpath2Selector(xpath), strings);
}
@Override
public Selectable $(String selector) {
CssSelector cssSelector = new CssSelector(selector);
......
......@@ -34,6 +34,11 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException();
}
/**
 * XPath 2.0 selection is not available on this type.
 *
 * @param xpath the XPath 2.0 expression (ignored)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable xpath2(String xpath) {
    // Carry a message so the failing operation and receiver type are visible in logs.
    throw new UnsupportedOperationException("xpath2 is not supported by " + getClass().getName());
}
@Override
public Selectable $(String selector) {
throw new UnsupportedOperationException();
......
......@@ -18,6 +18,14 @@ public interface Selectable {
*/
public Selectable xpath(String xpath);
/**
 * Select a list with XPath 2.0 syntax.
 * <p>
 * Implementations that cannot evaluate XPath may throw
 * {@link UnsupportedOperationException}.
 *
 * @param xpath the XPath 2.0 expression to evaluate
 * @return new Selectable after extract
 */
public Selectable xpath2(String xpath);
/**
* select list with css selector
*
......
......@@ -34,6 +34,10 @@ public class SelectorFactory {
return newSelector(XpathSelector.class, xpath);
}
/**
 * Obtains an {@code Xpath2Selector} for the given expression by delegating to
 * {@code newSelector}.
 *
 * @param xpath the XPath 2.0 expression passed through to {@code newSelector}
 * @return the selector returned by {@code newSelector}
 */
public Xpath2Selector newXpath2Selector(String xpath) {
    final Xpath2Selector selector = newSelector(Xpath2Selector.class, xpath);
    return selector;
}
/**
 * Obtains a {@code SmartContentSelector} by delegating to {@code newSelector}
 * with no extra arguments.
 *
 * @return the selector returned by {@code newSelector}
 */
public SmartContentSelector newSmartContentSelector() {
    final SmartContentSelector selector = newSelector(SmartContentSelector.class);
    return selector;
}
......
package us.codecraft.webmagic.schedular;
package us.codecraft.webmagic.scheduler;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.schedular.Scheduler;
import java.io.*;
import java.util.LinkedHashSet;
......
......@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
/**
......@@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{
}
// Entry point: creates an OOSpider for the OschinaBlog model, starting from the
// blog URL below, attaches the pipeline(s) and runs it.
public static void main(String[] args) {
// NOTE(review): the two statements below look like the before/after lines of a
// diff hunk; presumably only the second one (without ConsolePipeline) is meant
// to remain. Confirm against the repository before relying on either.
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run();
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run();
}
public String getTitle() {
......
......@@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com <br>
......
......@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com <br>
......
......@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......
......@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......
......@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment