Commit 36494bcf authored by yihua.huang

add xpath2.0 api

parent 5c96407a
...@@ -7,25 +7,18 @@ import java.util.Map; ...@@ -7,25 +7,18 @@ import java.util.Map;
/** /**
* 命令行输出抽取结果。可用于测试。<br> * 命令行输出抽取结果。可用于测试。<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:45 * Time: 下午1:45
*/ */
public class ConsolePipeline implements Pipeline{ public class ConsolePipeline implements Pipeline {
@Override @Override
public void process(ResultItems resultItems,Task task) { public void process(ResultItems resultItems, Task task) {
System.out.println("get page: "+resultItems.getRequest().getUrl()); System.out.println("get page: " + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) { System.out.println(entry.getKey() + ":\t" + entry.getValue());
Iterable value = (Iterable) entry.getValue();
System.out.println(entry.getKey() + ":");
for (Object o : value) {
System.out.println(o);
}
} else {
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
} }
} }
} }
...@@ -63,6 +63,12 @@ public class Html extends PlainText { ...@@ -63,6 +63,12 @@ public class Html extends PlainText {
return selectList(xpathSelector, strings); return selectList(xpathSelector, strings);
} }
/**
 * Selects with an XPath 2.0 expression.
 *
 * <p>Obtains an {@code Xpath2Selector} for the expression from
 * {@code SelectorFactory} and hands it, together with {@code strings},
 * to {@code selectList}.
 *
 * @param xpath the XPath 2.0 expression to evaluate
 * @return whatever {@code selectList} produces for the selector
 */
@Override
public Selectable xpath2(String xpath) {
    return selectList(SelectorFactory.getInstatnce().newXpath2Selector(xpath), strings);
}
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
CssSelector cssSelector = new CssSelector(selector); CssSelector cssSelector = new CssSelector(selector);
......
...@@ -34,6 +34,11 @@ public class PlainText implements Selectable { ...@@ -34,6 +34,11 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
 * XPath 2.0 selection is not available on this type.
 *
 * @param xpath the XPath 2.0 expression (ignored)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable xpath2(String xpath) {
throw new UnsupportedOperationException();
}
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
......
...@@ -18,6 +18,14 @@ public interface Selectable { ...@@ -18,6 +18,14 @@ public interface Selectable {
*/ */
public Selectable xpath(String xpath); public Selectable xpath(String xpath);
/**
* Select a list using XPath 2.0 syntax.
*
* @param xpath the XPath 2.0 expression to evaluate
* @return a new Selectable holding the extracted result
*/
public Selectable xpath2(String xpath);
/** /**
* select list with css selector * select list with css selector
* *
......
...@@ -34,6 +34,10 @@ public class SelectorFactory { ...@@ -34,6 +34,10 @@ public class SelectorFactory {
return newSelector(XpathSelector.class, xpath); return newSelector(XpathSelector.class, xpath);
} }
/**
 * Obtains an {@code Xpath2Selector} for the given expression by delegating
 * to {@code newSelector}.
 *
 * @param xpath the XPath 2.0 expression passed on to {@code newSelector}
 * @return the selector returned by {@code newSelector}
 */
public Xpath2Selector newXpath2Selector(String xpath) {
    final Xpath2Selector selector = newSelector(Xpath2Selector.class, xpath);
    return selector;
}
public SmartContentSelector newSmartContentSelector(){ public SmartContentSelector newSmartContentSelector(){
return newSelector(SmartContentSelector.class); return newSelector(SmartContentSelector.class);
} }
......
package us.codecraft.webmagic.schedular; package us.codecraft.webmagic.scheduler;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.schedular.Scheduler;
import java.io.*; import java.io.*;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
......
...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site; ...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ExtractBy; import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.TargetUrl; import us.codecraft.webmagic.model.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline;
/** /**
...@@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{ ...@@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run();
} }
public String getTitle() { public String getTitle() {
......
...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples; ...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
...@@ -5,7 +5,7 @@ import org.junit.Test; ...@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; ...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor; import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
......
...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; ...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor; import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
......
...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; ...@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser; import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment