Commit b393e383 authored by yihua.huang

add multi entity extract

parent bfadac75
...@@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor { ...@@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page); Object process = pageModelExtractor.process(page);
if (process == null) { if (process == null || (process instanceof List && ((List) process).size() == 0)) {
page.getResultItems().setSkip(true); page.getResultItems().setSkip(true);
} }
postProcessPageModel(pageModelExtractor.getClazz(), process); postProcessPageModel(pageModelExtractor.getClazz(), process);
......
...@@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems; ...@@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
...@@ -32,7 +34,16 @@ public class ObjectPipeline implements Pipeline { ...@@ -32,7 +34,16 @@ public class ObjectPipeline implements Pipeline {
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) { for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) { if (o != null) {
classPageModelPipelineEntry.getValue().process(o, task); Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
ExtractBy extractBy = (ExtractBy) annotation;
if (extractBy.multi()) {
List<Object> list = (List<Object>) o;
for (Object o1 : list) {
classPageModelPipelineEntry.getValue().process(o1, task);
}
} else {
classPageModelPipelineEntry.getValue().process(o, task);
}
} }
} }
} }
......
...@@ -31,6 +31,8 @@ class PageModelExtractor { ...@@ -31,6 +31,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
private Extractor extractor;
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz); pageModelExtractor.init(clazz);
...@@ -39,7 +41,7 @@ class PageModelExtractor { ...@@ -39,7 +41,7 @@ class PageModelExtractor {
private void init(Class clazz) { private void init(Class clazz) {
this.clazz = clazz; this.clazz = clazz;
initTargetUrlPatterns(); initClassExtractors();
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
...@@ -107,7 +109,7 @@ class PageModelExtractor { ...@@ -107,7 +109,7 @@ class PageModelExtractor {
} }
} }
private void initTargetUrlPatterns() { private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class); Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) { if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*")); targetUrlPatterns.add(Pattern.compile(".*"));
...@@ -115,9 +117,9 @@ class PageModelExtractor { ...@@ -115,9 +117,9 @@ class PageModelExtractor {
TargetUrl targetUrl = (TargetUrl) annotation; TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value(); String[] value = targetUrl.value();
for (String s : value) { for (String s : value) {
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
} }
if (!targetUrl.sourceRegion().equals("")){ if (!targetUrl.sourceRegion().equals("")) {
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
} }
} }
...@@ -126,12 +128,17 @@ class PageModelExtractor { ...@@ -126,12 +128,17 @@ class PageModelExtractor {
HelpUrl helpUrl = (HelpUrl) annotation; HelpUrl helpUrl = (HelpUrl) annotation;
String[] value = helpUrl.value(); String[] value = helpUrl.value();
for (String s : value) { for (String s : value) {
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
} }
if (!helpUrl.sourceRegion().equals("")){ if (!helpUrl.sourceRegion().equals("")) {
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
} }
} }
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
} }
public Object process(Page page) { public Object process(Page page) {
...@@ -144,6 +151,28 @@ class PageModelExtractor { ...@@ -144,6 +151,28 @@ class PageModelExtractor {
if (!matched) { if (!matched) {
return null; return null;
} }
if (extractor == null) {
return processSingle(page,page.getHtml().toString());
} else {
if (extractor.multi){
List<Object> os = new ArrayList<Object>();
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) {
Object o = processSingle(page, s);
if (o!=null){
os.add(o);
}
}
return os;
}else {
String select = extractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select);
return o;
}
}
}
private Object processSingle(Page page,String html) {
Object o = null; Object o = null;
try { try {
o = clazz.newInstance(); o = clazz.newInstance();
...@@ -152,38 +181,38 @@ class PageModelExtractor { ...@@ -152,38 +181,38 @@ class PageModelExtractor {
List<String> value; List<String> value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case Html: case Html:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); value = fieldExtractor.getSelector().selectList(html);
break; break;
case Url: case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break; break;
default: default:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); value = fieldExtractor.getSelector().selectList(html);
} }
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true); return null;
} }
setField(o, fieldExtractor, value); setField(o, fieldExtractor, value);
} else { } else {
String value; String value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case Html: case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString()); value = fieldExtractor.getSelector().select(html);
break; break;
case Url: case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString()); value = fieldExtractor.getSelector().select(page.getUrl().toString());
break; break;
default: default:
value = fieldExtractor.getSelector().select(page.getHtml().toString()); value = fieldExtractor.getSelector().select(html);
} }
if (value == null && fieldExtractor.isNotNull()) { if (value == null && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true); return null;
} }
setField(o, fieldExtractor, value); setField(o, fieldExtractor, value);
} }
} }
if (AfterExtractor.class.isAssignableFrom(clazz)) { if (AfterExtractor.class.isAssignableFrom(clazz)) {
((AfterExtractor)o).afterProcess(page); ((AfterExtractor) o).afterProcess(page);
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
......
package us.codecraft.webmagic.oo.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.*;
/**
 * Sample page model for answers on an oschina.net question page.
 *
 * <p>The class-level {@code @ExtractBy} is declared with {@code multi = true} and selects
 * every {@code li} of class {@code Answer} under the {@code ul} of class {@code list};
 * the field-level expressions below then pick out the answering user and the answer body.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-3 <br>
 * Time: 8:25 PM <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor {

    /** Taken from the {@code title} attribute of an {@code img} element. */
    @ExtractBy("//img/@title")
    private String user;

    /** Taken from the {@code div} of class {@code detail}; explicitly marked as allowed to be missing. */
    @ExtractBy(value = "//div[@class='detail']", notNull = false)
    private String content;

    /**
     * Hook required by {@link AfterExtractor}; this sample performs no post-processing.
     *
     * @param page the page being processed (unused here)
     */
    @Override
    public void afterProcess(Page page) {
        // Intentionally empty.
    }

    /**
     * Starts a spider for this model from a single question page.
     *
     * @param args command-line arguments (ignored)
     */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://www.oschina.net/question/567527_120597");
        OOSpider.create(site, OschinaAnswer.class).run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment