Commit b393e383 authored by yihua.huang

add multi entity extract

parent bfadac75
......@@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
if (process == null) {
if (process == null || (process instanceof List && ((List) process).size() == 0)) {
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process);
......
......@@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
......@@ -32,8 +34,17 @@ public class ObjectPipeline implements Pipeline {
// Hand every extracted page model to the PageModelPipeline registered for its class.
// Results are looked up in resultItems under the model class's canonical name.
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
    Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
    if (o != null) {
        // The class-level @ExtractBy is optional: PageModelExtractor handles models
        // without it, so getAnnotation() may return null here and must not be
        // dereferenced unconditionally.
        Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
        ExtractBy extractBy = (ExtractBy) annotation;
        if (extractBy != null && extractBy.multi()) {
            // Multi-entity extraction stores a list of model instances;
            // each one is delivered to the pipeline separately.
            List<?> list = (List<?>) o;
            for (Object o1 : list) {
                classPageModelPipelineEntry.getValue().process(o1, task);
            }
        } else {
            // Single-entity extraction (or no class-level @ExtractBy): o is the model itself.
            classPageModelPipelineEntry.getValue().process(o, task);
        }
    }
}
}
}
......@@ -31,6 +31,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors;
private Extractor extractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz);
......@@ -39,7 +41,7 @@ class PageModelExtractor {
private void init(Class clazz) {
this.clazz = clazz;
initTargetUrlPatterns();
initClassExtractors();
fieldExtractors = new ArrayList<FieldExtractor>();
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
......@@ -107,7 +109,7 @@ class PageModelExtractor {
}
}
private void initTargetUrlPatterns() {
private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*"));
......@@ -115,9 +117,9 @@ class PageModelExtractor {
TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value();
for (String s : value) {
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
}
if (!targetUrl.sourceRegion().equals("")){
if (!targetUrl.sourceRegion().equals("")) {
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
}
}
......@@ -126,12 +128,17 @@ class PageModelExtractor {
HelpUrl helpUrl = (HelpUrl) annotation;
String[] value = helpUrl.value();
for (String s : value) {
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
}
if (!helpUrl.sourceRegion().equals("")){
if (!helpUrl.sourceRegion().equals("")) {
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
}
public Object process(Page page) {
......@@ -144,6 +151,28 @@ class PageModelExtractor {
if (!matched) {
return null;
}
if (extractor == null) {
return processSingle(page,page.getHtml().toString());
} else {
if (extractor.multi){
List<Object> os = new ArrayList<Object>();
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) {
Object o = processSingle(page, s);
if (o!=null){
os.add(o);
}
}
return os;
}else {
String select = extractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select);
return o;
}
}
}
private Object processSingle(Page page,String html) {
Object o = null;
try {
o = clazz.newInstance();
......@@ -152,38 +181,38 @@ class PageModelExtractor {
List<String> value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
value = fieldExtractor.getSelector().selectList(html);
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
value = fieldExtractor.getSelector().selectList(html);
}
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
return null;
}
setField(o, fieldExtractor, value);
} else {
String value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
value = fieldExtractor.getSelector().select(html);
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
value = fieldExtractor.getSelector().select(html);
}
if (value == null && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
return null;
}
setField(o, fieldExtractor, value);
}
}
if (AfterExtractor.class.isAssignableFrom(clazz)) {
((AfterExtractor)o).afterProcess(page);
((AfterExtractor) o).afterProcess(page);
}
} catch (InstantiationException e) {
e.printStackTrace();
......
package us.codecraft.webmagic.oo.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.*;
/**
 * Sample page model that extracts the answers of an oschina.net question page.
 *
 * <p>The class-level {@code @ExtractBy} is declared with {@code multi = true}, so every
 * region matched by its XPath is turned into a separate {@code OschinaAnswer} instance,
 * and the field-level expressions are evaluated against that region.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-3 <br>
 * Time: 8:25 PM <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor {

    /** Value selected by {@code //img/@title} inside the answer region. */
    @ExtractBy("//img/@title")
    private String user;

    /** Value selected by {@code //div[@class='detail']}; declared with {@code notNull = false}. */
    @ExtractBy(value = "//div[@class='detail']", notNull = false)
    private String content;

    /**
     * Runs the sample spider against a single oschina.net question page.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        OOSpider
                .create(
                        Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"),
                        OschinaAnswer.class)
                .run();
    }

    /**
     * Post-extraction hook required by {@link AfterExtractor}; this sample needs no
     * additional processing, so the body is intentionally empty.
     *
     * @param page the page the instance was extracted from
     */
    @Override
    public void afterProcess(Page page) {
        // nothing to do
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment