Commit 14562855 authored by yihua.huang

Update AfterExtractor API

parent aca165b1
...@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page; ...@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
* @date: 13-8-3 <br> * @date: 13-8-3 <br>
* Time: 上午9:42 <br> * Time: 上午9:42 <br>
*/ */
public interface AfterExtractor<T> { public interface AfterExtractor {
public void afterProcess(Page page, T t); public void afterProcess(Page page);
} }
...@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page; ...@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
...@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor { ...@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
} }
public ObjectPageProcessor addPageModel(Class clazz){ public ObjectPageProcessor addPageModel(Class clazz) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
...@@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor { ...@@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page); Object process = pageModelExtractor.process(page);
if (process==null){ if (process == null) {
page.getResultItems().setSkip(true); page.getResultItems().setSkip(true);
} }
postProcessPageModel(pageModelExtractor.getClazz(), process); postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
} }
for (String link : page.getHtml().links().all()) { }
for (Pattern targetUrlPattern : targetUrlPatterns) {
if (targetUrlPattern.matcher(link).matches()){ private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
page.addTargetRequest(new Request(link)); List<String> links;
if (urlRegionSelector == null) {
links = page.getHtml().links().all();
} else {
links = urlRegionSelector.selectList(page.getHtml().toString());
}
for (String link : links) {
for (Pattern targetUrlPattern : urlPatterns) {
Matcher matcher = targetUrlPattern.matcher(link);
if (matcher.find()) {
page.addTargetRequest(new Request(matcher.group(1)));
} }
} }
} }
} }
protected void postProcessPageModel(Class clazz, Object object){ protected void postProcessPageModel(Class clazz, Object object) {
} }
@Override @Override
......
...@@ -21,14 +21,16 @@ class PageModelExtractor { ...@@ -21,14 +21,16 @@ class PageModelExtractor {
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>(); private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
private Selector targetUrlRegionSelector;
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>(); private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
private Selector helpUrlRegionSelector;
private Class clazz; private Class clazz;
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
private AfterExtractor afterExtractor;
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz); pageModelExtractor.init(clazz);
...@@ -39,13 +41,6 @@ class PageModelExtractor { ...@@ -39,13 +41,6 @@ class PageModelExtractor {
this.clazz = clazz; this.clazz = clazz;
initTargetUrlPatterns(); initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
if (AfterExtractor.class.isAssignableFrom(clazz)) {
try {
afterExtractor = (AfterExtractor) clazz.newInstance();
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class); ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
...@@ -117,16 +112,24 @@ class PageModelExtractor { ...@@ -117,16 +112,24 @@ class PageModelExtractor {
if (annotation == null) { if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*")); targetUrlPatterns.add(Pattern.compile(".*"));
} else { } else {
String[] value = ((TargetUrl) annotation).value(); TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value();
for (String s : value) { for (String s : value) {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
}
if (!targetUrl.sourceRegion().equals("")){
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
} }
} }
annotation = clazz.getAnnotation(HelpUrl.class); annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) { if (annotation != null) {
String[] value = ((HelpUrl) annotation).value(); HelpUrl helpUrl = (HelpUrl) annotation;
String[] value = helpUrl.value();
for (String s : value) { for (String s : value) {
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
}
if (!helpUrl.sourceRegion().equals("")){
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
} }
} }
} }
...@@ -179,8 +182,8 @@ class PageModelExtractor { ...@@ -179,8 +182,8 @@ class PageModelExtractor {
setField(o, fieldExtractor, value); setField(o, fieldExtractor, value);
} }
} }
if (afterExtractor != null) { if (AfterExtractor.class.isAssignableFrom(clazz)) {
afterExtractor.afterProcess(page, o); ((AfterExtractor)o).afterProcess(page);
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
...@@ -210,4 +213,12 @@ class PageModelExtractor { ...@@ -210,4 +213,12 @@ class PageModelExtractor {
List<Pattern> getHelpUrlPatterns() { List<Pattern> getHelpUrlPatterns() {
return helpUrlPatterns; return helpUrlPatterns;
} }
Selector getTargetUrlRegionSelector() {
return targetUrlRegionSelector;
}
Selector getHelpUrlRegionSelector() {
return helpUrlRegionSelector;
}
} }
...@@ -9,8 +9,8 @@ import java.util.List; ...@@ -9,8 +9,8 @@ import java.util.List;
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午10:18 <br> * Time: 下午10:18 <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/*") @TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']")
public class OschinaBlog implements AfterExtractor<OschinaBlog> { public class OschinaBlog implements AfterExtractor {
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
...@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> { ...@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
private List<String> tags; private List<String> tags;
@Override @Override
public void afterProcess(Page page, OschinaBlog oschinaBlog) { public void afterProcess(Page page) {
content = null; content = null;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment