Commit 14562855 authored by yihua.huang's avatar yihua.huang

Update the AfterExtractor API

parent aca165b1
......@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
* @date: 13-8-3 <br>
* Time: 上午9:42 <br>
*/
public interface AfterExtractor<T> {
public interface AfterExtractor {
public void afterProcess(Page page, T t);
public void afterProcess(Page page);
}
......@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
......@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
}
public ObjectPageProcessor addPageModel(Class clazz){
public ObjectPageProcessor addPageModel(Class clazz) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
......@@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
if (process==null){
if (process == null) {
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
}
for (String link : page.getHtml().links().all()) {
for (Pattern targetUrlPattern : targetUrlPatterns) {
if (targetUrlPattern.matcher(link).matches()){
page.addTargetRequest(new Request(link));
}
/**
 * Scans candidate link strings from the page and queues a target request for
 * every string that matches one of the given URL patterns.
 *
 * @param page              the page being processed; matching URLs are added to it as target requests
 * @param urlRegionSelector selector restricting where candidates are taken from; when {@code null},
 *                          all links of the page are used instead
 * @param urlPatterns       patterns a candidate must match (via {@link Matcher#find()}) to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> links;
    if (urlRegionSelector == null) {
        links = page.getHtml().links().all();
    } else {
        links = urlRegionSelector.selectList(page.getHtml().toString());
    }
    // NOTE(review): it is not visible here whether a selector may return null; guard defensively.
    if (links == null) {
        return;
    }
    for (String link : links) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher matcher = urlPattern.matcher(link);
            if (matcher.find()) {
                // Annotation-derived patterns are wrapped in a capturing group, but the
                // fallback ".*" pattern is not; group(1) on a pattern without groups
                // throws IndexOutOfBoundsException, so fall back to the whole match.
                String url = matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
                page.addTargetRequest(new Request(url));
            }
        }
    }
}
protected void postProcessPageModel(Class clazz, Object object){
protected void postProcessPageModel(Class clazz, Object object) {
}
@Override
......
......@@ -21,14 +21,16 @@ class PageModelExtractor {
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
private Selector targetUrlRegionSelector;
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
private Selector helpUrlRegionSelector;
private Class clazz;
private List<FieldExtractor> fieldExtractors;
private AfterExtractor afterExtractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz);
......@@ -39,13 +41,6 @@ class PageModelExtractor {
this.clazz = clazz;
initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>();
if (AfterExtractor.class.isAssignableFrom(clazz)) {
try {
afterExtractor = (AfterExtractor) clazz.newInstance();
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
......@@ -117,16 +112,24 @@ class PageModelExtractor {
if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*"));
} else {
String[] value = ((TargetUrl) annotation).value();
TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value();
for (String s : value) {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
}
if (!targetUrl.sourceRegion().equals("")){
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) {
String[] value = ((HelpUrl) annotation).value();
HelpUrl helpUrl = (HelpUrl) annotation;
String[] value = helpUrl.value();
for (String s : value) {
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
}
if (!helpUrl.sourceRegion().equals("")){
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
}
}
}
......@@ -179,8 +182,8 @@ class PageModelExtractor {
setField(o, fieldExtractor, value);
}
}
if (afterExtractor != null) {
afterExtractor.afterProcess(page, o);
if (AfterExtractor.class.isAssignableFrom(clazz)) {
((AfterExtractor)o).afterProcess(page);
}
} catch (InstantiationException e) {
e.printStackTrace();
......@@ -210,4 +213,12 @@ class PageModelExtractor {
/**
 * Returns the list of compiled help-URL patterns held by this extractor.
 *
 * @return the internal pattern list itself (not a copy)
 */
List<Pattern> getHelpUrlPatterns() {
    return this.helpUrlPatterns;
}
/**
 * Returns the selector that limits where target URLs are looked for.
 *
 * @return the target-URL region selector, or {@code null} if none was configured
 */
Selector getTargetUrlRegionSelector() {
    return this.targetUrlRegionSelector;
}
/**
 * Returns the selector that limits where help URLs are looked for.
 *
 * @return the help-URL region selector, or {@code null} if none was configured
 */
Selector getHelpUrlRegionSelector() {
    return this.helpUrlRegionSelector;
}
}
......@@ -9,8 +9,8 @@ import java.util.List;
* @date: 13-8-1 <br>
* Time: 下午10:18 <br>
*/
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class OschinaBlog implements AfterExtractor<OschinaBlog> {
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']")
public class OschinaBlog implements AfterExtractor {
@ExtractBy("//title")
private String title;
......@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
private List<String> tags;
@Override
public void afterProcess(Page page, OschinaBlog oschinaBlog) {
public void afterProcess(Page page) {
content = null;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment