Commit 7a4dbb1f authored by yihua.huang's avatar yihua.huang

invite notnull

parent 06a39af0
...@@ -2,6 +2,7 @@ package us.codecraft.webmagic; ...@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.annotation.ObjectPageProcessor;
import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Destroyable;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.downloader.HttpClientDownloader;
...@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task { ...@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor); return new Spider(pageProcessor);
} }
/**
 * Creates a spider driven by page model classes instead of a hand-written page processor.
 * <p>
 * Delegates to {@code ObjectPageProcessor.create(site, pageModels)} and wraps the
 * resulting processor in a new {@code Spider}.
 *
 * @param site       site configuration passed through to the object page processor
 * @param pageModels page model classes passed through to the object page processor
 *                   (presumably classes annotated with e.g. {@code @TargetUrl} — confirm
 *                   against {@code ObjectPageProcessor})
 * @return a new spider that uses the created processor
 */
public static Spider create(Site site,Class... pageModels) {
return new Spider(ObjectPageProcessor.create(site,pageModels));
}
/** /**
* 重新设置startUrls,会覆盖Site本身的startUrls。 * 重新设置startUrls,会覆盖Site本身的startUrls。
* *
......
...@@ -18,4 +18,6 @@ public @interface ExtractBy { ...@@ -18,4 +18,6 @@ public @interface ExtractBy {
public enum Type {XPath, Regex, Css}; public enum Type {XPath, Regex, Css};
Type type() default Type.XPath; Type type() default Type.XPath;
boolean notNull() default true;
} }
...@@ -11,8 +11,10 @@ import java.lang.annotation.Target; ...@@ -11,8 +11,10 @@ import java.lang.annotation.Target;
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD}) @Target({ElementType.FIELD})
public @interface ExtractByUrl { public @interface ExtractByUrl{
String value() default ""; String value() default "";
boolean notNull() default true;
} }
...@@ -20,18 +20,15 @@ class FieldExtractor { ...@@ -20,18 +20,15 @@ class FieldExtractor {
private Method setterMethod; private Method setterMethod;
static enum Source {Html, Url} private final boolean notNull;
public FieldExtractor(Field field, Selector selector) { static enum Source {Html, Url}
this.field = field;
this.selector = selector;
this.source = Source.Html;
}
public FieldExtractor(Field field, Selector selector, Source source) { public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
this.field = field; this.field = field;
this.selector = selector; this.selector = selector;
this.source = source; this.source = source;
this.notNull = notNull;
} }
Field getField() { Field getField() {
...@@ -53,4 +50,8 @@ class FieldExtractor { ...@@ -53,4 +50,8 @@ class FieldExtractor {
Method getSetterMethod() { Method getSetterMethod() {
return setterMethod; return setterMethod;
} }
/**
 * Reports the not-null flag this extractor was constructed with.
 *
 * @return the value of the {@code notNull} field
 */
boolean isNotNull() {
    return this.notNull;
}
} }
package us.codecraft.webmagic.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Type-level annotation that carries one or more help URL strings for the annotated class.
 * <p>
 * The annotation is retained at runtime, so it can be read reflectively from the class.
 * NOTE(review): how the strings are interpreted (wildcards, escaping) is decided by the
 * code that reads this annotation, not here — confirm against the reader.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface HelpUrl {

    /**
     * The help URL strings; mandatory because no default is declared.
     *
     * @return the configured help URL strings
     */
    String[] value();

}
...@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor { ...@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
targetUrlPatterns = new HashSet<Pattern>(); targetUrlPatterns = new HashSet<Pattern>();
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
} }
} }
...@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor { ...@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page); Object process = pageModelExtractor.process(page);
if (process==null){
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process); postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
} }
......
...@@ -24,6 +24,8 @@ class PageModelExtractor { ...@@ -24,6 +24,8 @@ class PageModelExtractor {
private List<Pattern> targetUrlPatterns; private List<Pattern> targetUrlPatterns;
private List<Pattern> helpUrlPatterns;
private Class clazz; private Class clazz;
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
...@@ -57,7 +59,7 @@ class PageModelExtractor { ...@@ -57,7 +59,7 @@ class PageModelExtractor {
default: default:
selector = new XpathSelector(value); selector = new XpathSelector(value);
} }
FieldExtractor fieldExtractor = new FieldExtractor(field, selector); FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);
...@@ -70,7 +72,7 @@ class PageModelExtractor { ...@@ -70,7 +72,7 @@ class PageModelExtractor {
if (regexPattern.trim().equals("")) { if (regexPattern.trim().equals("")) {
regexPattern = ".*"; regexPattern = ".*";
} }
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url); FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);
...@@ -102,6 +104,14 @@ class PageModelExtractor { ...@@ -102,6 +104,14 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
} }
} }
helpUrlPatterns = new ArrayList<Pattern>();
annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) {
String[] value = ((HelpUrl) annotation).value();
for (String s : value) {
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
}
} }
public Object process(Page page) { public Object process(Page page) {
...@@ -129,7 +139,10 @@ class PageModelExtractor { ...@@ -129,7 +139,10 @@ class PageModelExtractor {
default: default:
value = fieldExtractor.getSelector().select(page.getHtml().toString()); value = fieldExtractor.getSelector().select(page.getHtml().toString());
} }
setField(o,fieldExtractor,value); if (value==null&&fieldExtractor.isNotNull()){
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
...@@ -142,8 +155,8 @@ class PageModelExtractor { ...@@ -142,8 +155,8 @@ class PageModelExtractor {
} }
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getSetterMethod()!=null){ if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o,value); fieldExtractor.getSetterMethod().invoke(o, value);
} }
fieldExtractor.getField().set(o, value); fieldExtractor.getField().set(o, value);
} }
...@@ -155,4 +168,8 @@ class PageModelExtractor { ...@@ -155,4 +168,8 @@ class PageModelExtractor {
List<Pattern> getTargetUrlPatterns() { List<Pattern> getTargetUrlPatterns() {
return targetUrlPatterns; return targetUrlPatterns;
} }
/**
 * Exposes the help URL patterns held by this extractor.
 *
 * @return the {@code helpUrlPatterns} list itself (not a copy)
 */
List<Pattern> getHelpUrlPatterns() {
    return this.helpUrlPatterns;
}
} }
package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.TargetUrl;
/**
 * Sample page model for blog posts under {@code http://dengminhui.iteye.com/blog/}.
 * <p>
 * The title is extracted with the expression {@code //title} (the annotation's default
 * extraction type) and the content with the CSS selector {@code div#blog_content}.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://dengminhui.iteye.com/blog/*")
public class IteyeBlog {

    /** Value extracted from the page via {@code //title}. */
    @ExtractBy("//title")
    private String title;

    /** Value extracted from the page via the CSS selector {@code div#blog_content}. */
    @ExtractBy(type = ExtractBy.Type.Css, value = "div#blog_content")
    private String content;

    /**
     * Renders both extracted fields in the form {@code IteyeBlog{title='...', content='...'}}.
     *
     * @return a textual representation of this blog entry
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("IteyeBlog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }

    /**
     * Runs a spider for this page model, starting from the blog index page.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://dengminhui.iteye.com/blog");
        Spider spider = Spider.create(site, IteyeBlog.class);
        spider.run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment