Commit 7a4dbb1f authored by yihua.huang

invite notnull

parent 06a39af0
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.annotation.ObjectPageProcessor;
import us.codecraft.webmagic.downloader.Destroyable;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
......@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor);
}
/**
 * Creates a spider driven by page model classes instead of a hand-written processor.
 * The site and model classes are handed to {@code ObjectPageProcessor.create}, and the
 * resulting processor is wrapped in a new {@code Spider}.
 *
 * @param site       site configuration forwarded to {@code ObjectPageProcessor.create}
 * @param pageModels page model classes forwarded to {@code ObjectPageProcessor.create}
 * @return a new spider using the object page processor built from the arguments
 */
public static Spider create(Site site,Class... pageModels) {
return new Spider(ObjectPageProcessor.create(site,pageModels));
}
/**
* 重新设置startUrls,会覆盖Site本身的startUrls。
*
......
......@@ -18,4 +18,6 @@ public @interface ExtractBy {
/** Kinds of selector expression this annotation can carry. */
public enum Type {XPath, Regex, Css};
/** Selector kind used to interpret the expression; defaults to {@link Type#XPath}. */
Type type() default Type.XPath;
/**
 * Whether the field requires a value; defaults to {@code true}.
 * The flag is passed to the field extractor; when extraction yields {@code null} for a
 * not-null field, the page's result items are marked as skipped.
 */
boolean notNull() default true;
}
......@@ -11,8 +11,10 @@ import java.lang.annotation.Target;
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractByUrl {

    /**
     * Regular expression for the field, used with a regex selector whose source is the
     * page URL rather than the HTML. Defaults to the empty string.
     * NOTE(review): the model extractor replaces a blank pattern with {@code ".*"};
     * presumably that pattern is read from this element — confirm against the caller.
     */
    String value() default "";

    /**
     * Whether the field requires a value; defaults to {@code true}.
     * The flag is passed to the field extractor; when extraction yields {@code null} for a
     * not-null field, the page's result items are marked as skipped.
     */
    boolean notNull() default true;
}
......@@ -20,18 +20,15 @@ class FieldExtractor {
private Method setterMethod;
static enum Source {Html, Url}
private final boolean notNull;
/**
 * Creates an extractor that reads from the page HTML and does not require a value.
 * Kept for callers written before the {@code notNull} flag existed; those callers had no
 * null check, so the flag is {@code false} here.
 *
 * @param field    model field to populate
 * @param selector selector that produces the field value
 */
public FieldExtractor(Field field, Selector selector) {
    this(field, selector, Source.Html, false);
}

/**
 * Creates an extractor for the given source that does not require a value.
 * Kept for callers written before the {@code notNull} flag existed.
 *
 * @param field    model field to populate
 * @param selector selector that produces the field value
 * @param source   whether the selector is applied to the HTML or to the URL
 */
public FieldExtractor(Field field, Selector selector, Source source) {
    this(field, selector, source, false);
}

/**
 * Creates a fully specified extractor.
 *
 * @param field    model field to populate
 * @param selector selector that produces the field value
 * @param source   whether the selector is applied to the HTML or to the URL
 * @param notNull  whether a {@code null} extraction result should be treated as a missing
 *                 required value (exposed through {@code isNotNull()})
 */
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
    this.field = field;
    this.selector = selector;
    this.source = source;
    this.notNull = notNull;
}
Field getField() {
......@@ -53,4 +50,8 @@ class FieldExtractor {
/**
 * Returns the setter assigned to this extractor.
 *
 * @return the stored setter method, or {@code null} when none has been assigned
 */
Method getSetterMethod() {
    return this.setterMethod;
}
/**
 * Tells whether this field was declared as requiring a value.
 *
 * @return the {@code notNull} flag supplied at construction time
 */
boolean isNotNull() {
    return this.notNull;
}
}
package us.codecraft.webmagic.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Declares helper URL patterns on a page model class.
 * <p>
 * Each value is turned into a regular expression by the model extractor: {@code .} is
 * escaped so it matches literally and {@code *} becomes a wildcard that stops at quotes
 * and {@code #}. The object page processor adds these patterns to the same set as the
 * model's target URL patterns.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface HelpUrl {

    /** The helper URL patterns; at least the element itself must be supplied (no default). */
    String[] value();
}
......@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
targetUrlPatterns = new HashSet<Pattern>();
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
}
}
......@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
if (process==null){
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
}
......
......@@ -24,6 +24,8 @@ class PageModelExtractor {
private List<Pattern> targetUrlPatterns;
private List<Pattern> helpUrlPatterns;
private Class clazz;
private List<FieldExtractor> fieldExtractors;
......@@ -57,7 +59,7 @@ class PageModelExtractor {
default:
selector = new XpathSelector(value);
}
FieldExtractor fieldExtractor = new FieldExtractor(field, selector);
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
......@@ -70,7 +72,7 @@ class PageModelExtractor {
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url);
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
......@@ -102,6 +104,14 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
}
helpUrlPatterns = new ArrayList<Pattern>();
annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) {
String[] value = ((HelpUrl) annotation).value();
for (String s : value) {
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
}
}
public Object process(Page page) {
......@@ -129,7 +139,10 @@ class PageModelExtractor {
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
}
setField(o,fieldExtractor,value);
if (value==null&&fieldExtractor.isNotNull()){
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
}
} catch (InstantiationException e) {
e.printStackTrace();
......@@ -142,8 +155,8 @@ class PageModelExtractor {
}
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getSetterMethod()!=null){
fieldExtractor.getSetterMethod().invoke(o,value);
if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value);
}
fieldExtractor.getField().set(o, value);
}
......@@ -155,4 +168,8 @@ class PageModelExtractor {
/**
 * Returns the compiled target URL patterns of this page model.
 *
 * @return the internal pattern list itself (not a copy)
 */
List<Pattern> getTargetUrlPatterns() {
    return this.targetUrlPatterns;
}
/**
 * Returns the compiled helper URL patterns of this page model.
 *
 * @return the internal pattern list itself (not a copy)
 */
List<Pattern> getHelpUrlPatterns() {
    return this.helpUrlPatterns;
}
}
package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.TargetUrl;
/**
 * Sample page model: pages matching {@code http://dengminhui.iteye.com/blog/*} are
 * mapped onto a title and a content field.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://dengminhui.iteye.com/blog/*")
public class IteyeBlog {

    /** Filled from the expression {@code //title}, using the annotation's default selector type. */
    @ExtractBy("//title")
    private String title;

    /** Filled from the CSS selector {@code div#blog_content}. */
    @ExtractBy(value = "div#blog_content", type = ExtractBy.Type.Css)
    private String content;

    /** Renders both extracted fields, each wrapped in single quotes. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("IteyeBlog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }

    /** Runs a spider over the blog index using this class as the only page model. */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://dengminhui.iteye.com/blog");
        Spider spider = Spider.create(site, IteyeBlog.class);
        spider.run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment