Commit 65518f76 authored by yihua.huang's avatar yihua.huang

add list support

parent d4de60a5
...@@ -10,16 +10,17 @@ import java.lang.annotation.Target; ...@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
* Time: 下午8:40 <br> * Time: 下午8:40 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD}) @Target({ElementType.FIELD,ElementType.TYPE})
public @interface ExtractBy { public @interface ExtractBy {
//TODO: add list support
String value(); String value();
public enum Type {XPath, Regex, Css}; public enum Type {XPath2, XPath, Regex, Css}
Type type() default Type.XPath; Type type() default Type.XPath2;
boolean notNull() default true; boolean notNull() default true;
boolean multi() default false;
} }
...@@ -17,4 +17,6 @@ public @interface ExtractByUrl{ ...@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
boolean notNull() default true; boolean notNull() default true;
boolean multi() default false;
} }
package us.codecraft.webmagic.oo;
import us.codecraft.webmagic.selector.Selector;
/**
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
*/
/**
 * Base description of a single extraction rule: which {@link Selector} to apply,
 * which part of the page it is applied to, and how the result is interpreted.
 *
 * <p>Instances are immutable; all fields are assigned once in the constructor.
 */
class Extractor {

    /** Selector that is run against the chosen source text. */
    protected final Selector selector;

    /** Part of the page the selector is applied to. */
    protected final Source source;

    /** Whether a missing result should be treated as a failed extraction by the caller. */
    protected final boolean notNull;

    /** Whether the rule yields a list of values instead of a single value. */
    protected final boolean multi;

    /** Origin of the text handed to the selector. */
    enum Source {Html, Url}

    /**
     * Creates an extraction rule.
     *
     * @param selector selector to run against the source text
     * @param source   part of the page the selector is applied to
     * @param notNull  {@code true} if a missing result must be treated as a failure
     * @param multi    {@code true} if the rule extracts a list of values
     */
    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
        this.selector = selector;
        this.source = source;
        this.notNull = notNull;
        this.multi = multi;
    }

    /**
     * @return the selector configured for this rule
     */
    Selector getSelector() {
        return selector;
    }

    /**
     * @return the part of the page this rule reads from
     */
    Source getSource() {
        return source;
    }

    /**
     * @return {@code true} if a missing result must be treated as a failure
     */
    boolean isNotNull() {
        return notNull;
    }

    /**
     * Accessor for the {@code multi} flag, matching the accessors of the other fields
     * so callers do not need to read the field directly.
     *
     * @return {@code true} if this rule extracts a list of values
     */
    boolean isMulti() {
        return multi;
    }
}
...@@ -10,25 +10,15 @@ import java.lang.reflect.Method; ...@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午9:48 <br> * Time: 下午9:48 <br>
*/ */
class FieldExtractor { class FieldExtractor extends Extractor{
private final Field field; private final Field field;
private final Selector selector;
private final Source source;
private Method setterMethod; private Method setterMethod;
private final boolean notNull; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
super(selector, source, notNull,multi);
static enum Source {Html, Url}
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
this.field = field; this.field = field;
this.selector = selector;
this.source = source;
this.notNull = notNull;
} }
Field getField() { Field getField() {
......
...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo; ...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.Pipeline;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
...@@ -50,8 +49,4 @@ public class OOSpider extends Spider { ...@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
return this; return this;
} }
public Spider pipeline(Pipeline pipeline) {
throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
}
} }
...@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo; ...@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.CssSelector; import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import java.lang.annotation.Annotation; import java.lang.annotation.Annotation;
import java.lang.reflect.Field; import java.lang.reflect.Field;
...@@ -42,20 +39,22 @@ class PageModelExtractor { ...@@ -42,20 +39,22 @@ class PageModelExtractor {
this.clazz = clazz; this.clazz = clazz;
initTargetUrlPatterns(); initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
if (clazz.isAssignableFrom(AfterExtractor.class)){ if (clazz.isAssignableFrom(AfterExtractor.class)) {
try { try {
afterExtractor=(AfterExtractor)clazz.newInstance(); afterExtractor = (AfterExtractor) clazz.newInstance();
} catch (Exception e) { } catch (Exception e) {
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
} }
} }
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
if (!field.getType().isAssignableFrom(String.class)){
throw new IllegalStateException("Field "+field.getName()+" must be string");
}
ExtractBy extractBy = field.getAnnotation(ExtractBy.class); ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) { if (extractBy != null) {
if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String value = extractBy.value(); String value = extractBy.value();
Selector selector; Selector selector;
switch (extractBy.type()) { switch (extractBy.type()) {
...@@ -68,10 +67,13 @@ class PageModelExtractor { ...@@ -68,10 +67,13 @@ class PageModelExtractor {
case XPath: case XPath:
selector = new XpathSelector(value); selector = new XpathSelector(value);
break; break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default: default:
selector = new XpathSelector(value); selector = new Xpath2Selector(value);
} }
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull()); FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);
...@@ -80,11 +82,16 @@ class PageModelExtractor { ...@@ -80,11 +82,16 @@ class PageModelExtractor {
} }
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) { if (extractByUrl != null) {
if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String regexPattern = extractByUrl.value(); String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) { if (regexPattern.trim().equals("")) {
regexPattern = ".*"; regexPattern = ".*";
} }
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull()); FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);
...@@ -138,24 +145,42 @@ class PageModelExtractor { ...@@ -138,24 +145,42 @@ class PageModelExtractor {
try { try {
o = clazz.newInstance(); o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) { for (FieldExtractor fieldExtractor : fieldExtractors) {
String value; if (fieldExtractor.multi) {
switch (fieldExtractor.getSource()) { List<String> value;
case Html: switch (fieldExtractor.getSource()) {
value = fieldExtractor.getSelector().select(page.getHtml().toString()); case Html:
break; value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
case Url: break;
value = fieldExtractor.getSelector().select(page.getUrl().toString()); case Url:
break; value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
default: break;
value = fieldExtractor.getSelector().select(page.getHtml().toString()); default:
} value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
if (value==null&&fieldExtractor.isNotNull()){ }
page.getResultItems().setSkip(true); if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
} else {
String value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
}
if (value == null && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
} }
setField(o, fieldExtractor, value);
} }
if (afterExtractor!=null){ if (afterExtractor != null) {
afterExtractor.afterProcess(page,o); afterExtractor.afterProcess(page, o);
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
...@@ -167,7 +192,7 @@ class PageModelExtractor { ...@@ -167,7 +192,7 @@ class PageModelExtractor {
return o; return o;
} }
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getSetterMethod() != null) { if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value); fieldExtractor.getSetterMethod().invoke(o, value);
} }
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.oo;
import java.util.List;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
...@@ -11,7 +13,10 @@ public class OschinaBlog { ...@@ -11,7 +13,10 @@ public class OschinaBlog {
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
private String content; private String content;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
} }
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.oo;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
...@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site; ...@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
*/ */
public class TestFetcher { public class TestFetcher {
@Ignore("takes long") // @Ignore("takes long")
@Test @Test
public void test() { public void test() {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment