Commit 65518f76 authored by yihua.huang

add list support

parent d4de60a5
......@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
* Time: 下午8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
@Target({ElementType.FIELD,ElementType.TYPE})
public @interface ExtractBy {
//TODO: add list support
String value();
public enum Type {XPath, Regex, Css};
public enum Type {XPath2, XPath, Regex, Css}
Type type() default Type.XPath;
Type type() default Type.XPath2;
boolean notNull() default true;
boolean multi() default false;
}
......@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
boolean notNull() default true;
boolean multi() default false;
}
package us.codecraft.webmagic.oo;
import us.codecraft.webmagic.selector.Selector;
/**
 * Immutable holder for the settings of one extraction rule: the selector to
 * apply, the part of the page it is applied to, whether a result is required,
 * and whether a list of results is expected.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class Extractor {

    /** Part of the page the selector is run against. */
    enum Source {Html, Url}

    // The fields stay protected so that subclasses and same-package code that
    // already read them directly keep compiling.
    protected final Selector selector;
    protected final Source source;
    protected final boolean notNull;
    protected final boolean multi;

    /**
     * Creates an extractor description.
     *
     * @param selector selector used to pull the value out of the source text
     * @param source   which part of the page the selector is applied to
     * @param notNull  whether a missing result is considered a failure
     * @param multi    whether the selector should produce a list of values
     *                 instead of a single value
     */
    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
        this.selector = selector;
        this.source = source;
        this.notNull = notNull;
        this.multi = multi;
    }

    /** @return the selector given at construction time */
    Selector getSelector() {
        return selector;
    }

    /** @return the page part the selector is applied to */
    Source getSource() {
        return source;
    }

    /** @return {@code true} if a result is required */
    boolean isNotNull() {
        return notNull;
    }

    /**
     * Accessor for the {@code multi} flag, so callers do not need to read the
     * field directly.
     *
     * @return {@code true} if a list of values is expected
     */
    boolean isMulti() {
        return multi;
    }
}
......@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
*/
class FieldExtractor {
class FieldExtractor extends Extractor{
private final Field field;
private final Selector selector;
private final Source source;
private Method setterMethod;
private final boolean notNull;
static enum Source {Html, Url}
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
super(selector, source, notNull,multi);
this.field = field;
this.selector = selector;
this.source = source;
this.notNull = notNull;
}
Field getField() {
......
......@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
* @author code4crafter@gmail.com <br>
......@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
return this;
}
public Spider pipeline(Pipeline pipeline) {
throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
}
}
......@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.CssSelector;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import us.codecraft.webmagic.selector.*;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
......@@ -42,20 +39,22 @@ class PageModelExtractor {
this.clazz = clazz;
initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>();
if (clazz.isAssignableFrom(AfterExtractor.class)){
if (clazz.isAssignableFrom(AfterExtractor.class)) {
try {
afterExtractor=(AfterExtractor)clazz.newInstance();
afterExtractor = (AfterExtractor) clazz.newInstance();
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
if (!field.getType().isAssignableFrom(String.class)){
throw new IllegalStateException("Field "+field.getName()+" must be string");
}
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String value = extractBy.value();
Selector selector;
switch (extractBy.type()) {
......@@ -68,10 +67,13 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default:
selector = new XpathSelector(value);
selector = new Xpath2Selector(value);
}
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
......@@ -80,11 +82,16 @@ class PageModelExtractor {
}
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) {
if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
......@@ -138,24 +145,42 @@ class PageModelExtractor {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
String value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
}
if (value==null&&fieldExtractor.isNotNull()){
page.getResultItems().setSkip(true);
if (fieldExtractor.multi) {
List<String> value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
}
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
} else {
String value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
}
if (value == null && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
}
setField(o, fieldExtractor, value);
}
setField(o, fieldExtractor, value);
}
if (afterExtractor!=null){
afterExtractor.afterProcess(page,o);
if (afterExtractor != null) {
afterExtractor.afterProcess(page, o);
}
} catch (InstantiationException e) {
e.printStackTrace();
......@@ -167,7 +192,7 @@ class PageModelExtractor {
return o;
}
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value);
}
......
package us.codecraft.webmagic.oo;
import java.util.List;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-8-1 <br>
......@@ -11,7 +13,10 @@ public class OschinaBlog {
@ExtractBy("//title")
private String title;
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
private String content;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
}
package us.codecraft.webmagic.oo;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Site;
......@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
*/
public class TestFetcher {
@Ignore("takes long")
// @Ignore("takes long")
@Test
public void test() {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment