Commit 50edd22e authored by yihua.huang

add annotation

parent 7020b864
package us.codecraft.webmagic.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Field-level annotation carrying a selector expression and the kind of
 * selector that expression is written for.
 *
 * <p>The annotation is retained at runtime so it can be read reflectively.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface Fetcher {

    /**
     * Kinds of selector expression that {@link #value()} may contain.
     */
    enum Type {
        XPath,
        Regex,
        Css
    }

    /**
     * The selector expression. This element is required.
     */
    String value();

    /**
     * The kind of expression held in {@link #value()}; {@link Type#XPath}
     * when not specified.
     */
    Type type() default Type.XPath;
}
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
/**
 * Package-private holder that pairs a reflected {@link Field} with a
 * {@link Selector}. Both references are fixed at construction time and are
 * handed back unchanged by the accessors.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class FieldFetcher {

    private final Selector selector;

    private final Field field;

    /**
     * @param field    the field to associate with the selector
     * @param selector the selector to associate with the field
     */
    FieldFetcher(Field field, Selector selector) {
        this.selector = selector;
        this.field = field;
    }

    /**
     * @return the selector passed to the constructor
     */
    Selector getSelector() {
        return this.selector;
    }

    /**
     * @return the field passed to the constructor
     */
    Field getField() {
        return this.field;
    }
}
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * {@link PageProcessor} that fills annotated model classes from a page.
 *
 * <p>One {@link PageModelFetcher} is created per model class. For every
 * processed page each fetcher is run and its result is stored on the page
 * under the model class's canonical name. Afterwards every link found in the
 * page's HTML that fully matches one of the fetchers' target URL patterns is
 * queued as a new request.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:46 PM <br>
 */
public class ObjectPageProcessor implements PageProcessor {

    private final List<PageModelFetcher> pageModelFetcherList;

    private final Site site;

    /** Union of the target URL patterns of all fetchers. */
    private final Set<Pattern> targetUrlPatterns;

    /**
     * Builds a processor for the given model classes.
     *
     * @param site   the site configuration returned by {@link #getSite()}
     * @param clazzs the model classes; one fetcher is created for each
     * @return a new processor
     */
    public static ObjectPageProcessor create(Site site, Class<?>... clazzs) {
        List<PageModelFetcher> pageModelFetcherList = new ArrayList<PageModelFetcher>();
        for (Class<?> clazz : clazzs) {
            pageModelFetcherList.add(PageModelFetcher.create(clazz));
        }
        return new ObjectPageProcessor(site, pageModelFetcherList);
    }

    private ObjectPageProcessor(Site site, List<PageModelFetcher> pageModelFetcherList) {
        this.site = site;
        this.pageModelFetcherList = pageModelFetcherList;
        this.targetUrlPatterns = new HashSet<Pattern>();
        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
            this.targetUrlPatterns.addAll(pageModelFetcher.getTargetUrlPatterns());
        }
    }

    /**
     * Runs every fetcher against the page, stores each result, then queues
     * the page's links that match a target URL pattern.
     *
     * @param page the page to process
     */
    @Override
    public void process(Page page) {
        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
            // NOTE(review): the fetcher returns null when the page URL does not
            // match its patterns; that null is still stored here, as before.
            Object process = pageModelFetcher.process(page);
            page.putField(pageModelFetcher.getClazz().getCanonicalName(), process);
        }
        for (String link : page.getHtml().links().all()) {
            if (matchesAnyTargetUrl(link)) {
                page.addTargetRequest(new Request(link));
            }
        }
    }

    /**
     * Tells whether the link fully matches at least one target URL pattern.
     *
     * <p>Stopping at the first match ensures a link is queued once even when
     * several patterns accept it. {@link Pattern} instances compiled from the
     * same regex are distinct set members, so the set alone does not prevent
     * duplicates.
     */
    private boolean matchesAnyTargetUrl(String link) {
        for (Pattern targetUrlPattern : targetUrlPatterns) {
            if (targetUrlPattern.matcher(link).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the site configuration supplied to {@link #create}
     */
    @Override
    public Site getSite() {
        return site;
    }
}
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.CssSelector;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Extracts an instance of one model class from a page.
 *
 * <p>At creation time the model class is scanned for fields annotated with
 * {@link Fetcher}, and for a class-level {@link TargetUrl} annotation that
 * restricts which page URLs the model applies to.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:33 PM <br>
 */
class PageModelFetcher {

    private List<Pattern> targetUrlPatterns;

    private Class<?> clazz;

    private List<FieldFetcher> fieldFetchers;

    /**
     * Creates a fetcher for the given model class.
     *
     * @param clazz the model class to scan
     * @return an initialized fetcher
     */
    public static PageModelFetcher create(Class<?> clazz) {
        PageModelFetcher pageModelFetcher = new PageModelFetcher();
        pageModelFetcher.init(clazz);
        return pageModelFetcher;
    }

    /**
     * Records the model class, compiles its target URL patterns and builds
     * one {@link FieldFetcher} per {@link Fetcher}-annotated declared field.
     */
    private void init(Class<?> clazz) {
        this.clazz = clazz;
        initTargetUrlPatterns();
        fieldFetchers = new ArrayList<FieldFetcher>();
        for (Field field : clazz.getDeclaredFields()) {
            Fetcher fetcher = field.getAnnotation(Fetcher.class);
            if (fetcher == null) {
                // Fields without @Fetcher are not extraction targets; without
                // this guard they caused a NullPointerException.
                continue;
            }
            // Model fields are typically private; allow reflective writes.
            field.setAccessible(true);
            fieldFetchers.add(new FieldFetcher(field, createSelector(fetcher)));
        }
    }

    /**
     * Builds the selector matching the annotation's type; any type other than
     * Css or Regex yields an XPath selector.
     */
    private static Selector createSelector(Fetcher fetcher) {
        String value = fetcher.value();
        switch (fetcher.type()) {
            case Css:
                return new CssSelector(value);
            case Regex:
                return new RegexSelector(value);
            case XPath:
            default:
                return new XpathSelector(value);
        }
    }

    /**
     * Compiles the URL patterns declared by {@link TargetUrl} on the model
     * class. Without the annotation a single match-everything pattern is used.
     */
    private void initTargetUrlPatterns() {
        targetUrlPatterns = new ArrayList<Pattern>();
        TargetUrl annotation = clazz.getAnnotation(TargetUrl.class);
        if (annotation == null) {
            targetUrlPatterns.add(Pattern.compile(".*"));
        } else {
            for (String s : annotation.value()) {
                // Declared patterns use a simplified syntax: '.' is a literal dot
                // and '*' stands for any run of characters except quotes and '#'.
                targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
            }
        }
    }

    /**
     * Builds a model instance from the page.
     *
     * @param page the page to extract from
     * @return a new model instance with each annotated field set to its
     *         selector's result on the page HTML, or {@code null} when the
     *         page URL matches none of the target URL patterns or the model
     *         class cannot be instantiated
     */
    public Object process(Page page) {
        if (!matchesTargetUrl(page.getUrl().toString())) {
            return null;
        }
        Object o = null;
        try {
            o = clazz.newInstance();
            // The HTML text is the same for every field; compute it once.
            String html = page.getHtml().toString();
            for (FieldFetcher fieldFetcher : fieldFetchers) {
                fieldFetcher.getField().set(o, fieldFetcher.getSelector().select(html));
            }
        } catch (InstantiationException e) {
            // Best effort: report the failure and return null.
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            // Best effort: report the failure and return what was built so far.
            e.printStackTrace();
        }
        return o;
    }

    /**
     * Tells whether the URL fully matches at least one target URL pattern.
     */
    private boolean matchesTargetUrl(String url) {
        for (Pattern targetPattern : targetUrlPatterns) {
            if (targetPattern.matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the model class this fetcher was created for
     */
    Class<?> getClazz() {
        return clazz;
    }

    /**
     * @return the compiled target URL patterns (the internal list, not a copy)
     */
    List<Pattern> getTargetUrlPatterns() {
        return targetUrlPatterns;
    }
}
package us.codecraft.webmagic.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Type-level annotation listing one or more URL pattern strings for the
 * annotated type. The annotation is retained at runtime so it can be read
 * reflectively.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface TargetUrl {

    /**
     * The URL pattern strings. This element is required.
     */
    String[] value();
}
......@@ -8,7 +8,7 @@ import java.util.List;
* Date: 13-4-20
* Time: 下午8:02
*/
interface Selector {
public interface Selector {
public String select(String text);
......
package us.codecraft.webmagic.annotation;
/**
 * Sample model class: a blog post with a title and a content field, each
 * annotated with the selector expression used to fill it.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 10:18 PM <br>
 */
@TargetUrl("http://djjchobits.iteye.com/blog/\\d+")
public class Blog {

    /** Selected with the default selector type. */
    @Fetcher("//title")
    private String title;

    /** Selected with a CSS selector. */
    @Fetcher(value = "div#main", type = Fetcher.Type.Css)
    private String content;

    /**
     * @return a text form listing both fields, each value wrapped in single
     *         quotes
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Blog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }
}
package us.codecraft.webmagic.annotation;
import org.junit.Test;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/**
 * Runs a spider over a single blog URL using {@link ObjectPageProcessor}
 * with the {@link Blog} model. The test makes no assertions; it only
 * exercises the crawl.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:42 PM <br>
 */
public class TestFetcher {

    @Test
    public void test() {
        // Site configuration seeded with the one start URL.
        Site site = Site.me().addStartUrl("http://djjchobits.iteye.com/blog/569000");
        // Processor that extracts Blog instances from matching pages.
        ObjectPageProcessor processor = ObjectPageProcessor.create(site, Blog.class);
        Spider.create(processor).run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment