Commit 866ab0a0 authored by yihua.huang's avatar yihua.huang

update email

parent 7c9e9ce8
...@@ -5,7 +5,7 @@ import java.util.Map; ...@@ -5,7 +5,7 @@ import java.util.Map;
/** /**
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br> * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-7-25 <br> * @date: 13-7-25 <br>
* Time: 下午12:20 <br> * Time: 下午12:20 <br>
*/ */
......
...@@ -90,10 +90,6 @@ public class Spider implements Runnable, Task { ...@@ -90,10 +90,6 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor); return new Spider(pageProcessor);
} }
public static Spider create(Site site,Class... pageModels) {
return new Spider(ObjectPageProcessor.create(site,pageModels));
}
/** /**
* 重新设置startUrls,会覆盖Site本身的startUrls。 * 重新设置startUrls,会覆盖Site本身的startUrls。
* *
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Page;
/**
 * Implement this interface to run post-processing after extraction.<br>
 *
 * @param <T> type of the object handed to {@code afterProcess}
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:42 AM <br>
 */
public interface AfterExtractor<T> {

    /**
     * Post-processing callback.
     *
     * @param page the page passed to the hook
     * @param t    the object passed to the hook
     */
    void afterProcess(Page page, T t);
}
...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention; ...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午8:40 <br> * Time: 下午8:40 <br>
*/ */
......
...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention; ...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午8:40 <br> * Time: 下午8:40 <br>
*/ */
......
...@@ -6,7 +6,7 @@ import java.lang.reflect.Field; ...@@ -6,7 +6,7 @@ import java.lang.reflect.Field;
import java.lang.reflect.Method; import java.lang.reflect.Method;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午9:48 <br> * Time: 下午9:48 <br>
*/ */
......
...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention; ...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午8:40 <br> * Time: 下午8:40 <br>
*/ */
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * A {@link Spider} subclass whose {@link #create(Site, Class[])} factory wires in an
 * {@code ObjectPageProcessor} and an {@code ObjectPipeline}.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:51 AM <br>
 */
public class OOSpider extends Spider {

    /**
     * Creates a new Spider using already-defined extraction rules.
     *
     * @param pageProcessor the already-defined extraction rules
     */
    public OOSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    /**
     * Builds an OOSpider for the given site and page model classes.
     * The page processor comes from {@code ObjectPageProcessor.create(site, pageModels)},
     * and a fresh {@code ObjectPipeline} is registered on the spider before it is returned.
     *
     * @param site       the site passed to {@code ObjectPageProcessor.create}
     * @param pageModels the page model classes passed to {@code ObjectPageProcessor.create}
     * @return the newly created spider with an {@code ObjectPipeline} registered
     */
    public static OOSpider create(Site site, Class... pageModels) {
        PageProcessor modelProcessor = ObjectPageProcessor.create(site, pageModels);
        OOSpider spider = new OOSpider(modelProcessor);
        spider.pipeline(new ObjectPipeline());
        return spider;
    }
}
...@@ -12,7 +12,7 @@ import java.util.Set; ...@@ -12,7 +12,7 @@ import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午8:46 <br> * Time: 下午8:46 <br>
*/ */
......
...@@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems; ...@@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-2 <br> * @date: 13-8-2 <br>
* Time: 上午10:47 <br> * Time: 上午10:47 <br>
*/ */
public class ObjectPipeline implements Pipeline { public class ObjectPipeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();
public ObjectPipeline() {
}
public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
pageModelPipelines.put(clazz, pageModelPipeline);
return this;
} }
public <T> T read() { @Override
return null; public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) {
classPageModelPipelineEntry.getValue().process(o, task);
}
}
} }
} }
...@@ -16,7 +16,7 @@ import java.util.List; ...@@ -16,7 +16,7 @@ import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午9:33 <br> * Time: 下午9:33 <br>
*/ */
...@@ -30,6 +30,8 @@ class PageModelExtractor { ...@@ -30,6 +30,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
private AfterExtractor afterExtractor;
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz); pageModelExtractor.init(clazz);
...@@ -40,6 +42,13 @@ class PageModelExtractor { ...@@ -40,6 +42,13 @@ class PageModelExtractor {
this.clazz = clazz; this.clazz = clazz;
initTargetUrlPatterns(); initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
// Install the post-processing hook only for model classes that implement AfterExtractor.
// Class#isAssignableFrom is true when the receiver is the same as, or a supertype of,
// the argument, so the interface must be the receiver and the model class the argument.
// The previous form (clazz.isAssignableFrom(AfterExtractor.class)) never matched
// implementing classes.
if (AfterExtractor.class.isAssignableFrom(clazz)) {
    try {
        // Requires an accessible no-argument constructor on the model class.
        afterExtractor = (AfterExtractor) clazz.newInstance();
    } catch (Exception e) {
        // Keep the original cause and add the offending class for easier diagnosis.
        throw new IllegalArgumentException("Cannot instantiate AfterExtractor " + clazz.getName(), e);
    }
}
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
if (!field.getType().isAssignableFrom(String.class)){ if (!field.getType().isAssignableFrom(String.class)){
...@@ -147,6 +156,9 @@ class PageModelExtractor { ...@@ -147,6 +156,9 @@ class PageModelExtractor {
} }
setField(o, fieldExtractor, value); setField(o, fieldExtractor, value);
} }
if (afterExtractor!=null){
afterExtractor.afterProcess(page,o);
}
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
} catch (IllegalAccessException e) { } catch (IllegalAccessException e) {
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Task;
/**
 * Pipeline callback that receives an object of type {@code T} together with a {@link Task}.<br>
 *
 * @param <T> type of the object handed to {@code process}
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:34 AM <br>
 */
public interface PageModelPipeline<T> {

    /**
     * Handles one object.
     *
     * @param t    the object to handle
     * @param task the task passed along with the object
     */
    void process(T t, Task task);
}
...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention; ...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午8:40 <br> * Time: 下午8:40 <br>
*/ */
...@@ -14,4 +14,5 @@ import java.lang.annotation.Target; ...@@ -14,4 +14,5 @@ import java.lang.annotation.Target;
public @interface TargetUrl { public @interface TargetUrl {
String[] value(); String[] value();
} }
...@@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader; ...@@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader;
/** /**
* 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br> * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br>
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-7-26 <br> * @date: 13-7-26 <br>
* Time: 下午3:10 <br> * Time: 下午3:10 <br>
*/ */
......
...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation; ...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
...@@ -16,12 +15,9 @@ public class TestFetcher { ...@@ -16,12 +15,9 @@ public class TestFetcher {
@Test @Test
public void test() { public void test() {
ObjectPipeline objectPipeline = new ObjectPipeline(); ObjectPipeline objectPipeline = new ObjectPipeline();
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)) OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
.pipeline(objectPipeline).runAsync(); .pipeline(objectPipeline);
OschinaBlog oschinaBlog = null; OschinaBlog oschinaBlog = null;
while ((oschinaBlog = objectPipeline.read()) != null) {
System.out.println(oschinaBlog);
}
} }
......
package us.codecraft.webmagic.annotation.samples; package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy; import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl; import us.codecraft.webmagic.annotation.TargetUrl;
/** /**
...@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ ...@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run();
} }
public String getTitle() { public String getTitle() {
......
package us.codecraft.webmagic.annotation.samples; package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy; import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl; import us.codecraft.webmagic.annotation.TargetUrl;
/** /**
...@@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{ ...@@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run();
} }
public String getTitle() { public String getTitle() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment