Commit 866ab0a0 authored by yihua.huang

update email

parent 7c9e9ce8
......@@ -5,7 +5,7 @@ import java.util.Map;
/**
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-7-25 <br>
* Time: 下午12:20 <br>
*/
......
......@@ -90,10 +90,6 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor);
}
public static Spider create(Site site,Class... pageModels) {
return new Spider(ObjectPageProcessor.create(site,pageModels));
}
/**
* 重新设置startUrls,会覆盖Site本身的startUrls。
*
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Page;
/**
 * Implement this interface to perform post-processing after extraction.<br>
 *
 * @param <T> type of the object passed to {@code afterProcess}
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:42 AM <br>
 */
public interface AfterExtractor<T> {

    /**
     * Post-processing callback.
     *
     * @param page the page being processed
     * @param t    the object to post-process
     */
    void afterProcess(Page page, T t);
}
......@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午8:40 <br>
*/
......
......@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午8:40 <br>
*/
......
......@@ -6,7 +6,7 @@ import java.lang.reflect.Field;
import java.lang.reflect.Method;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
*/
......
......@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午8:40 <br>
*/
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * A {@link Spider} whose {@link #create(Site, Class[])} factory wires in an
 * {@code ObjectPageProcessor} and an {@code ObjectPipeline}.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:51 AM <br>
 */
public class OOSpider extends Spider {

    /**
     * Creates a new spider that uses the given, already-defined extraction rules.
     *
     * @param pageProcessor the already-defined extraction rules
     */
    public OOSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    /**
     * Builds an {@code OOSpider} for the given site and page-model classes.
     * The page processor comes from {@code ObjectPageProcessor.create(site, pageModels)},
     * and a fresh {@code ObjectPipeline} is registered on the spider before it is returned.
     *
     * @param site       site configuration handed to the page processor
     * @param pageModels page-model classes handed to the page processor
     * @return the configured spider
     */
    public static OOSpider create(Site site,Class... pageModels) {
        final PageProcessor processor = ObjectPageProcessor.create(site, pageModels);
        final OOSpider spider = new OOSpider(processor);
        spider.pipeline(new ObjectPipeline());
        return spider;
    }
}
......@@ -12,7 +12,7 @@ import java.util.Set;
import java.util.regex.Pattern;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午8:46 <br>
*/
......
......@@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* Pipeline that hands objects found in a {@code ResultItems} to the
* {@code PageModelPipeline} registered for their class.
* <p>
* NOTE(review): the body below appears to interleave the removed and added sides of a diff
* (two {@code process} headers, a {@code read()} stub next to {@code put}); confirm against
* the real source file before relying on it.
*
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-2 <br>
* Time: 10:47 AM <br>
*/
public class ObjectPipeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
// Registered pipelines, keyed by the class they were registered for.
private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();
public ObjectPipeline() {
}
public <T> T read() {
return null;
/**
* Registers {@code pageModelPipeline} under {@code clazz} and returns this instance
* so calls can be chained.
*/
public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
pageModelPipelines.put(clazz, pageModelPipeline);
return this;
}
/**
* Returns immediately when {@code resultItems.isSkip()} is true. Otherwise, for each
* registered class, looks up {@code resultItems} by that class's canonical name and,
* when a non-null value is found, passes it with {@code task} to the registered pipeline.
*/
@Override
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
// Lookup key is the canonical class name — presumably the key the extraction side stores under; verify.
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) {
classPageModelPipelineEntry.getValue().process(o, task);
}
}
}
}
......@@ -16,7 +16,7 @@ import java.util.List;
import java.util.regex.Pattern;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午9:33 <br>
*/
......@@ -30,6 +30,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors;
private AfterExtractor afterExtractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz);
......@@ -40,6 +42,13 @@ class PageModelExtractor {
this.clazz = clazz;
initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>();
// Create the post-extraction hook only when the page model implements AfterExtractor.
// The interface must be the receiver: X.class.isAssignableFrom(c) asks "can a c be
// used as an X?". The reversed test is true only when clazz is AfterExtractor itself
// or one of its supertypes, so it never matched an implementing model class.
if (AfterExtractor.class.isAssignableFrom(clazz)) {
    try {
        afterExtractor = (AfterExtractor) clazz.newInstance();
    } catch (Exception e) {
        // Keep the cause and name the class that could not be instantiated.
        throw new IllegalArgumentException("cannot instantiate AfterExtractor " + clazz, e);
    }
}
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
if (!field.getType().isAssignableFrom(String.class)){
......@@ -147,6 +156,9 @@ class PageModelExtractor {
}
setField(o, fieldExtractor, value);
}
if (afterExtractor!=null){
afterExtractor.afterProcess(page,o);
}
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {
......
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Task;
/**
 * Callback that receives an object of type {@code T} together with a {@link Task}.
 *
 * @param <T> type of the object passed to {@code process}
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:34 AM <br>
 */
public interface PageModelPipeline<T> {

    /**
     * Processes one object.
     *
     * @param t    the object to process
     * @param task the task passed along with the object
     */
    void process(T t, Task task);
}
......@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 下午8:40 <br>
*/
......@@ -14,4 +14,5 @@ import java.lang.annotation.Target;
public @interface TargetUrl {
String[] value();
}
......@@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader;
/**
* 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br>
* @author yihua.huang@dianping.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-7-26 <br>
* Time: 下午3:10 <br>
*/
......
......@@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/**
* @author yihua.huang@dianping.com <br>
......@@ -16,12 +15,9 @@ public class TestFetcher {
/**
* Builds an {@code ObjectPipeline} and attaches it to a spider whose start URL is a single
* oschina blog post, extracting {@code OschinaBlog}.
* <p>
* NOTE(review): this body appears to contain both sides of a diff — a {@code Spider.create}
* chain ending in {@code runAsync()} followed by an {@code OOSpider.create} chain, plus a
* loop over {@code objectPipeline.read()}; confirm which statements belong to the current
* version.
*/
@Test
public void test() {
ObjectPipeline objectPipeline = new ObjectPipeline();
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class))
.pipeline(objectPipeline).runAsync();
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
.pipeline(objectPipeline);
OschinaBlog oschinaBlog = null;
// Prints every object returned by read() until it returns null.
while ((oschinaBlog = objectPipeline.read()) != null) {
System.out.println(oschinaBlog);
}
}
......
package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl;
/**
......@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
}
/**
* Sample entry point: runs a spider for {@code IteyeBlog} starting from
* http://dengminhui.iteye.com/blog.
* <p>
* NOTE(review): both a {@code Spider.create} and an {@code OOSpider.create} call appear here;
* this looks like the before/after of a diff — confirm that only one is intended.
*/
public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run();
OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run();
}
public String getTitle() {
......
package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl;
/**
......@@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{
}
/**
* Sample entry point: runs a spider for {@code OschinaBlog} starting from
* http://my.oschina.net/flashsword/blog.
* <p>
* NOTE(review): both a {@code Spider.create} and an {@code OOSpider.create} call appear here;
* this looks like the before/after of a diff — confirm that only one is intended.
*/
public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run();
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run();
}
public String getTitle() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment