Commit 6e32a19f authored by yihua.huang

update api for direct download

parent 807aefe9
...@@ -6,6 +6,7 @@ import org.apache.log4j.Logger; ...@@ -6,6 +6,7 @@ import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -424,21 +425,25 @@ public class Spider implements Runnable, Task { ...@@ -424,21 +425,25 @@ public class Spider implements Runnable, Task {
* @param urls * @param urls
* @return * @return
*/ */
public List<ResultItems> getAll(Collection<String> urls) { public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false; destroyWhenExit = false;
spawnUrl = false; spawnUrl = false;
startRequests = UrlUtils.convertToRequests(urls); startRequests = UrlUtils.convertToRequests(urls);
CollectorPipeline collectorPipeline = new CollectorPipeline(); CollectorPipeline collectorPipeline = getCollectorPipeline();
pipelines.add(collectorPipeline); pipelines.add(collectorPipeline);
run(); run();
spawnUrl = true; spawnUrl = true;
destroyWhenExit = true; destroyWhenExit = true;
return collectorPipeline.getCollector(); return collectorPipeline.getCollected();
} }
public ResultItems get(String url) { protected CollectorPipeline getCollectorPipeline() {
return new ResultItemsCollectorPipeline();
}
public <T> T get(String url) {
List<String> urls = Lists.newArrayList(url); List<String> urls = Lists.newArrayList(url);
List<ResultItems> resultItemses = getAll(urls); List<T> resultItemses = getAll(urls);
if (resultItemses != null && resultItemses.size() > 0) { if (resultItemses != null && resultItemses.size() > 0) {
return resultItemses.get(0); return resultItemses.get(0);
} else { } else {
......
package us.codecraft.webmagic.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* Pipeline that can collect and store results. <br>
* Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)}
*
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @since 0.4.0 * @since 0.4.0
*/ */
public class CollectorPipeline implements Pipeline{ public interface CollectorPipeline<T> extends Pipeline {
private List<ResultItems> collector = new ArrayList<ResultItems>();
@Override /**
public void process(ResultItems resultItems, Task task) { * Get all results collected.
collector.add(resultItems); *
} * @return collected results
*/
public List<ResultItems> getCollector() { public List<T> getCollected();
return collector;
}
} }
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Collector pipeline that keeps every {@link ResultItems} handed to it,
 * in the order the calls arrive.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class ResultItemsCollectorPipeline implements CollectorPipeline<ResultItems> {

    /** Backing store for the collected results; a plain ArrayList, so writes are guarded by this object's monitor. */
    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the given result to the collected list.
     * <p>
     * Synchronized to match the other collector pipelines
     * ({@code CollectorPageModelPipeline}, {@code PageModelCollectorPipeline}):
     * the backing {@link ArrayList} is not safe for concurrent {@code add} calls,
     * and a spider may be configured to run with several threads.
     *
     * @param resultItems the result to store
     * @param task        the task the result belongs to (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Get all results collected.
     *
     * @return the backing list itself (not a copy)
     */
    @Override
    public List<ResultItems> getCollected() {
        return collector;
    }
}
...@@ -38,7 +38,7 @@ public class BaiduBaikePageProcesser implements PageProcessor { ...@@ -38,7 +38,7 @@ public class BaiduBaikePageProcesser implements PageProcessor {
list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电"));
List<ResultItems> resultItemses = spider.getAll(list); List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
for (ResultItems resultItemse : resultItemses) { for (ResultItems resultItemse : resultItemses) {
System.out.println(resultItemse.getAll()); System.out.println(resultItemse.getAll());
} }
......
package us.codecraft.webmagic.example; package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.direct.Param;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -13,21 +11,31 @@ import java.util.List; ...@@ -13,21 +11,31 @@ import java.util.List;
* @since 0.4.0 * @since 0.4.0
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class BaiduBaike implements AfterExtractor{ public class BaiduBaike{
private String word; @ExtractBy("//h1[@class=title]/div[@class=lemmaTitleH1]/text()")
private String name;
@ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()")
private String description; private String description;
@Override @Override
public void afterProcess(Page page) { public String toString() {
return "BaiduBaike{" +
"name='" + name + '\'' +
", description='" + description + '\'' +
'}';
} }
public static void main(String[] args) { public static void main(String[] args) {
List<Param> words = new ArrayList<Param>(); List<String> list = new ArrayList<String>();
words.add(new Param().put("word","红烧肉")); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
OOSpider.direct(words, BaiduBaike.class).thread(10).run(); list.add(String.format(urlTemplate,"水力发电"));
list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate, "地热发电"));
List<BaiduBaike> baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).<BaiduBaike>getAll(list);
System.out.println(baiduBaikes);
} }
} }
...@@ -41,8 +41,9 @@ public class GithubRepo implements HasKey { ...@@ -41,8 +41,9 @@ public class GithubRepo implements HasKey {
private String url; private String url;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100) OOSpider.create(Site.me().setSleepTime(100)
, new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run(); , new ConsolePageModelPipeline(), GithubRepo.class)
.addUrl("https://github.com/code4craft").thread(10).run();
} }
@Override @Override
......
...@@ -31,8 +31,9 @@ public class OschinaBlog { ...@@ -31,8 +31,9 @@ public class OschinaBlog {
private Date date; private Date date;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") OOSpider.create(Site.me()
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
} }
public String getTitle() { public String getTitle() {
......
...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; ...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/** /**
* Print page model in console.<br> * Print page model in console.<br>
......
...@@ -3,6 +3,7 @@ package us.codecraft.webmagic.model; ...@@ -3,6 +3,7 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation; import java.lang.annotation.Annotation;
......
...@@ -2,10 +2,12 @@ package us.codecraft.webmagic.model; ...@@ -2,10 +2,12 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.direct.Param; import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Collection; import java.util.ArrayList;
import java.util.List;
/** /**
* The spider for page model extractor.<br> * The spider for page model extractor.<br>
...@@ -36,12 +38,16 @@ import java.util.Collection; ...@@ -36,12 +38,16 @@ import java.util.Collection;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
public class OOSpider extends Spider { public class OOSpider<T> extends Spider {
private ModelPageProcessor modelPageProcessor; private ModelPageProcessor modelPageProcessor;
private ModelPipeline modelPipeline; private ModelPipeline modelPipeline;
private PageModelPipeline pageModelPipeline;
private List<Class> pageModelClasses = new ArrayList<Class>();
protected OOSpider(ModelPageProcessor modelPageProcessor) { protected OOSpider(ModelPageProcessor modelPageProcessor) {
super(modelPageProcessor); super(modelPageProcessor);
this.modelPageProcessor = modelPageProcessor; this.modelPageProcessor = modelPageProcessor;
...@@ -62,11 +68,17 @@ public class OOSpider extends Spider { ...@@ -62,11 +68,17 @@ public class OOSpider extends Spider {
this(ModelPageProcessor.create(site, pageModels)); this(ModelPageProcessor.create(site, pageModels));
this.modelPipeline = new ModelPipeline(); this.modelPipeline = new ModelPipeline();
super.addPipeline(modelPipeline); super.addPipeline(modelPipeline);
if (pageModelPipeline != null) {
for (Class pageModel : pageModels) { for (Class pageModel : pageModels) {
if (pageModelPipeline != null) {
this.modelPipeline.put(pageModel, pageModelPipeline); this.modelPipeline.put(pageModel, pageModelPipeline);
} }
pageModelClasses.add(pageModel);
}
} }
@Override
protected CollectorPipeline getCollectorPipeline() {
return new PageModelCollectorPipeline<T>(pageModelClasses.get(0));
} }
public static OOSpider create(Site site, Class... pageModels) { public static OOSpider create(Site site, Class... pageModels) {
...@@ -77,34 +89,6 @@ public class OOSpider extends Spider { ...@@ -77,34 +89,6 @@ public class OOSpider extends Spider {
return new OOSpider(site, pageModelPipeline, pageModels); return new OOSpider(site, pageModelPipeline, pageModels);
} }
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
return new OOSpider(site, pageModelPipeline, pageModels);
}
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) {
return new OOSpider(null, pageModelPipeline, pageModels);
}
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(Class... pageModels) {
return new OOSpider(null, null, pageModels);
}
public static OOSpider direct(Collection<Param> params, Class... pageModels) {
return new OOSpider(null, null, pageModels);
}
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
for (Class pageModel : pageModels) { for (Class pageModel : pageModels) {
modelPageProcessor.addPageModel(pageModel); modelPageProcessor.addPageModel(pageModel);
......
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import java.lang.annotation.Annotation;
import java.util.List;
/**
 * Collector pipeline for page-model objects of one class.
 * <p>
 * Each incoming {@link ResultItems} is looked up under the canonical name of
 * the configured class; whatever is found is forwarded to an internal
 * {@link CollectorPageModelPipeline}, whose list is exposed through
 * {@link #getCollected()}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
class PageModelCollectorPipeline<T> implements CollectorPipeline<T> {

    /** Class whose canonical name is the lookup key in {@link ResultItems}. */
    private final Class<?> clazz;

    /** Delegate that actually stores the collected objects. */
    private final CollectorPageModelPipeline<T> classPipeline = new CollectorPageModelPipeline<T>();

    /**
     * @param clazz the page-model class to collect
     */
    PageModelCollectorPipeline(Class<?> clazz) {
        this.clazz = clazz;
    }

    /**
     * Get all results collected.
     *
     * @return the list held by the delegate pipeline
     */
    @Override
    public List<T> getCollected() {
        return classPipeline.getCollected();
    }

    /**
     * Picks the entry stored under the model class's canonical name and
     * forwards it to the delegate. Nothing happens when no such entry exists.
     * <p>
     * When the class carries an {@link ExtractBy} annotation with
     * {@code multi()} set, the entry is treated as a list and every element
     * is forwarded individually; otherwise the entry is forwarded as a single object.
     *
     * @param resultItems results of one processed page
     * @param task        the task the results belong to
     */
    @Override
    @SuppressWarnings("unchecked") // entries are stored untyped; the casts mirror how they were put in
    public synchronized void process(ResultItems resultItems, Task task) {
        Object extracted = resultItems.get(clazz.getCanonicalName());
        if (extracted == null) {
            return;
        }
        ExtractBy classLevel = clazz.getAnnotation(ExtractBy.class);
        boolean multi = classLevel != null && classLevel.multi();
        if (multi) {
            for (Object element : (List<Object>) extracted) {
                classPipeline.process((T) element, task);
            }
        } else {
            classPipeline.process((T) extracted, task);
        }
    }
}
...@@ -195,7 +195,7 @@ class PageModelExtractor { ...@@ -195,7 +195,7 @@ class PageModelExtractor {
private void initClassExtractors() { private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class); Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) { if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*")); targetUrlPatterns.add(Pattern.compile("(.*)"));
} else { } else {
TargetUrl targetUrl = (TargetUrl) annotation; TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value(); String[] value = targetUrl.value();
......
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Page-model pipeline that does nothing but remember every object passed to it.
 *
 * @author code4crafter@gmail.com
 */
public class CollectorPageModelPipeline<T> implements PageModelPipeline<T> {

    /** Objects received so far, in call order. */
    private final List<T> items = new ArrayList<T>();

    /**
     * Stores the given object.
     *
     * @param t    the extracted page-model object
     * @param task the task it belongs to (not used here)
     */
    @Override
    public synchronized void process(T t, Task task) {
        this.items.add(t);
    }

    /**
     * Get all objects collected.
     *
     * @return the backing list itself (not a copy)
     */
    public List<T> getCollected() {
        return this.items;
    }
}
...@@ -5,7 +5,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; ...@@ -5,7 +5,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.utils.FilePersistentBase; import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileWriter; import java.io.FileWriter;
......
...@@ -6,7 +6,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; ...@@ -6,7 +6,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.utils.FilePersistentBase; import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileWriter; import java.io.FileWriter;
......
package us.codecraft.webmagic.model; package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
import junit.framework.Assert; import junit.framework.Assert;
import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.pipeline.PageModelPipeline;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
......
...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.MockDownloader; ...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo; import us.codecraft.webmagic.example.GithubRepo;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples; ...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.HelpUrl;
......
...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples; ...@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
]]></data> ]]></data>
</comment> </comment>
<comment> <comment>
<key><![CDATA[us.codecraft.webmagic.model.OOSpider(us.codecraft.webmagic.Site, us.codecraft.webmagic.model.PageModelPipeline, java.lang.Class...)]]></key> <key><![CDATA[us.codecraft.webmagic.model.OOSpider(us.codecraft.webmagic.Site, us.codecraft.webmagic.pipeline.PageModelPipeline, java.lang.Class...)]]></key>
<data><![CDATA[ 创建一个爬虫。<br> <data><![CDATA[ 创建一个爬虫。<br>
@param site @param site
@param pageModelPipeline @param pageModelPipeline
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
<date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated> <date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated>
</meta> </meta>
<comment> <comment>
<key><![CDATA[us.codecraft.webmagic.model.PageModelPipeline]]></key> <key><![CDATA[us.codecraft.webmagic.pipeline.PageModelPipeline]]></key>
<data><![CDATA[ @author code4crafter@gmail.com <br> <data><![CDATA[ @author code4crafter@gmail.com <br>
Date: 13-8-3 <br> Date: 13-8-3 <br>
Time: 上午9:34 <br> Time: 上午9:34 <br>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment