Commit 6e32a19f authored by yihua.huang

update api for direct download

parent 807aefe9
......@@ -6,6 +6,7 @@ import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
......@@ -424,21 +425,25 @@ public class Spider implements Runnable, Task {
* @param urls
* @return
*/
public List<ResultItems> getAll(Collection<String> urls) {
public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false;
spawnUrl = false;
startRequests = UrlUtils.convertToRequests(urls);
CollectorPipeline collectorPipeline = new CollectorPipeline();
CollectorPipeline collectorPipeline = getCollectorPipeline();
pipelines.add(collectorPipeline);
run();
spawnUrl = true;
destroyWhenExit = true;
return collectorPipeline.getCollector();
return collectorPipeline.getCollected();
}
public ResultItems get(String url) {
/**
 * Creates the pipeline that {@code getAll} registers to gather results.
 * Subclasses may override this to collect a different result type.
 *
 * @return a new {@link ResultItemsCollectorPipeline}
 */
protected CollectorPipeline getCollectorPipeline() {
    CollectorPipeline resultItemsCollector = new ResultItemsCollectorPipeline();
    return resultItemsCollector;
}
public <T> T get(String url) {
List<String> urls = Lists.newArrayList(url);
List<ResultItems> resultItemses = getAll(urls);
List<T> resultItemses = getAll(urls);
if (resultItemses != null && resultItemses.size() > 0) {
return resultItemses.get(0);
} else {
......
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
* Pipeline that can collect and store results. <br>
* Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)}
*
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class CollectorPipeline implements Pipeline{
private List<ResultItems> collector = new ArrayList<ResultItems>();
@Override
public void process(ResultItems resultItems, Task task) {
collector.add(resultItems);
}
public interface CollectorPipeline<T> extends Pipeline {
public List<ResultItems> getCollector() {
return collector;
}
/**
* Get all results collected.
*
* @return collected results
*/
public List<T> getCollected();
}
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Collector pipeline that stores every {@link ResultItems} it processes in an
 * in-memory list, in the order they arrive.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class ResultItemsCollectorPipeline implements CollectorPipeline<ResultItems> {

    /** Backing store; never reassigned, only appended to by {@link #process}. */
    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the given result to the collected list.
     * <p>
     * Synchronized because {@link ArrayList} does not support concurrent
     * modification and a spider may be run with several threads; this also
     * matches the other collector pipelines in this project.
     *
     * @param resultItems the result to store
     * @param task        the task that produced the result (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the collected results.
     *
     * @return the live backing list (not a copy)
     */
    @Override
    public List<ResultItems> getCollected() {
        return collector;
    }
}
......@@ -38,7 +38,7 @@ public class BaiduBaikePageProcesser implements PageProcessor {
list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate,"地热发电"));
List<ResultItems> resultItemses = spider.getAll(list);
List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
for (ResultItems resultItemse : resultItemses) {
System.out.println(resultItemse.getAll());
}
......
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.direct.Param;
import java.util.ArrayList;
import java.util.List;
......@@ -13,21 +11,31 @@ import java.util.List;
* @since 0.4.0
* @author code4crafter@gmail.com
*/
public class BaiduBaike implements AfterExtractor{
public class BaiduBaike{
private String word;
@ExtractBy("//h1[@class=title]/div[@class=lemmaTitleH1]/text()")
private String name;
@ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()")
private String description;
@Override
public void afterProcess(Page page) {
public String toString() {
return "BaiduBaike{" +
"name='" + name + '\'' +
", description='" + description + '\'' +
'}';
}
public static void main(String[] args) {
List<Param> words = new ArrayList<Param>();
words.add(new Param().put("word","红烧肉"));
OOSpider.direct(words, BaiduBaike.class).thread(10).run();
List<String> list = new ArrayList<String>();
String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
list.add(String.format(urlTemplate,"水力发电"));
list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate, "地热发电"));
List<BaiduBaike> baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).<BaiduBaike>getAll(list);
System.out.println(baiduBaikes);
}
}
......@@ -41,8 +41,9 @@ public class GithubRepo implements HasKey {
private String url;
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100)
, new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run();
OOSpider.create(Site.me().setSleepTime(100)
, new ConsolePageModelPipeline(), GithubRepo.class)
.addUrl("https://github.com/code4craft").thread(10).run();
}
@Override
......
......@@ -31,8 +31,9 @@ public class OschinaBlog {
private Date date;
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run();
OOSpider.create(Site.me()
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
}
public String getTitle() {
......
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/**
* Print page model in console.<br>
......
......@@ -3,6 +3,7 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
......
......@@ -2,10 +2,12 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.direct.Param;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
/**
* The spider for page model extractor.<br>
......@@ -36,12 +38,16 @@ import java.util.Collection;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
public class OOSpider extends Spider {
public class OOSpider<T> extends Spider {
private ModelPageProcessor modelPageProcessor;
private ModelPipeline modelPipeline;
private PageModelPipeline pageModelPipeline;
private List<Class> pageModelClasses = new ArrayList<Class>();
protected OOSpider(ModelPageProcessor modelPageProcessor) {
super(modelPageProcessor);
this.modelPageProcessor = modelPageProcessor;
......@@ -62,13 +68,19 @@ public class OOSpider extends Spider {
this(ModelPageProcessor.create(site, pageModels));
this.modelPipeline = new ModelPipeline();
super.addPipeline(modelPipeline);
if (pageModelPipeline != null) {
for (Class pageModel : pageModels) {
for (Class pageModel : pageModels) {
if (pageModelPipeline != null) {
this.modelPipeline.put(pageModel, pageModelPipeline);
}
pageModelClasses.add(pageModel);
}
}
/**
 * Creates a collector for page model objects instead of raw result items.
 * The collector is built for the first page model class registered with
 * this spider.
 *
 * @return a new {@link PageModelCollectorPipeline} for the first page model class
 */
@Override
protected CollectorPipeline getCollectorPipeline() {
    Class<?> firstPageModel = pageModelClasses.get(0);
    return new PageModelCollectorPipeline<T>(firstPageModel);
}
public static OOSpider create(Site site, Class... pageModels) {
return new OOSpider(site, null, pageModels);
}
......@@ -77,34 +89,6 @@ public class OOSpider extends Spider {
return new OOSpider(site, pageModelPipeline, pageModels);
}
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
return new OOSpider(site, pageModelPipeline, pageModels);
}
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) {
return new OOSpider(null, pageModelPipeline, pageModels);
}
/**
* @since 0.3.3
* NO implement yet!
*/
public static OOSpider direct(Class... pageModels) {
return new OOSpider(null, null, pageModels);
}
public static OOSpider direct(Collection<Param> params, Class... pageModels) {
return new OOSpider(null, null, pageModels);
}
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
for (Class pageModel : pageModels) {
modelPageProcessor.addPageModel(pageModel);
......
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import java.lang.annotation.Annotation;
import java.util.List;
/**
 * Collector pipeline that pulls the extracted object(s) of one page model
 * class out of each {@link ResultItems} and stores them in a
 * {@link CollectorPageModelPipeline}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
class PageModelCollectorPipeline<T> implements CollectorPipeline<T> {

    private final CollectorPageModelPipeline<T> delegate = new CollectorPageModelPipeline<T>();

    private final Class<?> pageModelClass;

    /**
     * @param clazz the page model class whose extracted objects are collected
     */
    PageModelCollectorPipeline(Class<?> clazz) {
        this.pageModelClass = clazz;
    }

    /**
     * @return the list held by the underlying {@link CollectorPageModelPipeline}
     */
    @Override
    public List<T> getCollected() {
        return delegate.getCollected();
    }

    /**
     * Looks up the value stored under the page model class's canonical name
     * and forwards it to the underlying collector. When the class carries an
     * {@link ExtractBy} annotation with {@code multi()} set, the value is
     * treated as a list and each element is forwarded individually.
     *
     * @param resultItems results of one processed page
     * @param task        the task passed through to the underlying collector
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        Object extracted = resultItems.get(pageModelClass.getCanonicalName());
        if (extracted == null) {
            // Nothing was extracted for this page model on this page.
            return;
        }
        ExtractBy classLevelExtract = pageModelClass.getAnnotation(ExtractBy.class);
        boolean multi = classLevelExtract != null && classLevelExtract.multi();
        if (multi) {
            for (Object element : (List<Object>) extracted) {
                delegate.process((T) element, task);
            }
        } else {
            delegate.process((T) extracted, task);
        }
    }
}
......@@ -195,7 +195,7 @@ class PageModelExtractor {
private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*"));
targetUrlPatterns.add(Pattern.compile("(.*)"));
} else {
TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value();
......
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Page model pipeline that keeps every processed object in an in-memory list.
 *
 * @author code4crafter@gmail.com
 */
public class CollectorPageModelPipeline<T> implements PageModelPipeline<T> {

    private List<T> items = new ArrayList<T>();

    /**
     * Appends the given object to the collected list while holding this
     * instance's monitor.
     *
     * @param t    the page model object to store
     * @param task the originating task (not used here)
     */
    @Override
    public void process(T t, Task task) {
        synchronized (this) {
            items.add(t);
        }
    }

    /**
     * @return the live list of collected objects (not a copy)
     */
    public List<T> getCollected() {
        return items;
    }
}
......@@ -5,7 +5,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileWriter;
......
......@@ -6,7 +6,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileWriter;
......
package us.codecraft.webmagic;
import junit.framework.Assert;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/**
* @author code4crafter@gmail.com
......
......@@ -6,6 +6,7 @@ import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/**
* @author code4crafter@gmail.com <br>
......
......@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
......
......@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
......
......@@ -12,7 +12,7 @@
]]></data>
</comment>
<comment>
<key><![CDATA[us.codecraft.webmagic.model.OOSpider(us.codecraft.webmagic.Site, us.codecraft.webmagic.model.PageModelPipeline, java.lang.Class...)]]></key>
<key><![CDATA[us.codecraft.webmagic.model.OOSpider(us.codecraft.webmagic.Site, us.codecraft.webmagic.pipeline.PageModelPipeline, java.lang.Class...)]]></key>
<data><![CDATA[ 创建一个爬虫。<br>
@param site
@param pageModelPipeline
......
......@@ -4,7 +4,7 @@
<date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated>
</meta>
<comment>
<key><![CDATA[us.codecraft.webmagic.model.PageModelPipeline]]></key>
<key><![CDATA[us.codecraft.webmagic.pipeline.PageModelPipeline]]></key>
<data><![CDATA[ @author code4crafter@gmail.com <br>
Date: 13-8-3 <br>
Time: 上午9:34 <br>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment