Commit d7c7a781 authored by yihua.huang

complete test cases

parent c17a31a2
package us.codecraft.webmagic;
import junit.framework.Assert;
import us.codecraft.webmagic.model.PageModelPipeline;
/**
 * Test double for {@link PageModelPipeline}: the only thing it does with an
 * extracted page model is assert that it is present.
 *
 * @author code4crafter@gmail.com
 */
public class MockPageModelPipeline implements PageModelPipeline {

    /**
     * Fails the running test when the extracted object is {@code null}.
     *
     * @param o    the extracted page model handed to this pipeline
     * @param task the task the object belongs to; not used here
     */
    @Override
    public void process(Object o, Task task) {
        Assert.assertNotNull(o);
    }
}
package us.codecraft.webmagic;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * No-op {@link Pipeline} for tests: it accepts results and discards them.
 *
 * @author code4crafter@gmail.com
 */
public class MockPipeline implements Pipeline {

    /**
     * Intentionally does nothing with the results.
     *
     * @param resultItems the results produced for a page; ignored
     * @param task        the task the results belong to; ignored
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Deliberately empty.
    }
}
package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
 * Annotated page model for a GitHub repository page that also carries its own
 * JUnit test.
 *
 * <p>Each field is annotated with the expression (XPath or URL regex) its value
 * is extracted with. {@link #test()} runs the model through {@link OOSpider}
 * against a {@link MockDownloader} and checks the extracted star and fork counts.
 *
 * @author code4crafter@gmail.com <br>
 */
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
public class GithubRepo implements HasKey {

    /** Repository name; marked {@code notNull}, i.e. declared mandatory for the model. */
    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
    private String name;

    /** Owner of the repository, captured from the first path segment of the page URL. */
    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
    private String author;

    /** The element with id {@code readme}. */
    @ExtractBy("//div[@id='readme']")
    private String readme;

    /** One entry per language span in the language statistics block. */
    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
    private List<String> language;

    /** Text of the social counter in the second page-head action. */
    @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
    private String star;

    /** Text of the social counter in the third page-head action. */
    @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
    private String fork;

    /** Extracted from the page URL with the annotation's default expression. */
    @ExtractByUrl
    private String url;

    /**
     * Crawls the WebMagic repository page through a {@link MockDownloader} and
     * asserts the trimmed star and fork counts.
     *
     * <p>NOTE(review): the assertions live inside the pipeline callback, so they
     * only run if a model is actually extracted and handed to the pipeline.
     */
    @Test
    public void test() {
        final String repoUrl = "https://github.com/code4craft/webmagic";

        Site site = Site.me().addStartUrl(repoUrl).setSleepTime(0);
        PageModelPipeline<GithubRepo> countChecker = new PageModelPipeline<GithubRepo>() {
            @Override
            public void process(GithubRepo o, Task task) {
                Assert.assertEquals("78", o.getStar().trim());
                Assert.assertEquals("65", o.getFork().trim());
            }
        };

        OOSpider.create(site, countChecker, GithubRepo.class)
                .setDownloader(new MockDownloader())
                .test(repoUrl);
    }

    /**
     * @return {@code author} and {@code name} joined by a colon
     */
    @Override
    public String key() {
        return author + ":" + name;
    }

    public String getName() {
        return name;
    }

    public String getAuthor() {
        return author;
    }

    public String getReadme() {
        return readme;
    }

    public List<String> getLanguage() {
        return language;
    }

    public String getStar() {
        return star;
    }

    public String getFork() {
        return fork;
    }

    public String getUrl() {
        return url;
    }
}
package us.codecraft.webmagic.processor;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * {@link PageProcessor} that extracts the star and fork counters from a GitHub
 * repository page, bundled with a JUnit test that runs it against a
 * {@link MockDownloader}.
 *
 * @author code4crafter@gmail.com
 */
public class GithubRepoProcessor implements PageProcessor {

    /**
     * Stores the text of the two social counters under the keys {@code "star"}
     * and {@code "fork"}.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        page.putField("star", page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString());
        page.putField("fork", page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString());
    }

    /**
     * @return a site whose start URL is the WebMagic repository page
     */
    @Override
    public Site getSite() {
        return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
    }

    /**
     * Runs this processor on the WebMagic repository page via a
     * {@link MockDownloader} and asserts the trimmed star and fork counts.
     *
     * <p>NOTE(review): the assertions live inside the pipeline callback, so they
     * only run if the pipeline is actually invoked.
     */
    @Test
    public void test() {
        // This is a plain PageProcessor run, so the spider is created through
        // Spider directly rather than through the OOSpider subclass, which is
        // meant for annotated page models.
        Spider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                Assert.assertEquals("78", ((String) resultItems.get("star")).trim());
                Assert.assertEquals("65", ((String) resultItems.get("fork")).trim());
            }
        }).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
    }
}
...@@ -14,8 +14,6 @@ import java.util.Scanner; ...@@ -14,8 +14,6 @@ import java.util.Scanner;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-7 <br>
* Time: 下午9:24 <br>
*/ */
public class QuickStarter { public class QuickStarter {
......
...@@ -14,8 +14,6 @@ import java.util.List; ...@@ -14,8 +14,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-10 <br>
* Time: 下午6:37 <br>
*/ */
@TargetUrl("https://github.com/\\w+/\\w+") @TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"}) @HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
......
...@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ ...@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
} }
public String getTitle() { public String getTitle() {
......
...@@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; ...@@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-11 <br>
* Time: 下午9:29 <br>
*/ */
@TargetUrl("http://www.36kr.com/p/\\d+.html") @TargetUrl("http://www.36kr.com/p/\\d+.html")
@HelpUrl("http://www.36kr.com/#/page/\\d+") @HelpUrl("http://www.36kr.com/#/page/\\d+")
......
...@@ -16,8 +16,6 @@ import java.util.List; ...@@ -16,8 +16,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* Time: 下午8:17 <br>
*/ */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements MultiPageModel { public class News163 implements MultiPageModel {
......
...@@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; ...@@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 下午8:25 <br>
*/ */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*") @HelpUrl("http://www.oschina.net/question/*")
......
...@@ -11,8 +11,6 @@ import java.util.List; ...@@ -11,8 +11,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-2 <br>
* Time: 上午7:52 <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements HasKey{ public class OschinaBlog implements HasKey{
......
...@@ -8,8 +8,6 @@ import java.util.List; ...@@ -8,8 +8,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午8:08
*/ */
public class DiandianBlogProcessor implements PageProcessor { public class DiandianBlogProcessor implements PageProcessor {
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.PlainText;
import java.util.List;
/**
 * Sample {@link PageProcessor} for www.diaoyuweng.com: follows one user's
 * thread-list pages and the threads linked from them, and extracts title,
 * content, date and id from each thread page.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class DiaoyuwengProcessor implements PageProcessor {

    /** Lazily created by {@link #getSite()}. */
    private Site site;

    /**
     * Queues the list-page and thread links found on the page and, for thread
     * pages only, stores the extracted fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Pagination links of the user's thread list.
        List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
        page.addTargetRequests(requests);
        // Links to individual threads; the dot before "html" is escaped so it only matches a literal dot.
        requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
        page.addTargetRequests(requests);
        // List-page URLs also contain the word "thread" (do=thread, type=thread),
        // so test for the "/thread-" path prefix that only thread pages have.
        if (page.getUrl().toString().contains("/thread-")) {
            page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
            page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
            page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
            // The id is the thread number from the URL, prefixed with "1000".
            page.putField("id", new PlainText("1000" + page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString()));
        }
    }

    /**
     * Returns the site configuration, creating it on first use.
     *
     * @return the site for www.diaoyuweng.com, with GBK charset, a browser user
     *         agent and a sleep time of 500
     */
    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
                    setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
        }
        return site;
    }

    /** Runs the crawler with this processor. */
    public static void main(String[] args) {
        Spider.create(new DiaoyuwengProcessor()).run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
 * Sample {@link PageProcessor} that follows every link containing
 * {@code /yewu/} and stores each page's title and {@code dd} elements.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class F58PageProcesser implements PageProcessor {

    /**
     * Queues the matching links, then records the {@code "title"} and
     * {@code "body"} fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<String> yewuLinks = page.getHtml().links().regex(".*/yewu/.*").all();
        page.addTargetRequests(yewuLinks);

        page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
        page.putField("body", page.getHtml().xpath("//dd"));
    }

    /**
     * @return a new site configuration with a cycle-retry count of 2
     */
    @Override
    public Site getSite() {
        // NOTE(review): the domain (sh.58.com) and the start URL host
        // (sh1.51a8.com) differ - confirm this is intended.
        Site site = Site.me()
                .setDomain("sh.58.com")
                .addStartUrl("http://sh1.51a8.com/")
                .setCycleRetryTimes(2);
        return site;
    }

    /** Runs the crawler with a {@link RedisScheduler} pointed at localhost. */
    public static void main(String[] args) {
        Spider spider = Spider.create(new F58PageProcesser());
        spider.setScheduler(new RedisScheduler("localhost")).run();
    }
}
...@@ -9,8 +9,6 @@ import java.util.List; ...@@ -9,8 +9,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午8:08
*/ */
public class HuxiuProcessor implements PageProcessor { public class HuxiuProcessor implements PageProcessor {
@Override @Override
...@@ -18,13 +16,12 @@ public class HuxiuProcessor implements PageProcessor { ...@@ -18,13 +16,12 @@ public class HuxiuProcessor implements PageProcessor {
List<String> requests = page.getHtml().links().regex(".*article.*").all(); List<String> requests = page.getHtml().links().regex(".*article.*").all();
page.addTargetRequests(requests); page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
page.putField("content",page.getHtml().smartContent()); page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
} }
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
} }
public static void main(String[] args) { public static void main(String[] args) {
......
...@@ -10,8 +10,6 @@ import java.util.List; ...@@ -10,8 +10,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午8:08
*/ */
public class InfoQMiniBookProcessor implements PageProcessor { public class InfoQMiniBookProcessor implements PageProcessor {
......
...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 上午7:31 <br>
*/ */
public class IteyeBlogProcessor implements PageProcessor { public class IteyeBlogProcessor implements PageProcessor {
...@@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor { ...@@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor {
@Override @Override
public Site getSite() { public Site getSite() {
if (site == null) { if (site == null) {
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
setSleepTime(100).setRetryTimes(3);
} }
return site; return site;
} }
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample {@link PageProcessor} for kaichiba.com that walks shop pages by
 * incrementing the numeric shop id found in the current URL.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class KaichibaProcessor implements PageProcessor {

    /**
     * Queues the shop page with the next id and stores the page title and the
     * cleaned-up food titles.
     *
     * @param page the downloaded shop page; its URL must contain {@code shop/<digits>}
     */
    @Override
    public void process(Page page) {
        // Current shop id taken from the URL; the next page to visit is id + 1.
        String currentShopId = page.getUrl().regex("shop/(\\d+)").toString();
        int nextShopId = Integer.parseInt(currentShopId) + 1;
        page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);

        page.putField("title", page.getHtml().xpath("//Title"));
        // Strip leading/trailing whitespace and any <span>...</span> fragments.
        page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]")
                .replace("^\\s+", "")
                .replace("\\s+$", "")
                .replace("<span>.*?</span>", ""));
    }

    /**
     * @return a new site configuration for kaichiba.com with UTF-8 charset and a
     *         browser user agent
     */
    @Override
    public Site getSite() {
        Site site = Site.me()
                .setDomain("kaichiba.com")
                .addStartUrl("http://kaichiba.com/shop/41725781")
                .setCharset("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
        return site;
    }

    /** Runs the crawler with this processor. */
    public static void main(String[] args) {
        Spider.create(new KaichibaProcessor()).run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for meican.com: follows at most two area links
 * per page plus all restaurant links, and stores dish names and prices.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class MeicanProcessor implements PageProcessor {

    /**
     * Queues follow-up requests and records the {@code "items"} and
     * {@code "prices"} fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Area links: only the first two are followed.
        List<String> areaLinks = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
        List<String> limitedAreaLinks = areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
        page.addTargetRequests(limitedAreaLinks);

        // Restaurant links, captured up to (not including) any '#'.
        List<String> restaurantLinks = page.getHtml().links().regex("(.*/restaurant/[^#]+)").all();
        page.addTargetRequests(restaurantLinks);

        page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
        page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
    }

    /**
     * @return a new site configuration for meican.com with UTF-8 charset and a
     *         browser user agent
     */
    @Override
    public Site getSite() {
        Site site = Site.me()
                .setDomain("meican.com")
                .addStartUrl("http://www.meican.com/shanghai/districts")
                .setCharset("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
        return site;
    }

    /** Runs the crawler with this processor. */
    public static void main(String[] args) {
        Spider.create(new MeicanProcessor()).run();
    }
}
...@@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor { ...@@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor {
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
} }
} }
...@@ -9,8 +9,6 @@ import java.util.List; ...@@ -9,8 +9,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:48
*/ */
public class OschinaBlogPageProcesser implements PageProcessor { public class OschinaBlogPageProcesser implements PageProcessor {
......
...@@ -8,8 +8,6 @@ import java.util.List; ...@@ -8,8 +8,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:48
*/ */
public class OschinaPageProcesser implements PageProcessor { public class OschinaPageProcesser implements PageProcessor {
......
...@@ -8,8 +8,6 @@ import java.util.List; ...@@ -8,8 +8,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午8:08
*/ */
public class QzoneBlogProcessor implements PageProcessor { public class QzoneBlogProcessor implements PageProcessor {
@Override @Override
......
...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:48
*/ */
public class SinaBlogProcesser implements PageProcessor { public class SinaBlogProcesser implements PageProcessor {
......
...@@ -8,8 +8,6 @@ import java.util.List; ...@@ -8,8 +8,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:48
*/ */
public class TianyaPageProcesser implements PageProcessor { public class TianyaPageProcesser implements PageProcessor {
......
#!/bin/sh
# Assemble wordpress.xml in the current directory by concatenating, in order:
#   1. wp-head.xml
#   2. every other regular file in the current directory (the item files)
#   3. wp-bottom.xml
#
# All three parts are written to the same output file. The output file, the
# header/footer and this script are excluded from the item loop so they are not
# copied into the result.
# NOTE(review): assumes the item files live in the current directory next to
# wp-head.xml and wp-bottom.xml - confirm against how the items are generated.

out=wordpress.xml
self=$(basename "$0")

# '>' truncates, so re-running the script rebuilds the file instead of
# appending a second copy to an existing one.
cat wp-head.xml > "$out" || exit 1

# Glob instead of parsing `ls`, and quote the name, so file names containing
# whitespace are handled correctly.
for f in *; do
    case "$f" in
        "$out"|wp-head.xml|wp-bottom.xml|"$self") continue ;;
    esac
    # Skip directories and an unmatched literal '*'.
    [ -f "$f" ] || continue
    cat "./$f" >> "$out"
done

cat wp-bottom.xml >> "$out"
\ No newline at end of file
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<description></description>
<content:encoded><![CDATA[${content}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>${id}</wp:post_id>
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
</item>
</channel>
</rss>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="WordPress/3.3.1" created="2012-06-10 09:15" -->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.1/"
>
<channel>
<wp:wxr_version>1.1</wp:wxr_version>
<wp:base_site_url>http://127.0.0.1/wordpress</wp:base_site_url>
<wp:base_blog_url>http://127.0.0.1/wordpress</wp:base_blog_url>
<wp:author><wp:author_id>1</wp:author_id><wp:author_login>admin</wp:author_login><wp:author_email>flashsword20@163.com</wp:author_email><wp:author_display_name><![CDATA[admin]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
<generator>http://wordpress.org/?v=3.3.1</generator>
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
/**
 * Manual (ignored) test that runs {@link DiaoyuwengProcessor} with a file
 * pipeline, a JSON file pipeline and a file-backed queue scheduler.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-9
 * Time: 8:02 AM
 */
public class DiaoyuwengProcessorTest {

    /**
     * Runs the spider. JSON output goes to {@code /data/webmagic/} and the
     * scheduler is given {@code /data/temp/webmagic/cache/}; both paths are
     * hard-coded.
     *
     * @throws IOException declared by the original signature
     */
    @Ignore
    @Test
    public void test() throws IOException {
        DiaoyuwengProcessor processor = new DiaoyuwengProcessor();
        JsonFilePipeline jsonPipeline = new JsonFilePipeline("/data/webmagic/");

        Spider.create(processor)
                .pipeline(new FilePipeline())
                .pipeline(jsonPipeline)
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                .run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment