Commit 8f954c79 authored by yihua.huang

fix samples

parent 312e1bce
......@@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
......@@ -57,10 +57,6 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Wraps this instance in a new thread without starting it.
 *
 * @return a freshly constructed {@link Thread} whose target is this object
 */
public Thread thread() {
    Thread worker = new Thread(this);
    return worker;
}
public Spider schedular(Schedular schedular) {
this.schedular = schedular;
return this;
......@@ -74,7 +70,7 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
for (String startUrl : startUrls) {
schedular.push(new Request(startUrl), this);
}
Request request = schedular.poll(this);
......
......@@ -30,7 +30,7 @@ public class FreemarkerPipeline implements Pipeline {
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
this.template = configuration.getTemplate(template);
this.path = path;
File file = new File(path);
new File(path);
}
public FreemarkerPipeline(String template) throws IOException {
......
......@@ -13,7 +13,7 @@ import java.io.IOException;
/**
 * Smoke test for constructing a {@code FreemarkerPipeline}.
 */
public class FreemarkerPipelineTest {
// Builds a pipeline from the "wordpress.ftl" template. There are no assertions, so the
// test fails only if the constructor throws.
// NOTE(review): the two method signatures below look like the before/after lines of the
// rename in this diff hunk (test -> testTemplateLoad); only one belongs in the real file.
@Test
public void test() throws IOException {
public void testTemplateLoad() throws IOException {
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
}
}
<#--
  Template for a single <item> element; the wp:* elements and the /wordpress/ URLs suggest
  a WordPress import/export feed entry (presumably the "wordpress.ftl" template; confirm).
  Values read through FreeMarker interpolations: date, text, title.
  NOTE(review): $it.Title, $it.Id and $tags are not written as FreeMarker interpolations.
  FreeMarker only expands the dollar-brace form, so these look like they would be emitted
  literally. Confirm the intended data-model names and convert them if so.
-->
<item>
<title>$it.Title</title>
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
<description></description>
<content:encoded><![CDATA[${text}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
$tags
</item>
\ No newline at end of file
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -11,15 +12,14 @@ import java.util.List;
* Date: 13-4-21
* Time: 下午8:08
*/
public class DianpingBlogProcessor implements PageProcessor {
public class DianpingProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
List<String> requests = page.getHtml().as().rs(".*shop.*").toStrings();
page.addTargetRequests(requests);
requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
requests = page.getHtml().rs(".*search/category/.*").toStrings();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("shop")){
if (page.getUrl().toString().contains("shop")) {
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().sc());
}
......@@ -30,4 +30,9 @@ public class DianpingBlogProcessor implements PageProcessor {
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
/**
 * Sample entry point: configures a spider with a {@code DianpingProcessor} and the
 * Shanghai food listing as its start URL, then calls {@code run()} directly on the
 * calling thread.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider.me()
            .processor(new DianpingProcessor())
            .startUrl("http://www.dianping.com/shanghai/food")
            .run();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment