Commit 413b97c9 authored by code4craft

add meican

parent 1d870f3c
......@@ -2,11 +2,10 @@ package us.codecraft.spider;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.spider.pipeline.ConsolePipeline;
import us.codecraft.spider.pipeline.FilePipeline;
import us.codecraft.spider.processor.SimplePageProcessor;
import us.codecraft.spider.samples.DianpingBlogProcessor;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.spider.samples.HuxiuProcessor;
import us.codecraft.spider.samples.MeicanProcessor;
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
/**
......@@ -25,8 +24,7 @@ public class SpiderTest {
@Test
public void testGlobalSpider(){
SimplePageProcessor pageProcessor = new SimplePageProcessor("http://blog.163.com/", "http://blog.163.com/*/blog/static/*");
pageProcessor.getSite().setEncoding("gbk");
PageProcessor pageProcessor = new MeicanProcessor();
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
processor(pageProcessor).run();
// SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
......
package us.codecraft.spider.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
/**
 * Sample {@link PageProcessor} for shop pages on kaichiba.com.
 * <p>
 * For every processed page it queues the shop whose numeric id is one higher than the id
 * found in the current URL, and records the page title and the food-title list items as
 * result fields.
 * <p>
 * User: cairne
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class KaichibaProcessor implements PageProcessor {

    /**
     * Queues the next shop page and extracts the {@code title} and {@code items} fields.
     * <p>
     * The follow-up request is only added when a shop id can be read from the page URL;
     * field extraction runs regardless, so a page reached through an unexpected URL is
     * still recorded instead of aborting on a number-parsing error.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        String shopId = page.getUrl().r("shop/(\\d+)").toString();
        // NOTE(review): assumes toString() yields null or "" when the URL carries no
        // shop id -- confirm against the selector API.
        if (shopId != null && !shopId.isEmpty()) {
            // long avoids boxing and keeps ids beyond the int range usable; for ids that
            // fit in an int the generated URL is the same as before.
            long nextShopId = Long.parseLong(shopId) + 1;
            page.addTargetRequests("http://kaichiba.com/shop/" + nextShopId);
        }

        page.putField("title", page.getHtml().x("//Title"));
        // The three rp(...) calls target leading whitespace, trailing whitespace and
        // <span>...</span> fragments (presumably regex replacements -- verify in the selector API).
        page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
    }

    /**
     * Builds the crawl configuration for kaichiba.com: domain, start URL, page encoding
     * and the user agent string sent with requests.
     *
     * @return a site description created by {@code Site.me()} on each call
     */
    @Override
    public Site getSite() {
        return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }
}
package us.codecraft.spider.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for meican.com, starting from the Shanghai districts page.
 * <p>
 * It follows a limited number of area links plus every restaurant link found on a page,
 * and records dish names and prices as result fields.
 * <p>
 * User: cairne
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class MeicanProcessor implements PageProcessor {

    /**
     * Describes the crawl target: domain, start URL, page encoding and the user agent
     * string sent with requests.
     *
     * @return a site description created by {@code Site.me()} on each call
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("meican.com")
                .setStartUrl("http://www.meican.com/shanghai/districts")
                .setEncoding("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }

    /**
     * Queues follow-up requests and extracts the {@code items} and {@code prices} fields.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        List<String> areaLinks = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
        // Only the first two area links are followed; shorter lists are passed through as-is.
        List<String> followedAreaLinks = areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
        page.addTargetRequests(followedAreaLinks);

        // Every link containing "/restaurant/" is queued, cut off before any '#' fragment.
        page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings());

        // Dish names and prices both come from the "dishes menu_dishes" list.
        page.putField(
                "items",
                page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
        page.putField(
                "prices",
                page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment