Commit e04d6253 authored by yihua.huang's avatar yihua.huang

add manual

parent f41c8ef7
This diff is collapsed.
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
...@@ -12,7 +14,7 @@ import us.codecraft.webmagic.pipeline.JsonFilePipeline; ...@@ -12,7 +14,7 @@ import us.codecraft.webmagic.pipeline.JsonFilePipeline;
* Time: 上午7:52 <br> * Time: 上午7:52 <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements Blog{ public class OschinaBlog {
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
...@@ -20,23 +22,12 @@ public class OschinaBlog implements Blog{ ...@@ -20,23 +22,12 @@ public class OschinaBlog implements Blog{
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content; private String content;
@Override @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
public String toString() { private List<String> tags;
return "OschinaBlog{" +
"title='" + title + '\'' +
", content='" + content + '\'' +
'}';
}
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new ConsolePageModelPipeline(), OschinaBlog.class).run();
} }
public String getTitle() {
return title;
}
public String getContent() {
return content;
}
} }
...@@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples; ...@@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
...@@ -13,18 +15,24 @@ import java.util.List; ...@@ -13,18 +15,24 @@ import java.util.List;
*/ */
public class OschinaBlogPageProcesser implements PageProcessor { public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all(); List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(strings); page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().smartContent()); page.putField("content", page.getHtml().$("div.content").toString());
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+")); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
} }
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/"). return site;
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment