Commit e04d6253 authored by yihua.huang's avatar yihua.huang

add manual

parent f41c8ef7
This diff is collapsed.
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
......@@ -12,7 +14,7 @@ import us.codecraft.webmagic.pipeline.JsonFilePipeline;
* Time: 上午7:52 <br>
*/
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements Blog{
public class OschinaBlog {
@ExtractBy("//title")
private String title;
......@@ -20,23 +22,12 @@ public class OschinaBlog implements Blog{
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content;
@Override
public String toString() {
return "OschinaBlog{" +
"title='" + title + '\'' +
", content='" + content + '\'' +
'}';
}
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run();
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new ConsolePageModelPipeline(), OschinaBlog.class).run();
}
public String getTitle() {
return title;
}
public String getContent() {
return content;
}
}
......@@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -13,18 +15,24 @@ import java.util.List;
*/
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
@Override
public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent());
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment