Commit 1c1bf895 authored by yihua.huang

Merge branch 'master' of github.com:code4craft/webmagic

parents 8774cce7 906e68cb
...@@ -61,27 +61,7 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 ...@@ -61,27 +61,7 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫
} }
} }
--- ### 示例
TODO
/**
 * Example {@link PageProcessor} for blog pages under {@code my.oschina.net}.
 *
 * <p>For every processed page it queues the blog-entry links found in the HTML
 * as further requests and stores three fields on the page: {@code title},
 * {@code content} and {@code author}.
 */
public class OschinaBlogPageProcesser implements PageProcessor {

    /**
     * Regex matched against the page HTML: an anchor tag whose quoted {@code href}
     * is {@code http://my.oschina.net/<word>/blog/<digits>}; group 1 is that URL.
     */
    private static final String BLOG_LINK_REGEX =
            "<a[^<>]*href=[\"']{1}(http://my\\.oschina\\.net/\\w+/blog/\\d+)[\"']{1}";

    /** XPath of the {@code h1} inside the {@code BlogTitle} div of the {@code BlogEntity} div. */
    private static final String TITLE_XPATH =
            "//div[@class='BlogEntity']/div[@class='BlogTitle']/h1";

    /** Regex applied to the page URL; group 1 is the path segment preceding {@code /blog/}. */
    private static final String AUTHOR_REGEX = "my\\.oschina\\.net/(\\w+)/blog/\\d+";

    private static final String DOMAIN = "my.oschina.net";

    private static final String START_URL = "http://www.oschina.net/";

    // NOTE(review): the opening parenthesis is never closed in this User-Agent
    // value; kept exactly as-is because it is what gets sent -- confirm intent.
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the blog links found on {@code page} and records its fields.
     *
     * @param page the page handed to this processor
     */
    @Override
    public void process(Page page) {
        // Queue every blog-entry URL found in the HTML as a follow-up request.
        List<String> blogLinks = page.getHtml().rs(BLOG_LINK_REGEX).toStrings();
        page.addTargetRequests(blogLinks);

        page.putField("title", page.getHtml().xs(TITLE_XPATH));
        // sc() is opaque from here -- presumably a content-extraction helper; verify.
        page.putField("content", page.getHtml().sc());
        page.putField("author", page.getUrl().r(AUTHOR_REGEX));
    }

    /**
     * Builds the site settings for this processor: domain, start URL and User-Agent.
     *
     * @return the configured {@code Site}
     */
    @Override
    public Site getSite() {
        Site site = Site.me()
                .setDomain(DOMAIN)
                .setStartUrl(START_URL)
                .setUserAgent(USER_AGENT);
        return site;
    }
}
可参考作者博客[使用webmagic抓取页面并保存为wordpress文件](http://my.oschina.net/flashsword/blog/136846)
...@@ -8,7 +8,8 @@ import java.util.*; ...@@ -8,7 +8,8 @@ import java.util.*;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* 找到clear * readability算法,基础是找到所有p标签的父节点
* 写的比较乱,最终效果还在尝试中
* User: cairne * User: cairne
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午4:42 * Time: 下午4:42
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment