Commit 787b9529 authored by yihua.huang

release notes and docs

parent 1f86ce77
...@@ -28,13 +28,15 @@ Release Notes ...@@ -28,13 +28,15 @@ Release Notes
} }
增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
增加基于redis的分布式支持。 增加基于redis的分布式支持。
增加XPath2.0语法支持(webmagic-saxon模块)。 增加XPath2.0语法支持(webmagic-saxon模块)。
增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。 增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
修复一些已有bug。 修复了不支持https的bug。
补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/) 补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)
......
...@@ -220,9 +220,17 @@ public class Spider implements Runnable, Task { ...@@ -220,9 +220,17 @@ public class Spider implements Runnable, Task {
} }
} }
public void test(String url){ /**
* 用某些特定URL进行爬虫测试
* @param urls 要抓取的url
*/
public void test(String... urls){
checkComponent(); checkComponent();
processRequest(new Request(url)); if (urls.length>0){
for (String url : urls) {
processRequest(new Request(url));
}
}
} }
private void processRequest(Request request) { private void processRequest(Request request) {
......
...@@ -8,6 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractByUrl; ...@@ -8,6 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.util.List; import java.util.List;
...@@ -32,12 +33,19 @@ public class GithubRepo implements HasKey { ...@@ -32,12 +33,19 @@ public class GithubRepo implements HasKey {
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true) @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
private List<String> language; private List<String> language;
@ExtractBy("//a[@class='social-count js-social-count']/text()")
private String star;
@ExtractBy("//a[@class='social-count js-social-count']/text()")
private String fork;
@ExtractByUrl @ExtractByUrl
private String url; private String url;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0), OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run(); new JsonFilePageModelPipeline(), GithubRepo.class)
.scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
} }
@Override @Override
...@@ -64,4 +72,12 @@ public class GithubRepo implements HasKey { ...@@ -64,4 +72,12 @@ public class GithubRepo implements HasKey {
public String getUrl() { public String getUrl() {
return url; return url;
} }
public String getStar() {
return star;
}
public String getFork() {
return fork;
}
} }
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment