Commit 76729c93 authored by xbynet's avatar xbynet Committed by GitHub

Merge pull request #2 from code4craft/master

合并官方最新代码
parents 650468c0 e9341d02
This diff is collapsed.
...@@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m ...@@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
### 贡献者:
以下是为WebMagic提交过代码或者issue的朋友:
* [ccliangbo](https://github.com/ccliangbo)
* [yuany](https://github.com/yuany)
* [yxssfxwzy](https://github.com/yxssfxwzy)
* [linkerlin](https://github.com/linkerlin)
* [d0ngw](https://github.com/d0ngw)
* [xuchaoo](https://github.com/xuchaoo)
* [supermicah](https://github.com/supermicah)
* [SimpleExpress](https://github.com/SimpleExpress)
* [aruanruan](https://github.com/aruanruan)
* [l1z2g9](https://github.com/l1z2g9)
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
* [ywooer](https://github.com/ywooer)
* [yyw258520](https://github.com/yyw258520)
* [perfecking](https://github.com/perfecking)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
* [fengwuze](https://github.com/fengwuze)
### 邮件组: ### 邮件组:
Gmail: Gmail:
......
...@@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor { ...@@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){ if (page.getResultItems().get("name")==null){
//skip this page //skip this page
page.setSkip(true); page.setSkip(true);
...@@ -89,7 +89,7 @@ You can also use annotation way: ...@@ -89,7 +89,7 @@ You can also use annotation way:
@HelpUrl("https://github.com/\\w+") @HelpUrl("https://github.com/\\w+")
public class GithubRepo { public class GithubRepo {
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name; private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*") @ExtractByUrl("https://github\\.com/(\\w+)/.*")
...@@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) ...@@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
![image](http://code4craft.github.io/images/posts/webmagic.png) ![image](http://code4craft.github.io/images/posts/webmagic.png)
Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/) There are more examples in `webmagic-samples` package.
There are some samples in `webmagic-samples` package.
### License: ### License:
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Contributors:
Thanks these people for commiting source code, reporting bugs or suggesting for new feature:
* [ccliangbo](https://github.com/ccliangbo)
* [yuany](https://github.com/yuany)
* [yxssfxwzy](https://github.com/yxssfxwzy)
* [linkerlin](https://github.com/linkerlin)
* [d0ngw](https://github.com/d0ngw)
* [xuchaoo](https://github.com/xuchaoo)
* [supermicah](https://github.com/supermicah)
* [SimpleExpress](https://github.com/SimpleExpress)
* [aruanruan](https://github.com/aruanruan)
* [l1z2g9](https://github.com/l1z2g9)
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
* [ywooer](https://github.com/ywooer)
* [yyw258520](https://github.com/yyw258520)
* [perfecking](https://github.com/perfecking)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
* [fengwuze](https://github.com/fengwuze)
### Thanks: ### Thanks:
To write webmagic, I referred to the projects below : To write webmagic, I referred to the projects below :
......
...@@ -64,6 +64,12 @@ ...@@ -64,6 +64,12 @@
<version>4.11</version> <version>4.11</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.10.19</version>
<scope>test</scope>
</dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
...@@ -97,7 +103,7 @@ ...@@ -97,7 +103,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId> <artifactId>fastjson</artifactId>
<version>1.2.21</version> <version>1.2.28</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.dreamhead</groupId> <groupId>com.github.dreamhead</groupId>
...@@ -130,7 +136,7 @@ ...@@ -130,7 +136,7 @@
<dependency> <dependency>
<groupId>commons-collections</groupId> <groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId> <artifactId>commons-collections</artifactId>
<version>3.2.1</version> <version>3.2.2</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
......
...@@ -40,6 +40,11 @@ ...@@ -40,6 +40,11 @@
<artifactId>slf4j-api</artifactId> <artifactId>slf4j-api</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId> <artifactId>slf4j-log4j12</artifactId>
......
...@@ -107,7 +107,6 @@ public class Page { ...@@ -107,7 +107,6 @@ public class Page {
* @param requests requests * @param requests requests
*/ */
public void addTargetRequests(List<String> requests) { public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) {
for (String s : requests) { for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue; continue;
...@@ -116,7 +115,6 @@ public class Page { ...@@ -116,7 +115,6 @@ public class Page {
targetRequests.add(new Request(s)); targetRequests.add(new Request(s));
} }
} }
}
/** /**
* add urls to fetch * add urls to fetch
...@@ -125,7 +123,6 @@ public class Page { ...@@ -125,7 +123,6 @@ public class Page {
* @param priority priority * @param priority priority
*/ */
public void addTargetRequests(List<String> requests, long priority) { public void addTargetRequests(List<String> requests, long priority) {
synchronized (targetRequests) {
for (String s : requests) { for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue; continue;
...@@ -134,7 +131,6 @@ public class Page { ...@@ -134,7 +131,6 @@ public class Page {
targetRequests.add(new Request(s).setPriority(priority)); targetRequests.add(new Request(s).setPriority(priority));
} }
} }
}
/** /**
* add url to fetch * add url to fetch
...@@ -145,11 +141,9 @@ public class Page { ...@@ -145,11 +141,9 @@ public class Page {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return; return;
} }
synchronized (targetRequests) {
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString)); targetRequests.add(new Request(requestString));
} }
}
/** /**
* add requests to fetch * add requests to fetch
...@@ -157,10 +151,8 @@ public class Page { ...@@ -157,10 +151,8 @@ public class Page {
* @param request request * @param request request
*/ */
public void addTargetRequest(Request request) { public void addTargetRequest(Request request) {
synchronized (targetRequests) {
targetRequests.add(request); targetRequests.add(request);
} }
}
/** /**
* get url of current page * get url of current page
......
...@@ -85,27 +85,10 @@ public class Request implements Serializable { ...@@ -85,27 +85,10 @@ public class Request implements Serializable {
return url; return url;
} }
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Request request = (Request) o;
if (!url.equals(request.url)) return false;
return true;
}
public Map<String, Object> getExtras() { public Map<String, Object> getExtras() {
return extras; return extras;
} }
@Override
public int hashCode() {
return url.hashCode();
}
public void setExtras(Map<String, Object> extras) { public void setExtras(Map<String, Object> extras) {
this.extras = extras; this.extras = extras;
} }
...@@ -132,23 +115,52 @@ public class Request implements Serializable { ...@@ -132,23 +115,52 @@ public class Request implements Serializable {
return params; return params;
} }
/** /**
* POST/GET参数设置 * set params for request
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* @param params params
* */ * */
public void setParams(Map<String, String> params) { public void setParams(Map<String, String> params) {
this.params = params; this.params = params;
} }
/** /**
* POST/GET参数设置 * set params for request
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* @param key key
* @param value value
* */ * */
public void putParams(String key,String value) { public void putParams(String key,String value) {
params.put(key,value); params.put(key,value);
} }
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Request request = (Request) o;
if (url != null ? !url.equals(request.url) : request.url != null) return false;
if (method != null ? !method.equals(request.method) : request.method != null) return false;
return params != null ? params.equals(request.params) : request.params == null;
}
@Override
public int hashCode() {
int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0);
result = 31 * result + (params != null ? params.hashCode() : 0);
return result;
}
@Override @Override
public String toString() { public String toString() {
return "Request{" + return "Request{" +
"url='" + url + '\'' + "url='" + url + '\'' +
", method='" + method + '\'' + ", method='" + method + '\'' +
", extras=" + extras + ", extras=" + extras +
", params=" + params +
", priority=" + priority + ", priority=" + priority +
'}'; '}';
} }
......
...@@ -305,7 +305,7 @@ public class Spider implements Runnable, Task { ...@@ -305,7 +305,7 @@ public class Spider implements Runnable, Task {
initComponent(); initComponent();
logger.info("Spider " + getUUID() + " started!"); logger.info("Spider " + getUUID() + " started!");
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this); final Request request = scheduler.poll(this);
if (request == null) { if (request == null) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break; break;
...@@ -313,16 +313,15 @@ public class Spider implements Runnable, Task { ...@@ -313,16 +313,15 @@ public class Spider implements Runnable, Task {
// wait until new url added // wait until new url added
waitNewUrl(); waitNewUrl();
} else { } else {
final Request requestFinal = request;
threadPool.execute(new Runnable() { threadPool.execute(new Runnable() {
@Override @Override
public void run() { public void run() {
try { try {
processRequest(requestFinal); processRequest(request);
onSuccess(requestFinal); onSuccess(request);
} catch (Exception e) { } catch (Exception e) {
onError(requestFinal); onError(request);
logger.error("process request " + requestFinal + " error", e); logger.error("process request " + request + " error", e);
} finally { } finally {
pageCount.incrementAndGet(); pageCount.incrementAndGet();
signalNewUrl(); signalNewUrl();
...@@ -587,6 +586,7 @@ public class Spider implements Runnable, Task { ...@@ -587,6 +586,7 @@ public class Spider implements Runnable, Task {
if (threadNum <= 0) { if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!"); throw new IllegalArgumentException("threadNum should be more than one!");
} }
this.executorService = executorService;
return this; return this;
} }
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
...@@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder; ...@@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair; import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
...@@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site; ...@@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections; import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException; import java.io.IOException;
...@@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader {
proxyHost = site.getHttpProxy(); proxyHost = site.getHttpProxy();
} }
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//���������˴��� HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient�������˴�����֤ httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
statusCode = httpResponse.getStatusLine().getStatusCode(); statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode); request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) { if (statusAccept(acceptStatCode, statusCode)) {
...@@ -167,37 +162,42 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -167,37 +162,42 @@ public class HttpClientDownloader extends AbstractDownloader {
String method = request.getMethod(); String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get //default get
RequestBuilder requestBuilder=RequestBuilder.get(); return addQueryParams(RequestBuilder.get(),request.getParams());
if (request.getParams() != null) { } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
for (Map.Entry<String, String> entry : request.getParams().entrySet()) { return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
requestBuilder.addParameter(entry.getKey(), entry.getValue()); } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return addQueryParams(RequestBuilder.delete(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return addQueryParams(RequestBuilder.trace(),request.getParams());
} }
throw new IllegalArgumentException("Illegal HTTP Method " + method);
} }
return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
RequestBuilder requestBuilder = RequestBuilder.post();
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>(); List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
if (nameValuePair != null && nameValuePair.length > 0) { if (nameValuePair != null && nameValuePair.length > 0) {
allNameValuePair= Arrays.asList(nameValuePair); allNameValuePair= Arrays.asList(nameValuePair);
} }
if (request.getParams() != null) { if (params != null) {
for (String key : request.getParams().keySet()) { for (String key : params.keySet()) {
allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key))); allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
} }
} }
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
return requestBuilder; return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return RequestBuilder.put();
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return RequestBuilder.trace();
} }
throw new IllegalArgumentException("Illegal HTTP Method " + method);
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
if (params != null) {
for (Map.Entry<String, String> entry : params.entrySet()) {
requestBuilder.addParameter(entry.getKey(), entry.getValue());
}
}
return requestBuilder;
} }
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
...@@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
String charset; return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
// charset
// 1、encoding in http header Content-Type
String value = httpResponse.getEntity().getContentType().getValue();
charset = UrlUtils.getCharset(value);
if (StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}
// use default charset to decode first time
Charset defaultCharset = Charset.defaultCharset();
String content = new String(contentBytes, defaultCharset.name());
// 2、charset in meta
if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
for (Element link : links) {
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
// 2.2、html5 <meta charset="UTF-8" />
else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
break;
}
}
}
logger.debug("Auto get charset: {}", charset);
// 3、todo use tools as cpdetector for content decode
return charset;
} }
} }
...@@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor { ...@@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){ if (page.getResultItems().get("name")==null){
//skip this page //skip this page
page.setSkip(true); page.setSkip(true);
......
...@@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable { ...@@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable {
private List<Integer> failedErrorType = new ArrayList<Integer>(); private List<Integer> failedErrorType = new ArrayList<Integer>();
Proxy(HttpHost httpHost, String user, String password) { public Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost; this.httpHost = httpHost;
this.user = user; this.user = user;
this.password = password; this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
} }
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
this.httpHost = httpHost; this.httpHost = httpHost;
this.user = user; this.user = user;
this.password = password; this.password = password;
......
...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; ...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
/** /**
* Remove duplicate urls and only push urls which are not duplicate.<br><br> * Remove duplicate urls and only push urls which are not duplicate.<br><br>
...@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { ...@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
@Override @Override
public void push(Request request, Task task) { public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl()); logger.trace("get a candidate url {}", request.getUrl());
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) { if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
logger.debug("push to queue {}", request.getUrl()); logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task); pushWhenNoDuplicate(request, task);
} }
...@@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { ...@@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
} }
protected boolean noNeedToRemoveDuplicate(Request request) {
return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
}
protected void pushWhenNoDuplicate(Request request, Task task) { protected void pushWhenNoDuplicate(Request request, Task task) {
} }
......
...@@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor ...@@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor
} }
@Override @Override
public synchronized Request poll(Task task) { public Request poll(Task task) {
return queue.poll(); return queue.poll();
} }
......
...@@ -28,8 +28,7 @@ public class RegexSelector implements Selector { ...@@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
} }
// Check bracket for regex group. Add default group 1 if there is no group. // Check bracket for regex group. Add default group 1 if there is no group.
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == if ( ! hasGroup(regexStr) ){
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
regexStr = "(" + regexStr + ")"; regexStr = "(" + regexStr + ")";
} }
this.regexStr = regexStr; this.regexStr = regexStr;
...@@ -45,6 +44,30 @@ public class RegexSelector implements Selector { ...@@ -45,6 +44,30 @@ public class RegexSelector implements Selector {
this(regexStr, 1); this(regexStr, 1);
} }
private boolean hasGroup(String regexStr) {
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){
return false;
}
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) {
return false;
}
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) {
return false;
}
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) {
return false;
}
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) {
return false;
}
return true;
}
@Override @Override
public String select(String text) { public String select(String text) {
return selectGroup(text).get(group); return selectGroup(text).get(group);
......
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.Charset;
/**
* @author code4crafter@gmail.com
* Date: 17/3/11
* Time: 10:36
* @since 0.6.2
*/
public abstract class CharsetUtils {
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
String charset;
// charset
// 1、encoding in http header Content-Type
charset = UrlUtils.getCharset(contentType);
if (StringUtils.isNotBlank(contentType)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}
// use default charset to decode first time
Charset defaultCharset = Charset.defaultCharset();
String content = new String(contentBytes, defaultCharset);
// 2、charset in meta
if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
for (Element link : links) {
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
// 2.2、html5 <meta charset="UTF-8" />
else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
break;
}
}
}
logger.debug("Auto get charset: {}", charset);
// 3、todo use tools as cpdetector for content decode
return charset;
}
}
package us.codecraft.webmagic;
import org.junit.Test;
import us.codecraft.webmagic.utils.HttpConstant;
import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com
* Date: 17/3/11
*/
public class RequestTest {
@Test
public void testEqualsAndHashCode() throws Exception {
Request requestA = new Request("http://www.google.com/");
Request requestB = new Request("http://www.google.com/");
assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode());
assertThat(requestA).isEqualTo(requestB);
requestA.setMethod(HttpConstant.Method.GET);
requestA.setMethod(HttpConstant.Method.POST);
assertThat(requestA).isNotEqualTo(requestB);
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
}
}
...@@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable; ...@@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable;
import com.github.dreamhead.moco.Runner; import com.github.dreamhead.moco.Runner;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
...@@ -103,4 +107,42 @@ public class HttpClientDownloaderTest { ...@@ -103,4 +107,42 @@ public class HttpClientDownloaderTest {
} }
}); });
} }
/**
 * Verifies that {@code selectRequestMethod} builds a request that uses the HTTP method
 * configured on the {@link Request}.
 *
 * <p>A stub server on port 12306 answers each HTTP method with a distinct marker (a body
 * text, or a {@code method} response header for HEAD), so every assertion identifies the
 * method that was actually sent.
 *
 * <p>A single HTTP client is shared by all calls and closed at the end, and every response
 * is closed after use, so the test does not leak clients or connections.
 */
@Test
public void test_selectRequestMethod() throws Exception {
    HttpServer server = httpserver(12306);
    server.get(eq(query("q"), "webmagic")).response("get");
    server.post(eq(form("q"), "webmagic")).response("post");
    server.put(eq(form("q"), "webmagic")).response("put");
    server.delete(eq(query("q"), "webmagic")).response("delete");
    server.request(and(by(method("HEAD")), eq(query("q"), "webmagic"))).response(header("method", "head"));
    server.request(and(by(method("TRACE")), eq(query("q"), "webmagic"))).response("trace");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
            Request request = new Request();
            request.setUrl("http://127.0.0.1:12306/search");
            request.putParams("q", "webmagic");

            CloseableHttpClient httpClient = HttpClients.custom().build();
            try {
                request.setMethod(HttpConstant.Method.GET);
                assertThat(fetchBody(httpClient, httpClientDownloader, request)).isEqualTo("get");

                request.setMethod(HttpConstant.Method.POST);
                assertThat(fetchBody(httpClient, httpClientDownloader, request)).isEqualTo("post");

                request.setMethod(HttpConstant.Method.PUT);
                assertThat(fetchBody(httpClient, httpClientDownloader, request)).isEqualTo("put");

                request.setMethod(HttpConstant.Method.DELETE);
                assertThat(fetchBody(httpClient, httpClientDownloader, request)).isEqualTo("delete");

                // A HEAD response carries no body, so the stub reports the method in a header.
                request.setMethod(HttpConstant.Method.HEAD);
                CloseableHttpResponse headResponse = executeSelected(httpClient, httpClientDownloader, request);
                try {
                    assertThat(headResponse.getFirstHeader("method").getValue()).isEqualTo("head");
                } finally {
                    headResponse.close();
                }

                request.setMethod(HttpConstant.Method.TRACE);
                assertThat(fetchBody(httpClient, httpClientDownloader, request)).isEqualTo("trace");
            } finally {
                httpClient.close();
            }
        }
    });
}

/**
 * Builds the HTTP request chosen by {@code selectRequestMethod} for {@code request},
 * points it at the request's URL and executes it. The caller must close the response.
 */
private static CloseableHttpResponse executeSelected(CloseableHttpClient httpClient,
                                                     HttpClientDownloader httpClientDownloader,
                                                     Request request) throws IOException {
    RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
    return httpClient.execute(requestBuilder.build());
}

/**
 * Executes {@code request} via {@link #executeSelected} and returns the response body as
 * text, closing the response afterwards.
 */
private static String fetchBody(CloseableHttpClient httpClient,
                                HttpClientDownloader httpClientDownloader,
                                Request request) throws IOException {
    CloseableHttpResponse response = executeSelected(httpClient, httpClientDownloader, request);
    try {
        return EntityUtils.toString(response.getEntity());
    } finally {
        response.close();
    }
}
} }
package us.codecraft.webmagic.scheduler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mockito;
import org.mockito.runners.MockitoJUnitRunner;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
/**
 * Checks when {@link DuplicateRemovedScheduler} consults its {@link DuplicateRemover} on
 * {@code push}: never for a POST request, exactly once for a GET request.
 *
 * @author code4crafter@gmail.com
 * Date: 17/3/11
 * Time: 11:26 AM
 */
@RunWith(MockitoJUnitRunner.class)
public class DuplicateRemovedSchedulerTest {

    /** Scheduler under test; {@code poll} is not exercised here and simply returns null. */
    private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
        @Override
        public Request poll(Task task) {
            return null;
        }
    };

    /**
     * Installs a fresh mock remover on the scheduler, pushes {@code request} with a null
     * task, and hands the mock back so the caller can verify how it was used.
     */
    private DuplicateRemover pushWithMockRemover(Request request) {
        DuplicateRemover remover = Mockito.mock(DuplicateRemover.class);
        duplicateRemovedScheduler.setDuplicateRemover(remover);
        duplicateRemovedScheduler.push(request, null);
        return remover;
    }

    @Test
    public void test_no_duplicate_removed_for_post_request() throws Exception {
        Request postRequest = new Request("https://www.google.com/");
        postRequest.setMethod(HttpConstant.Method.POST);

        DuplicateRemover remover = pushWithMockRemover(postRequest);

        // The remover must not be asked at all for a POST request.
        verify(remover, times(0)).isDuplicate(any(Request.class), any(Task.class));
    }

    @Test
    public void test_duplicate_removed_for_get_request() throws Exception {
        Request getRequest = new Request("https://www.google.com/");
        getRequest.setMethod(HttpConstant.Method.GET);

        DuplicateRemover remover = pushWithMockRemover(getRequest);

        // The remover must be asked exactly once for a GET request.
        verify(remover, times(1)).isDuplicate(any(Request.class), any(Task.class));
    }
}
...@@ -22,4 +22,20 @@ public class RegexSelectorTest { ...@@ -22,4 +22,20 @@ public class RegexSelectorTest {
String select = regexSelector.select(source); String select = regexSelector.select(source);
Assertions.assertThat(select).isEqualTo(source); Assertions.assertThat(select).isEqualTo(source);
} }
/**
 * Checks that {@link RegexSelector} handles patterns built on zero-width lookahead
 * assertions, which contribute no characters to the selected text.
 */
@Test
public void testRegexWithZeroWidthAssertions() {
    // Positive lookahead: everything up to, but not including, the question mark.
    RegexSelector beforeQuestionMark = new RegexSelector("^.*(?=\\?)");
    String prefix = beforeQuestionMark.select("hello world?xxxx");
    Assertions.assertThat(prefix).isEqualTo("hello world");

    // Negative lookahead: three digits that are not followed by another digit.
    RegexSelector trailingDigits = new RegexSelector("\\d{3}(?!\\d)");
    String digits = trailingDigits.select("123456asdf");
    Assertions.assertThat(digits).isEqualTo("456");
}
} }
...@@ -20,6 +20,9 @@ public class UrlUtilsTest { ...@@ -20,6 +20,9 @@ public class UrlUtilsTest {
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/");
assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz");
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa"); assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
......
...@@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor ...@@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
public boolean isDuplicate(Request request, Task task) { public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
try { try {
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl()); return jedis.sadd(getSetKey(task), request.getUrl()) > 0;
if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl());
}
return isDuplicate;
} finally { } finally {
pool.returnResource(jedis); pool.returnResource(jedis);
} }
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>2.46.0</version> <version>2.41.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
......
...@@ -45,7 +45,7 @@ class WebDriverPool { ...@@ -45,7 +45,7 @@ class WebDriverPool {
private WebDriver mDriver = null; private WebDriver mDriver = null;
private boolean mAutoQuitDriver = true; private boolean mAutoQuitDriver = true;
private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini"; private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
private static final String DRIVER_FIREFOX = "firefox"; private static final String DRIVER_FIREFOX = "firefox";
private static final String DRIVER_CHROME = "chrome"; private static final String DRIVER_CHROME = "chrome";
private static final String DRIVER_PHANTOMJS = "phantomjs"; private static final String DRIVER_PHANTOMJS = "phantomjs";
...@@ -64,7 +64,11 @@ class WebDriverPool { ...@@ -64,7 +64,11 @@ class WebDriverPool {
public void configure() throws IOException { public void configure() throws IOException {
// Read config file // Read config file
sConfig = new Properties(); sConfig = new Properties();
sConfig.load(new FileReader(CONFIG_FILE)); String configFile = DEFAULT_CONFIG_FILE;
if (System.getProperty("selenuim_config")!=null){
configFile = System.getProperty("selenuim_config");
}
sConfig.load(new FileReader(configFile));
// Prepare capabilities // Prepare capabilities
sCaps = new DesiredCapabilities(); sCaps = new DesiredCapabilities();
......
...@@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor { ...@@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
if (page.getUrl().toString().contains("pins")) { if (page.getUrl().toString().contains("pins")) {
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString()); page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
} else { } else {
page.getResultItems().setSkip(true); page.getResultItems().setSkip(true);
} }
......
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
phantomjs_driver_path=../../src/main.js
phantomjs_driver_loglevel=DEBUG
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment