Commit 76729c93 authored by xbynet's avatar xbynet Committed by GitHub

Merge pull request #2 from code4craft/master

合并官方最新代码
parents 650468c0 e9341d02
This diff is collapsed.
......@@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
### 贡献者:
以下是为WebMagic提交过代码或者issue的朋友:
* [ccliangbo](https://github.com/ccliangbo)
* [yuany](https://github.com/yuany)
* [yxssfxwzy](https://github.com/yxssfxwzy)
* [linkerlin](https://github.com/linkerlin)
* [d0ngw](https://github.com/d0ngw)
* [xuchaoo](https://github.com/xuchaoo)
* [supermicah](https://github.com/supermicah)
* [SimpleExpress](https://github.com/SimpleExpress)
* [aruanruan](https://github.com/aruanruan)
* [l1z2g9](https://github.com/l1z2g9)
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
* [ywooer](https://github.com/ywooer)
* [yyw258520](https://github.com/yyw258520)
* [perfecking](https://github.com/perfecking)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
* [fengwuze](https://github.com/fengwuze)
### 邮件组:
Gmail:
......
......@@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
......@@ -89,7 +89,7 @@ You can also use annotation way:
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
@ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
......@@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
![image](http://code4craft.github.io/images/posts/webmagic.png)
Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/)
There are some samples in `webmagic-samples` package.
There are more examples in `webmagic-samples` package.
### License:
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Contributors:
Thanks to these people for committing source code, reporting bugs, or suggesting new features:
* [ccliangbo](https://github.com/ccliangbo)
* [yuany](https://github.com/yuany)
* [yxssfxwzy](https://github.com/yxssfxwzy)
* [linkerlin](https://github.com/linkerlin)
* [d0ngw](https://github.com/d0ngw)
* [xuchaoo](https://github.com/xuchaoo)
* [supermicah](https://github.com/supermicah)
* [SimpleExpress](https://github.com/SimpleExpress)
* [aruanruan](https://github.com/aruanruan)
* [l1z2g9](https://github.com/l1z2g9)
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
* [ywooer](https://github.com/ywooer)
* [yyw258520](https://github.com/yyw258520)
* [perfecking](https://github.com/perfecking)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
* [fengwuze](https://github.com/fengwuze)
### Thanks:
To write webmagic, I referred to the projects below:
......
......@@ -64,6 +64,12 @@
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.10.19</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
......@@ -97,7 +103,7 @@
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.21</version>
<version>1.2.28</version>
</dependency>
<dependency>
<groupId>com.github.dreamhead</groupId>
......@@ -130,7 +136,7 @@
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
......
......@@ -40,6 +40,11 @@
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
......
......@@ -107,7 +107,6 @@ public class Page {
* @param requests requests
*/
public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
......@@ -116,7 +115,6 @@ public class Page {
targetRequests.add(new Request(s));
}
}
}
/**
* add urls to fetch
......@@ -125,7 +123,6 @@ public class Page {
* @param priority priority
*/
public void addTargetRequests(List<String> requests, long priority) {
synchronized (targetRequests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
......@@ -134,7 +131,6 @@ public class Page {
targetRequests.add(new Request(s).setPriority(priority));
}
}
}
/**
* add url to fetch
......@@ -145,11 +141,9 @@ public class Page {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
synchronized (targetRequests) {
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString));
}
}
/**
* add requests to fetch
......@@ -157,10 +151,8 @@ public class Page {
* @param request request
*/
public void addTargetRequest(Request request) {
// Append the request as-is while holding the monitor of targetRequests;
// unlike the String overloads, no blank/"#" filtering or URL canonicalization is applied here.
synchronized (targetRequests) {
targetRequests.add(request);
}
}
/**
* get url of current page
......
......@@ -85,27 +85,10 @@ public class Request implements Serializable {
return url;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Request request = (Request) o;
if (!url.equals(request.url)) return false;
return true;
}
/**
* Returns the extras map attached to this request.
* The field itself is returned, not a copy.
*
* @return the {@code extras} field
*/
public Map<String, Object> getExtras() {
return extras;
}
@Override
public int hashCode() {
return url.hashCode();
}
/**
* Replaces the extras map attached to this request.
* The given map is stored by reference, not copied.
*
* @param extras the map to store in the {@code extras} field
*/
public void setExtras(Map<String, Object> extras) {
this.extras = extras;
}
......@@ -132,23 +115,52 @@ public class Request implements Serializable {
return params;
}
/**
* Sets the POST/GET parameters of this request, replacing any previously stored map.
* The given map is stored by reference, not copied.
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* @param params params
* */
public void setParams(Map<String, String> params) {
this.params = params;
}
/**
* Adds a single POST/GET parameter to this request's parameter map.
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* <br>
* NOTE(review): this dereferences the {@code params} field directly, so it would fail
* if that field is null (for example after {@code setParams(null)}) - confirm callers never do that.
* @param key key
* @param value value
* */
public void putParams(String key,String value) {
params.put(key,value);
}
/**
 * Two requests are equal when they are of the same class and their
 * {@code url}, {@code method} and {@code params} are all equal (null-safe).
 * {@code extras} and {@code priority} do not take part in the comparison.
 *
 * @param o the object to compare with
 * @return {@code true} if {@code o} is an equal request
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }
    Request other = (Request) o;

    boolean sameUrl = (url == null) ? other.url == null : url.equals(other.url);
    if (!sameUrl) {
        return false;
    }
    boolean sameMethod = (method == null) ? other.method == null : method.equals(other.method);
    if (!sameMethod) {
        return false;
    }
    if (params == null) {
        return other.params == null;
    }
    return params.equals(other.params);
}
/**
 * Hash code built from {@code url}, {@code method} and {@code params}
 * (a null field contributes 0), combined with the multiplier 31.
 *
 * @return the hash code of this request
 */
@Override
public int hashCode() {
    int urlHash = (url == null) ? 0 : url.hashCode();
    int methodHash = (method == null) ? 0 : method.hashCode();
    int paramsHash = (params == null) ? 0 : params.hashCode();

    int hash = urlHash;
    hash = hash * 31 + methodHash;
    hash = hash * 31 + paramsHash;
    return hash;
}
/**
 * Renders the url, method, extras, params and priority of this request
 * in a {@code Request{...}} form.
 *
 * @return a textual representation of this request
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Request{");
    text.append("url='").append(url).append('\'');
    text.append(", method='").append(method).append('\'');
    text.append(", extras=").append(extras);
    text.append(", params=").append(params);
    text.append(", priority=").append(priority);
    text.append('}');
    return text.toString();
}
......
......@@ -305,7 +305,7 @@ public class Spider implements Runnable, Task {
initComponent();
logger.info("Spider " + getUUID() + " started!");
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
final Request request = scheduler.poll(this);
if (request == null) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break;
......@@ -313,16 +313,15 @@ public class Spider implements Runnable, Task {
// wait until new url added
waitNewUrl();
} else {
final Request requestFinal = request;
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
processRequest(requestFinal);
onSuccess(requestFinal);
processRequest(request);
onSuccess(request);
} catch (Exception e) {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
onError(request);
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
signalNewUrl();
......@@ -587,6 +586,7 @@ public class Spider implements Runnable, Task {
if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!");
}
this.executorService = executorService;
return this;
}
......
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
......@@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
......@@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
......@@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader {
proxyHost = site.getHttpProxy();
}
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//���������˴���
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient�������˴�����֤
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
......@@ -167,37 +162,42 @@ public class HttpClientDownloader extends AbstractDownloader {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get
RequestBuilder requestBuilder=RequestBuilder.get();
if (request.getParams() != null) {
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
requestBuilder.addParameter(entry.getKey(), entry.getValue());
return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return addQueryParams(RequestBuilder.delete(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return addQueryParams(RequestBuilder.trace(),request.getParams());
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
RequestBuilder requestBuilder = RequestBuilder.post();
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
if (nameValuePair != null && nameValuePair.length > 0) {
allNameValuePair= Arrays.asList(nameValuePair);
}
if (request.getParams() != null) {
for (String key : request.getParams().keySet()) {
allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key)));
if (params != null) {
for (String key : params.keySet()) {
allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
}
}
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return RequestBuilder.put();
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return RequestBuilder.trace();
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
/**
 * Adds every entry of {@code params} to the builder via {@code addParameter}.
 *
 * @param requestBuilder builder to add the parameters to
 * @param params         parameter names and values; may be {@code null}, in which case nothing is added
 * @return the same {@code requestBuilder} instance
 */
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
    if (params == null) {
        return requestBuilder;
    }
    for (Map.Entry<String, String> param : params.entrySet()) {
        String name = param.getKey();
        String value = param.getValue();
        requestBuilder.addParameter(name, value);
    }
    return requestBuilder;
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
......@@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
String charset;
// charset
// 1、encoding in http header Content-Type
String value = httpResponse.getEntity().getContentType().getValue();
charset = UrlUtils.getCharset(value);
if (StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}
// use default charset to decode first time
Charset defaultCharset = Charset.defaultCharset();
String content = new String(contentBytes, defaultCharset.name());
// 2、charset in meta
if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
for (Element link : links) {
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
// 2.2、html5 <meta charset="UTF-8" />
else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
break;
}
}
}
logger.debug("Auto get charset: {}", charset);
// 3、todo use tools as cpdetector for content decode
return charset;
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
}
}
......@@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
......
......@@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable {
private List<Integer> failedErrorType = new ArrayList<Integer>();
Proxy(HttpHost httpHost, String user, String password) {
public Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
......
......@@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* Remove duplicate urls and only push urls which are not duplicate.<br><br>
......@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
@Override
public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl());
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task);
}
......@@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
}
/**
* Tells whether a request bypasses duplicate removal.
* Returns true when the request method is POST (compared case-insensitively);
* a null method yields false. push() consults this before asking the duplicate remover.
*
* @param request the candidate request
* @return true if the request method is POST
*/
protected boolean noNeedToRemoveDuplicate(Request request) {
return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
}
/**
* Called by push() for every request that is accepted for scheduling.
* This base implementation does nothing; presumably concrete schedulers
* override it to actually enqueue the request - confirm against subclasses.
*
* @param request the accepted request
* @param task the task the request belongs to
*/
protected void pushWhenNoDuplicate(Request request, Task task) {
}
......
......@@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor
}
@Override
public synchronized Request poll(Task task) {
public Request poll(Task task) {
return queue.poll();
}
......
......@@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
}
// Check bracket for regex group. Add default group 1 if there is no group.
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
if ( ! hasGroup(regexStr) ){
regexStr = "(" + regexStr + ")";
}
this.regexStr = regexStr;
......@@ -45,6 +44,30 @@ public class RegexSelector implements Selector {
this(regexStr, 1);
}
/**
 * Tells whether {@code regexStr} appears to contain at least one plain capturing group,
 * i.e. an unescaped {@code (} that does not start a {@code (?...)} construct.
 * <p>
 * The previous implementation compared the number of unescaped parentheses against each
 * special construct ({@code (?:}, {@code (?=}, {@code (?<}, {@code (?!}, {@code (?#})
 * separately, so a regex mixing two kinds (for example {@code (?:a)(?=b)}) was wrongly
 * reported as having a capturing group. All {@code (?} constructs are now counted together,
 * which also covers inline flags such as {@code (?i)} and atomic groups {@code (?>}.
 * <p>
 * As before, a named group {@code (?<name>...)} is not counted as a capturing group here,
 * and parentheses inside character classes are not recognised; this is only a lightweight
 * check, real validation is left to {@code Pattern}.
 *
 * @param regexStr the regular expression source
 * @return {@code true} if at least one unescaped {@code (} is not part of a {@code (?} construct
 */
private boolean hasGroup(String regexStr) {
    // Unescaped "(": every "(" minus those written as the literal "\(".
    int openParens = StringUtils.countMatches(regexStr, "(")
            - StringUtils.countMatches(regexStr, "\\(");
    // Unescaped "(?": the same bookkeeping restricted to special constructs.
    int specialConstructs = StringUtils.countMatches(regexStr, "(?")
            - StringUtils.countMatches(regexStr, "\\(?");
    // Whatever is left over after removing the special constructs is a plain capturing group.
    return openParens > specialConstructs;
}
@Override
public String select(String text) {
return selectGroup(text).get(group);
......
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.Charset;
/**
 * Helper for working out the character set of a downloaded document, using the
 * HTTP {@code Content-Type} value first and HTML {@code <meta>} tags second.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/11
 *         Time: 10:36
 * @since 0.6.2
 */
public abstract class CharsetUtils {

    private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);

    /**
     * Detects the charset of a response body.
     * <ol>
     * <li>If {@code UrlUtils.getCharset(contentType)} yields a non-blank value, that value is returned.</li>
     * <li>Otherwise the bytes are decoded with the platform default charset, parsed as HTML, and the
     * first {@code <meta>} tag declaring a charset (either inside its {@code content} attribute or
     * through a {@code charset} attribute) decides.</li>
     * </ol>
     *
     * @param contentType  value of the Content-Type header
     * @param contentBytes raw response body
     * @return the detected charset name, or whatever step 1 produced (possibly null or blank)
     *         when no meta tag declares one
     * @throws IOException kept in the signature for callers; not thrown by the visible code
     */
    public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
        // 1. encoding in http header Content-Type
        String charset = UrlUtils.getCharset(contentType);
        // The check must be on the extracted charset, not on the header itself: a header such as
        // "text/html" is non-blank but carries no charset, and must fall through to the meta lookup.
        if (StringUtils.isNotBlank(charset)) {
            logger.debug("Auto get charset: {}", charset);
            return charset;
        }
        // use default charset to decode first time; only the meta tags need to be readable
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset);
        // 2. charset in meta
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements metas = document.select("meta");
            for (Element meta : metas) {
                // 2.1 html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                String declared = charsetFromMetaContent(meta.attr("content"));
                if (declared != null) {
                    charset = declared;
                    break;
                }
                // 2.2 html5 <meta charset="UTF-8" />
                String metaCharset = meta.attr("charset");
                if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: {}", charset);
        // 3. todo use tools as cpdetector for content decode
        return charset;
    }

    /**
     * Extracts the value following {@code charset=} from a meta {@code content} attribute.
     *
     * @param metaContent the attribute value, e.g. {@code text/html; charset=UTF-8}
     * @return the trimmed charset value, or {@code null} when the attribute declares none
     *         (including a malformed declaration with no value after {@code charset})
     */
    private static String charsetFromMetaContent(String metaContent) {
        int charsetIndex = metaContent.indexOf("charset");
        if (charsetIndex == -1) {
            return null;
        }
        String[] declaration = metaContent.substring(charsetIndex).split("=");
        // Guard against "charset" with no "=value" part, which used to throw ArrayIndexOutOfBoundsException.
        if (declaration.length < 2 || StringUtils.isBlank(declaration[1])) {
            return null;
        }
        return declaration[1].trim();
    }
}
package us.codecraft.webmagic;
import org.junit.Test;
import us.codecraft.webmagic.utils.HttpConstant;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Tests for the equals/hashCode contract of {@link Request}.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/11
 */
public class RequestTest {

    /**
     * Requests with the same url are equal until their HTTP methods differ.
     */
    @Test
    public void testEqualsAndHashCode() throws Exception {
        Request requestA = new Request("http://www.google.com/");
        Request requestB = new Request("http://www.google.com/");
        assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode());
        assertThat(requestA).isEqualTo(requestB);

        // A request with an explicit method differs from one whose method was never set.
        requestA.setMethod(HttpConstant.Method.POST);
        assertThat(requestA).isNotEqualTo(requestB);
        assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());

        // Two different explicit methods differ as well. The original set GET on requestA
        // and immediately overwrote it with POST, so GET vs POST was never compared.
        requestB.setMethod(HttpConstant.Method.GET);
        assertThat(requestA).isNotEqualTo(requestB);
        assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
    }
}
......@@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable;
import com.github.dreamhead.moco.Runner;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
......@@ -103,4 +107,42 @@ public class HttpClientDownloaderTest {
}
});
}
/**
 * Checks that selectRequestMethod puts the request params where each HTTP method expects them:
 * in the query string for GET/DELETE/HEAD/TRACE and in the form body for POST/PUT.
 * A single HTTP client is shared by all calls and closed at the end, instead of building
 * (and leaking) a new client per request.
 */
@Test
public void test_selectRequestMethod() throws Exception {
    HttpServer server = httpserver(12306);
    server.get(eq(query("q"), "webmagic")).response("get");
    server.post(eq(form("q"), "webmagic")).response("post");
    server.put(eq(form("q"), "webmagic")).response("put");
    server.delete(eq(query("q"), "webmagic")).response("delete");
    server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
    server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
            Request request = new Request();
            request.setUrl("http://127.0.0.1:12306/search");
            request.putParams("q", "webmagic");
            CloseableHttpClient httpClient = HttpClients.custom().build();
            try {
                request.setMethod(HttpConstant.Method.GET);
                RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("get");
                request.setMethod(HttpConstant.Method.POST);
                requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("post");
                request.setMethod(HttpConstant.Method.PUT);
                requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("put");
                request.setMethod(HttpConstant.Method.DELETE);
                requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
                // HEAD responses carry no body, so the server answers through a header instead.
                request.setMethod(HttpConstant.Method.HEAD);
                requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(httpClient.execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
                request.setMethod(HttpConstant.Method.TRACE);
                requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
                assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
            } finally {
                // Release the client's connections even when an assertion fails.
                httpClient.close();
            }
        }
    });
}
}
package us.codecraft.webmagic.scheduler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mockito;
import org.mockito.runners.MockitoJUnitRunner;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
/**
 * Verifies when {@link DuplicateRemovedScheduler#push} consults the duplicate remover.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/11
 *         Time: 11:26 AM
 */
@RunWith(MockitoJUnitRunner.class)
public class DuplicateRemovedSchedulerTest {

    /** Scheduler under test; polling is irrelevant here so it always yields null. */
    private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
        @Override
        public Request poll(Task task) {
            return null;
        }
    };

    /** POST requests are pushed without asking the duplicate remover. */
    @Test
    public void test_no_duplicate_removed_for_post_request() throws Exception {
        DuplicateRemover remover = pushWithMethod(HttpConstant.Method.POST);
        verify(remover, times(0)).isDuplicate(any(Request.class), any(Task.class));
    }

    /** GET requests go through the duplicate remover exactly once. */
    @Test
    public void test_duplicate_removed_for_get_request() throws Exception {
        DuplicateRemover remover = pushWithMethod(HttpConstant.Method.GET);
        verify(remover, times(1)).isDuplicate(any(Request.class), any(Task.class));
    }

    /**
     * Installs a fresh mock remover, pushes one request with the given HTTP method
     * (and a null task), and hands the mock back for verification.
     */
    private DuplicateRemover pushWithMethod(String httpMethod) {
        DuplicateRemover remover = Mockito.mock(DuplicateRemover.class);
        duplicateRemovedScheduler.setDuplicateRemover(remover);
        Request candidate = new Request("https://www.google.com/");
        candidate.setMethod(httpMethod);
        duplicateRemovedScheduler.push(candidate, null);
        return remover;
    }
}
......@@ -22,4 +22,20 @@ public class RegexSelectorTest {
String select = regexSelector.select(source);
Assertions.assertThat(select).isEqualTo(source);
}
/**
 * Regexes consisting only of a match plus a zero-width assertion must still
 * select the matched text (the assertion itself is not a capturing group).
 */
@Test
public void testRegexWithZeroWidthAssertions() {
    // Positive lookahead: everything up to, but excluding, the question mark.
    RegexSelector lookaheadSelector = new RegexSelector("^.*(?=\\?)");
    String beforeQuestionMark = lookaheadSelector.select("hello world?xxxx");
    Assertions.assertThat(beforeQuestionMark).isEqualTo("hello world");

    // Negative lookahead: three digits not followed by another digit.
    RegexSelector negativeLookaheadSelector = new RegexSelector("\\d{3}(?!\\d)");
    String lastThreeDigits = negativeLookaheadSelector.select("123456asdf");
    Assertions.assertThat(lastThreeDigits).isEqualTo("456");
}
}
......@@ -20,6 +20,9 @@ public class UrlUtilsTest {
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/");
assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz");
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
......
......@@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl());
}
return isDuplicate;
return jedis.sadd(getSetKey(task), request.getUrl()) > 0;
} finally {
pool.returnResource(jedis);
}
......
......@@ -13,7 +13,7 @@
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.46.0</version>
<version>2.41.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
......
......@@ -45,7 +45,7 @@ class WebDriverPool {
private WebDriver mDriver = null;
private boolean mAutoQuitDriver = true;
private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini";
private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
private static final String DRIVER_FIREFOX = "firefox";
private static final String DRIVER_CHROME = "chrome";
private static final String DRIVER_PHANTOMJS = "phantomjs";
......@@ -64,7 +64,11 @@ class WebDriverPool {
public void configure() throws IOException {
// Read config file
sConfig = new Properties();
sConfig.load(new FileReader(CONFIG_FILE));
String configFile = DEFAULT_CONFIG_FILE;
if (System.getProperty("selenuim_config")!=null){
configFile = System.getProperty("selenuim_config");
}
sConfig.load(new FileReader(configFile));
// Prepare capabilities
sCaps = new DesiredCapabilities();
......
......@@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
if (page.getUrl().toString().contains("pins")) {
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString());
page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
} else {
page.getResultItems().setSkip(true);
}
......
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
phantomjs_driver_path=../../src/main.js
phantomjs_driver_loglevel=DEBUG
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment