Commit 6ead04a7 authored by Yihua Huang's avatar Yihua Huang Committed by GitHub

Merge pull request #524 from code4craft/proxyRefactor

parents 25df6650 0f4d6e8b
......@@ -6,7 +6,7 @@
......@@ -3,7 +3,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="" xmlns:xsi="" xsi:schemaLocation="">
\ No newline at end of file
package us.codecraft.webmagic;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
* Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
......@@ -40,17 +39,14 @@ public class Page {
private Selectable url;
private Map<String,List<String>> headers;
private int statusCode;
private boolean needCycleRetry;
private List<Request> targetRequests = new ArrayList<Request>();
* HTTP response headers
private Header[] headers=null;
public Page() {
......@@ -77,7 +73,7 @@ public class Page {
public Html getHtml() {
if (html == null) {
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
html = new Html(rawText, request.getUrl());
return html;
......@@ -217,11 +213,11 @@ public class Page {
return this;
public Header[] getHeaders() {
public Map<String, List<String>> getHeaders() {
return headers;
public void setHeaders(Header[] headers) {
public void setHeaders(Map<String, List<String>> headers) {
this.headers = headers;
......@@ -232,7 +228,9 @@ public class Page {
", resultItems=" + resultItems +
", rawText='" + rawText + '\'' +
", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode +
", needCycleRetry=" + needCycleRetry +
", targetRequests=" + targetRequests +
", headers=" + headers+
package us.codecraft.webmagic;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicHeader;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.UrlUtils;
* Object contains url to crawl.<br>
* It contains some additional information.<br>
......@@ -28,33 +19,24 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
public static final String STATUS_CODE = "statusCode";
public static final String PROXY = "proxy";
private String url;
private String method;
private HttpRequestBody requestBody;
* Store additional information in extras.
private Map<String, Object> extras;
* POST/GET param set
* */
private Map<String,String> params=new HashMap<String, String>();
* Support for JSON, XML or other bodies; on POST, setting this option makes the params and the nameValuePair extra ineffective.
private HttpEntity entity;
* cookies for current url, if not set use Site's cookies
private List<Cookie> cookies=new ArrayList<Cookie>();
private Map<String, String> cookies = new HashMap<String, String>();
private List<Header> headers=new ArrayList<Header>();
private Map<String, String> headers = new HashMap<String, String>();
* Priority of the request.<br>
......@@ -133,27 +115,11 @@ public class Request implements Serializable {
this.method = method;
public Map<String, String> getParams() {
return params;
* set params for request
* <br>
* DO NOT set this for request already has params, like ''
* @param params params
* */
public void setParams(Map<String, String> params) {
this.params = params;
* set params for request
* <br>
* DO NOT set this for request already has params, like ''
* @param key key
* @param value value
* */
public void putParams(String key,String value) {
public int hashCode() {
int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0);
return result;
......@@ -164,63 +130,33 @@ public class Request implements Serializable {
Request request = (Request) o;
if (url != null ? !url.equals(request.url) : request.url != null) return false;
if (method != null ? !method.equals(request.method) : request.method != null) return false;
return params != null ? params.equals(request.params) : request.params == null;
public void addHeader(String name,String value){
Header header=new BasicHeader(name,value);
return method != null ? method.equals(request.method) : request.method == null;
public List<Header> getHeaders(){
return headers;
public Request addCookie(String name, String value) {
cookies.put(name, value);
return this;
public void addCookie(String key,String value){
BasicClientCookie c=new BasicClientCookie(key, value);
public Request addHeader(String name, String value) {
headers.put(name, value);
return this;
public List<Cookie> getCookies() {
public Map<String, String> getCookies() {
return cookies;
public void setCookies(List<Cookie> cookies) {
this.cookies = cookies;
* Set the JSON parameter
public void setJsonParam(String jsonStr,String encoding){
StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding);
* Set the XML parameter
public void setXmlParam(String xmlStr,String encoding){
StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding);
public HttpEntity getEntity() {
return entity;
public Map<String, String> getHeaders() {
return headers;
public void setEntity(HttpEntity entity) {
this.entity = entity;
public HttpRequestBody getRequestBody() {
return requestBody;
public int hashCode() {
int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0);
result = 31 * result + (params != null ? params.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
result = 31 * result + (entity != null ? entity.hashCode() : 0);
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
return result;
public void setRequestBody(HttpRequestBody requestBody) {
this.requestBody = requestBody;
......@@ -229,10 +165,8 @@ public class Request implements Serializable {
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
", params=" + params +
", priority=" + priority +
", headers=" + headers +
", entity=" + entity +
", cookies="+ cookies+
package us.codecraft.webmagic;
import org.apache.http.HttpHost;
import org.apache.http.auth.UsernamePasswordCredentials;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.proxy.SimpleProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
......@@ -28,11 +21,6 @@ public class Site {
private String charset;
* startUrls are the urls the crawler starts with.
private List<Request> startRequests = new ArrayList<Request>();
private int sleepTime = 5000;
private int retryTimes = 0;
......@@ -49,24 +37,8 @@ public class Site {
private Map<String, String> headers = new HashMap<String, String>();
private HttpHost httpProxy;
private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
private ProxyPool httpProxyPool;
private boolean useGzip = true;
* @see us.codecraft.webmagic.utils.HttpConstant.Header
* @deprecated
public static interface HeaderConst {
public static final String REFERER = "Referer";
static {
......@@ -225,52 +197,6 @@ public class Site {
return acceptStatCode;
* get start urls
* @return start urls
* @see #getStartRequests
* @deprecated
public List<String> getStartUrls() {
return UrlUtils.convertToUrls(startRequests);
public List<Request> getStartRequests() {
return startRequests;
* Add a url to start url.<br>
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}
* @param startUrl startUrl
* @return this
* @see Spider#addUrl(String...)
* @deprecated
public Site addStartUrl(String startUrl) {
return addStartRequest(new Request(startUrl));
* Add a url to start url.<br>
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}
* @param startRequest startRequest
* @return this
* @see Spider#addRequest(Request...)
* @deprecated
public Site addStartRequest(Request startRequest) {
if (domain == null && startRequest.getUrl() != null) {
domain = UrlUtils.getDomain(startRequest.getUrl());
return this;
* Set the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
......@@ -350,21 +276,6 @@ public class Site {
return this;
public HttpHost getHttpProxy() {
return httpProxy;
* set up httpProxy for this site
* @param httpProxy httpProxy
* @return this
public Site setHttpProxy(HttpHost httpProxy) {
this.httpProxy = httpProxy;
return this;
public boolean isUseGzip() {
return useGzip;
......@@ -400,7 +311,11 @@ public class Site {
return new Task() {
public String getUUID() {
return Site.this.getDomain();
String uuid = Site.this.getDomain();
if (uuid == null) {
uuid = UUID.randomUUID().toString();
return uuid;
......@@ -428,8 +343,6 @@ public class Site {
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
......@@ -441,7 +354,6 @@ public class Site {
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
......@@ -458,7 +370,6 @@ public class Site {
", userAgent='" + userAgent + '\'' +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
", startRequests=" + startRequests +
", sleepTime=" + sleepTime +
", retryTimes=" + retryTimes +
", cycleRetryTimes=" + cycleRetryTimes +
......@@ -468,53 +379,4 @@ public class Site {
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
* @param proxyPool proxyPool
* @return this
public Site setHttpProxyPool(ProxyPool proxyPool) {
this.httpProxyPool = proxyPool;
return this;
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
* @param httpProxyList httpProxyList
* @param isUseLastProxy isUseLastProxy
* @return this
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy);
return this;
public Site enableHttpProxyPool() {
this.httpProxyPool=new SimpleProxyPool();
return this;
public UsernamePasswordCredentials getUsernamePasswordCredentials() {
return usernamePasswordCredentials;
public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) {
this.usernamePasswordCredentials = usernamePasswordCredentials;
return this;
public ProxyPool getHttpProxyPool() {
return httpProxyPool;
public Proxy getHttpProxyFromPool() {
return httpProxyPool.getProxy();
public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
......@@ -126,7 +126,6 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; = pageProcessor.getSite();
this.startRequests = pageProcessor.getSite().getStartRequests();
......@@ -419,8 +418,6 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this);
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
......@@ -482,7 +479,9 @@ public class Spider implements Runnable, Task {
public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false;
spawnUrl = false;
if (startRequests!=null){
for (Request request : UrlUtils.convertToRequests(urls)) {
package us.codecraft.webmagic.downloader;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
......@@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
......@@ -92,38 +87,20 @@ public class HttpClientGenerator {
return this;
public CloseableHttpClient getClient(Site site, Proxy proxy) {
return generateClient(site, proxy);
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
private CloseableHttpClient generateClient(Site site, Proxy proxy) {
CredentialsProvider credsProvider = null;
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
credsProvider= new BasicCredentialsProvider();
new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){
credsProvider = new BasicCredentialsProvider();
new AuthScope(site.getHttpProxy()),//可以访问的范围
if (site != null && site.getUserAgent() != null) {
if (site.getUserAgent() != null) {
} else {
if (site == null || site.isUseGzip()) {
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
......@@ -140,16 +117,12 @@ public class HttpClientGenerator {
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
if (site != null) {
SocketConfig socketConfig =;
if (site != null) {
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
package us.codecraft.webmagic.downloader;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
* @author
* Date: 17/4/8
* Time: 19:43
* @since 0.7.0
/**
 * Plain holder that pairs an {@link HttpUriRequest} with an {@link HttpClientContext}.
 * Both values are simple mutable properties; the accessors perform no validation or copying.
 */
public class HttpClientRequestContext {
// The HttpClient request object held by this context; no initializer is visible, so it may be null if never set.
private HttpUriRequest httpUriRequest;
// The HttpClient execution context held alongside the request; may likewise be null if never set.
private HttpClientContext httpClientContext;
/** @return the stored request, exactly as last passed to the setter */
public HttpUriRequest getHttpUriRequest() {
return httpUriRequest;
/** @param httpUriRequest the request to store (stored as-is) */
public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
this.httpUriRequest = httpUriRequest;
/** @return the stored client context, exactly as last passed to the setter */
public HttpClientContext getHttpClientContext() {
return httpClientContext;
/** @param httpClientContext the client context to store (stored as-is) */
public void setHttpClientContext(HttpClientContext httpClientContext) {
this.httpClientContext = httpClientContext;
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthState;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.cookie.BasicClientCookie;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.Map;
* @author
* Date: 17/3/18
* Time: 11:28
* @since 0.7.0
/**
 * Converts a webmagic {@code Request} (plus its {@code Site} settings and an optional {@code Proxy})
 * into the objects Apache HttpClient needs: an {@code HttpUriRequest} and an {@code HttpClientContext},
 * bundled in a {@code HttpClientRequestContext}.
 * NOTE(review): this listing has lost some lines and tokens; comments describe only what is visible here.
 */
public class HttpUriRequestConverter {
/**
 * Builds a new request context holding both the converted request and the converted client context.
 *
 * @param request the request to convert
 * @param site    site-level settings (its headers are read during conversion)
 * @param proxy   proxy to route through, or null for none
 * @return a freshly created context with both parts populated
 */
public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
return httpClientRequestContext;
/**
 * Creates the per-request HttpClient context: proxy authentication state and request cookies.
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
HttpClientContext httpContext = new HttpClientContext();
if (proxy != null) {
// Pre-populate the proxy auth state with Basic credentials taken from the proxy.
// NOTE(review): no visible check that the proxy actually has a username/password — confirm null handling.
AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
// Only build a cookie store when the request carries its own cookies.
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
// One cookie per name/value entry; the lines that add it to the store are not visible in this listing.
BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
return httpContext;
/**
 * Builds the HttpClient request: method and URI first, then site headers, request config
 * (including the proxy host), and finally the request's own headers.
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
// NOTE(review): site is dereferenced here, yet a `site != null` check appears further down — confirm whether site may be null.
if (site.getHeaders() != null) {
for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
if (site != null) {
if (proxy != null) {
// Route the request through the given proxy host and port.
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
// NOTE(review): the right-hand side of this assignment is missing from the listing.
HttpUriRequest httpUriRequest =;
// Request-level headers are added after the site-level ones.
if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
httpUriRequest.addHeader(header.getKey(), header.getValue());
return httpUriRequest;
/**
 * Picks the RequestBuilder matching the request's HTTP method (case-insensitive).
 * POST and PUT are passed through {@code addFormParams}; the other methods use the builder as-is.
 *
 * @throws IllegalArgumentException if the method is not one of the handled names
 */
private RequestBuilder selectRequestMethod(Request request) {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
// A null method defaults to GET.
return RequestBuilder.get();
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
// NOTE(review): the first argument is missing from this listing.
return addFormParams(,request);
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return addFormParams(RequestBuilder.put(), request);
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return RequestBuilder.trace();
throw new IllegalArgumentException("Illegal HTTP Method " + method);
/**
 * Wraps the request body bytes, when a body is present, in a ByteArrayEntity and returns the same builder.
 * The lines that attach the entity to the builder are not visible in this listing.
 */
private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
if (request.getRequestBody() != null) {
ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
return requestBuilder;
package us.codecraft.webmagic.model;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
* @author
* Date: 17/4/8
/**
 * Value object describing an HTTP request body: the raw bytes, a content-type string and
 * the name of the encoding. Instances are normally created through the static factory methods below.
 */
public class HttpRequestBody {
/** Content-type string constants used by the factory methods. */
public static abstract class ContentType {
public static final String JSON = "application/json";
public static final String XML = "text/xml";
public static final String FORM = "application/x-www-form-urlencoded";
public static final String MULTIPART = "multipart/form-data";
// Raw body bytes; stored and returned without copying.
private final byte[] body;
// Content-type string, e.g. one of the ContentType constants.
private final String contentType;
// Charset name that was passed in alongside the body.
private final String encoding;
/**
 * @param body        raw body bytes (kept by reference, not copied)
 * @param contentType content-type string
 * @param encoding    charset name associated with the body
 */
public HttpRequestBody(byte[] body, String contentType, String encoding) {
this.body = body;
this.contentType = contentType;
this.encoding = encoding;
/** @return the content-type string given at construction */
public String getContentType() {
return contentType;
/** @return the charset name given at construction */
public String getEncoding() {
return encoding;
/**
 * Creates a JSON body by encoding the string with the given charset.
 *
 * @param json     JSON text
 * @param encoding charset name passed to {@code String.getBytes}
 * @throws UnsupportedEncodingException if the charset name is not supported
 */
public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException {
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
/**
 * Creates an XML body by encoding the string with the given charset.
 *
 * @param xml      XML text
 * @param encoding charset name passed to {@code String.getBytes}
 * @throws UnsupportedEncodingException if the charset name is not supported
 */
public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException {
return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
/**
 * Creates a body from already-encoded bytes and an arbitrary content type; no encoding is performed here.
 * The checked exception is declared, but nothing in the visible body throws it.
 */
public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException {
return new HttpRequestBody(body, contentType, encoding);
/**
 * Creates a form body: each map entry becomes a name/value pair (value via {@code String.valueOf}),
 * the pairs are formatted with {@code URLEncodedUtils.format}, and the result is encoded to bytes.
 *
 * @param params   form fields; must not be null, since {@code params.size()} is called
 * @param encoding charset name used for both URL-encoding and the byte conversion
 * @throws UnsupportedEncodingException if the charset name is not supported
 */
public static HttpRequestBody form(Map<String,Object> params, String encoding) throws UnsupportedEncodingException {
List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(params.size());
for (Map.Entry<String, Object> entry : params.entrySet()) {
nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding);
/** @return the internal byte array itself (not a copy) */
public byte[] getBody() {
return body;
......@@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
......@@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor {
private Site site;
public SimplePageProcessor(String startUrl, String urlPattern) { =
public SimplePageProcessor(String urlPattern) { =;
//compile "*" expression to regex
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Response Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
* The object goes through the lifecycle states shown above.<br>
* @author <br>
* @since 0.5.1
* @see SimpleProxyPool
public class Proxy implements Delayed, Serializable {
private static final long serialVersionUID = 228939737383625551L;
public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404;
public static final int ERROR_BANNED = 10000;// banned by website
public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200;
public class Proxy {
private final HttpHost httpHost;
private String user;
private String host;
private int port;
private String username;
private String password;
private int reuseTimeInterval = 1500;// ms
private Long canReuseTime = 0L;
private Long lastBorrowTime = System.currentTimeMillis();
private Long responseTime = 0L;
private int failedNum = 0;
private int successNum = 0;
private int borrowNum = 0;
private List<Integer> failedErrorType = new ArrayList<Integer>();
public Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
public Proxy(String host, int port) { = host;
this.port = port;
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
this.httpHost = httpHost;
this.user = user;
public Proxy(String host, int port, String username, String password) { = host;
this.port = port;
this.username = username;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
public int getSuccessNum() {
return successNum;
public void successNumIncrement(int increment) {
this.successNum += increment;
public Long getLastUseTime() {
return lastBorrowTime;
public void setLastBorrowTime(Long lastBorrowTime) {
this.lastBorrowTime = lastBorrowTime;
public void recordResponse() {
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
this.lastBorrowTime = System.currentTimeMillis();
public List<Integer> getFailedErrorType() {
return failedErrorType;
public void setFailedErrorType(List<Integer> failedErrorType) {
this.failedErrorType = failedErrorType;
public void fail(int failedErrorType) {
public String getHost() {
return host;
public void setFailedNum(int failedNum) {
this.failedNum = failedNum;
public int getPort() {
return port;
public int getFailedNum() {
return failedNum;
public String getUsername() {
return username;
public String getFailedType() {
String re = "";
for (Integer i : this.failedErrorType) {
re += i + " . ";
return re;
public HttpHost getHttpHost() {
return httpHost;
public int getReuseTimeInterval() {
return reuseTimeInterval;
public void setReuseTimeInterval(int reuseTimeInterval) {
this.reuseTimeInterval = reuseTimeInterval;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
public long getDelay(TimeUnit unit) {
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
public int compareTo(Delayed o) {
Proxy that = (Proxy) o;
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
public String toString() {
String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
successNum * 100.0 / borrowNum, borrowNum);
return re;
public String getUser()
return user;
public String getPassword()
public String getPassword() {
return password;
public void borrowNumIncrement(int increment) {
this.borrowNum += increment;
public int getBorrowNum() {
return borrowNum;
package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.Task;
* Created by edwardsbean on 15-2-28.
public interface ProxyPool {
public void returnProxy(HttpHost host, int statusCode);
public Proxy getProxy();
public boolean isEnable();
public interface ProxyProvider {
void returnProxy(Proxy proxy, boolean banned, Task task);
Proxy getProxy(Task task);
package us.codecraft.webmagic.proxy;
import org.apache.http.HttpResponse;
* @author
* Date: 17/3/20
* Time: 10:52 PM
/**
 * Callback that inspects an HTTP response and reports whether it counts as "banned".
 * NOTE(review): the criteria for "banned" are left to implementations; nothing here defines them.
 */
public interface ResponseChecker {
/**
 * @param httpResponse the response to inspect
 * @return true if the implementation considers the response banned
 */
boolean isBanned(HttpResponse httpResponse);
package us.codecraft.webmagic.proxy;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Response Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
* The object goes through the lifecycle states shown above.<br>
* @author <br>
* @since 0.5.1
* @see TimerReuseProxyPool
/**
 * A {@code Proxy} that additionally tracks usage statistics (borrow/success/failure counts,
 * a smoothed response time) and a "can be reused at" timestamp, which it exposes through
 * {@link Delayed} so instances can be ordered by when they become reusable.
 * NOTE(review): this listing has lost some lines (e.g. the body of {@code fail}); comments cover only what is visible.
 */
public class TimerReuseProxy extends Proxy implements Delayed, Serializable {
private static final long serialVersionUID = 228939737383625551L;
// Status/error codes used when reporting how a proxy performed.
public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404;
public static final int ERROR_BANNED = 10000;// banned by website
public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200;
private int reuseTimeInterval = 1500;// ms
// Timestamp on the System.nanoTime() scale after which the proxy may be reused; only updated by setReuseTimeInterval.
private Long canReuseTime = 0L;
// Wall-clock time (System.currentTimeMillis) of the last borrow or recorded response.
private Long lastBorrowTime = System.currentTimeMillis();
// Running response-time estimate in ms; see recordResponse().
private Long responseTime = 0L;
private int failedNum = 0;
private int successNum = 0;
private int borrowNum = 0;
// Error codes collected for this proxy.
private List<Integer> failedErrorType = new ArrayList<Integer>();
/** Delegates to the {@code Proxy} constructor; the statistics keep their field defaults. */
public TimerReuseProxy(String host, int port, String username, String password) {
super(host, port, username, password);
/** @return the number of recorded successes */
public int getSuccessNum() {
return successNum;
/** Adds {@code increment} to the success counter. */
public void successNumIncrement(int increment) {
this.successNum += increment;
/** @return the stored last-borrow timestamp (despite the "use" in the name, this is lastBorrowTime) */
public Long getLastUseTime() {
return lastBorrowTime;
/** @param lastBorrowTime new last-borrow timestamp, in ms */
public void setLastBorrowTime(Long lastBorrowTime) {
this.lastBorrowTime = lastBorrowTime;
/**
 * Updates the response-time estimate to the average of the previous estimate and the time
 * elapsed since lastBorrowTime, then resets lastBorrowTime to now.
 */
public void recordResponse() {
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
this.lastBorrowTime = System.currentTimeMillis();
/** @return the internal list of collected error codes (not a copy) */
public List<Integer> getFailedErrorType() {
return failedErrorType;
/** Replaces the list of collected error codes. */
public void setFailedErrorType(List<Integer> failedErrorType) {
this.failedErrorType = failedErrorType;
// NOTE(review): the body of fail(...) is not visible in this listing.
public void fail(int failedErrorType) {
/** @param failedNum new failure count */
public void setFailedNum(int failedNum) {
this.failedNum = failedNum;
/** @return the current failure count */
public int getFailedNum() {
return failedNum;
/** @return the collected error codes joined into one string, each followed by " . " */
public String getFailedType() {
String re = "";
for (Integer i : this.failedErrorType) {
re += i + " . ";
return re;
/** @return the reuse interval in ms */
public int getReuseTimeInterval() {
return reuseTimeInterval;
/**
 * Sets the reuse interval (ms) and pushes canReuseTime to "now + interval" on the nanoTime scale.
 */
public void setReuseTimeInterval(int reuseTimeInterval) {
this.reuseTimeInterval = reuseTimeInterval;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
/** @return the time remaining until canReuseTime, converted to {@code unit}; zero or negative once it has passed */
public long getDelay(TimeUnit unit) {
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
/**
 * Orders proxies by canReuseTime, earliest first.
 * The argument is cast unconditionally, so any other Delayed implementation causes a ClassCastException.
 */
public int compareTo(Delayed o) {
TimerReuseProxy that = (TimerReuseProxy) o;
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
/** Adds {@code increment} to the borrow counter. */
public void borrowNumIncrement(int increment) {
this.borrowNum += increment;
/** @return the number of times this proxy has been borrowed */
public int getBorrowNum() {
return borrowNum;
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Task;
* Pooled Proxy Object
* @author <br>
* @see Proxy
* @since 0.5.1
public class TimerReuseProxyPool implements ProxyProvider {
public void returnProxy(Proxy proxy, boolean banned, Task task) {
public Proxy getProxy(Task task) {
return null;
// private Logger logger = LoggerFactory.getLogger(getClass());
// private BlockingQueue<TimerReuseProxy> proxyQueue = new DelayQueue<TimerReuseProxy>();
// private Map<String, TimerReuseProxy> allProxy = new ConcurrentHashMap<String, TimerReuseProxy>();
// private int reuseInterval = 1500;// ms
// private int reviveTime = 2 * 60 * 60 * 1000;// ms
// private int saveProxyInterval = 10 * 60 * 1000;// ms
// private boolean isEnable = false;
// private boolean validateWhenInit = false;
// // private boolean isUseLastProxy = true;
// public TimerReuseProxyPool(List<String[]> httpProxyList) {
// this(httpProxyList, true);
// }
// private void addProxy(Map<String, Proxy> httpProxyMap) {
// isEnable = true;
// for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
// try {
// if (allProxy.containsKey(entry.getKey())) {
// continue;
// }
// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
// entry.getValue().setFailedNum(0);
// entry.getValue().setReuseTimeInterval(reuseInterval);
// proxyQueue.add(entry.getValue());
// allProxy.put(entry.getKey(), entry.getValue());
// }
// } catch (NumberFormatException e) {
// logger.error("HttpHost init error:", e);
// }
// }
//"proxy pool size>>>>" + allProxy.size());
// }
// public void addProxy(Proxy... httpProxyList) {
// isEnable = true;
// for (Proxy proxy : httpProxyList) {
// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
// proxyQueue.add(p);
// allProxy.put(p.getProxyHost().getHost(), p);
// }
// }
//"proxy pool size>>>>" + allProxy.size());
// }
// public TimerReuseProxy getProxy() {
// TimerReuseProxy proxy = null;
// try {
// Long time = System.currentTimeMillis();
// proxy = proxyQueue.take();
// double costTime = (System.currentTimeMillis() - time) / 1000.0;
// if (costTime > reuseInterval) {
//"get proxy time >>>> " + costTime);
// }
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
// p.setLastBorrowTime(System.currentTimeMillis());
// p.borrowNumIncrement(1);
// } catch (InterruptedException e) {
// logger.error("get proxy error", e);
// }
// if (proxy == null) {
// throw new NoSuchElementException();
// }
// return proxy;
// }
// public void returnProxy(Proxy proxy, int statusCode) {
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
// if (p == null) {
// return;
// }
// switch (statusCode) {
// case TimerReuseProxy.SUCCESS:
// p.setReuseTimeInterval(reuseInterval);
// p.setFailedNum(0);
// p.setFailedErrorType(new ArrayList<Integer>());
// p.recordResponse();
// p.successNumIncrement(1);
// break;
// case TimerReuseProxy.ERROR_403:
// // banned,try longer interval
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
// + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
// break;
// case TimerReuseProxy.ERROR_BANNED:
// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
// + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
// break;
// case TimerReuseProxy.ERROR_404:
// //;
// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
// break;
// default:
// break;
// }
// if (p.getFailedNum() > 20) {
// p.setReuseTimeInterval(reviveTime);
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
// return;
// }
// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
// if (!ProxyUtils.validateProxy(proxy)) {
// p.setReuseTimeInterval(reviveTime);
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
// return;
// }
// }
// try {
// proxyQueue.put(p);
// } catch (InterruptedException e) {
// logger.warn("proxyQueue return proxy error", e);
// }
// }
// public String allProxyStatus() {
// String re = "all proxy info >>>> \n";
// for (Entry<String, Proxy> entry : allProxy.entrySet()) {
// re += entry.getValue().toString() + "\n";
// }
// return re;
// }
// public int getIdleNum() {
// return proxyQueue.size();
// }
// public int getReuseInterval() {
// return reuseInterval;
// }
// public void setReuseInterval(int reuseInterval) {
// this.reuseInterval = reuseInterval;
// }
// public void enable(boolean isEnable) {
// this.isEnable = isEnable;
// }
// public boolean isEnable() {
// return isEnable;
// }
// public int getReviveTime() {
// return reviveTime;
// }
// public void setReviveTime(int reviveTime) {
// this.reviveTime = reviveTime;
// }
// public boolean isValidateWhenInit() {
// return validateWhenInit;
// }
// public void validateWhenInit(boolean validateWhenInit) {
// this.validateWhenInit = validateWhenInit;
// }
// public int getSaveProxyInterval() {
// return saveProxyInterval;
// }
// public void setSaveProxyInterval(int saveProxyInterval) {
// this.saveProxyInterval = saveProxyInterval;
// }
// public String getProxyFilePath() {
// return proxyFilePath;
// }
// public void setProxyFilePath(String proxyFilePath) {
// this.proxyFilePath = proxyFilePath;
// }
......@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
private Document document;
public Html(String text, String url) {
try {
this.document = Jsoup.parse(text, url);
} catch (Exception e) {
this.document = null;
logger.warn("parse document error ", e);
public Html(String text) {
try {
......@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
public Selectable links() {
return xpath("//a/@href");
return selectElements(new LinksSelector());
package us.codecraft.webmagic.selector;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
* Links selector based on jsoup. Use absolute url. <br>
* @author <br>
* @since 0.7.0
public class LinksSelector extends BaseElementSelector {
public String select(Element element) {
throw new UnsupportedOperationException();
public List<String> selectList(Element element) {
Elements elements ="a");
List<String> links = new ArrayList<String>(elements.size());
for (Element element0 : elements) {
if (!StringUtil.isBlank(element0.baseUri())) {
} else {
return links;
public Element selectElement(Element element) {
throw new UnsupportedOperationException();
public List<Element> selectElements(Element element) {
throw new UnsupportedOperationException();
public boolean hasAttribute() {
return true;
package us.codecraft.webmagic.utils;
import org.apache.http.Header;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
* @author
* Date: 17/3/27
public abstract class HttpClientUtils {
public static Map<String,List<String>> convertHeaders(Header[] headers){
Map<String,List<String>> results = new HashMap<String, List<String>>();
for (Header header : headers) {
List<String> list = results.get(header.getName());
if (list == null) {
list = new ArrayList<String>();
results.put(header.getName(), list);
return results;
package us.codecraft.webmagic.utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.proxy.Proxy;
import java.util.Enumeration;
import java.util.regex.Pattern;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
* Pooled Proxy Object
......@@ -23,68 +16,19 @@ import org.slf4j.LoggerFactory;
public class ProxyUtils {
private static InetAddress localAddr;
private static String networkInterface = "eth7";
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
static {
private static void init() {
// first way to get local IP
try {
localAddr = InetAddress.getLocalHost();"local IP:" + localAddr.getHostAddress());
} catch (UnknownHostException e) {"try again\n");
if (localAddr != null) {
// other way to get local IP
Enumeration<InetAddress> localAddrs;
try {
// modify your network interface name
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
if (ni == null) {
localAddrs = ni.getInetAddresses();
if (localAddrs == null || !localAddrs.hasMoreElements()) {
logger.error("choose NetworkInterface\n" + getNetworkInterface());
while (localAddrs.hasMoreElements()) {
InetAddress tmp = localAddrs.nextElement();
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
localAddr = tmp;"local IP:" + localAddr.getHostAddress());
} catch (Exception e) {
logger.error("Failure when init ProxyUtil", e);
logger.error("choose NetworkInterface\n" + getNetworkInterface());
public static boolean validateProxy(HttpHost p) {
if (localAddr == null) {
logger.error("cannot get local IP");
return false;
boolean isReachable = false;
public static boolean validateProxy(Proxy p) {
Socket socket = null;
try {
socket = new Socket();
socket.bind(new InetSocketAddress(localAddr, 0));
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort());
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort());
socket.connect(endpointSocketAddr, 3000);
logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
isReachable = true;
return true;
} catch (IOException e) {
logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
logger.warn("FAILRE - CAN not connect! remote: " + p);
return false;
} finally {
if (socket != null) {
try {
......@@ -94,30 +38,7 @@ public class ProxyUtils {
return isReachable;
private static String getNetworkInterface() {
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
Enumeration<NetworkInterface> enumeration = null;
try {
enumeration = NetworkInterface.getNetworkInterfaces();
} catch (SocketException e1) {
while (enumeration.hasMoreElements()) {
NetworkInterface networkInterface = enumeration.nextElement();
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
while (addr.hasMoreElements()) {
String s = addr.nextElement().getHostAddress();
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
return networkInterfaceName;
......@@ -80,7 +80,7 @@ public class UrlUtils {
if (i > 0) {
domain = StringUtils.substring(domain, 0, i);
return domain;
return removePort(domain);
public static String removePort(String domain) {
......@@ -48,4 +48,14 @@ public class HtmlTest {
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
public void testGetHrefsByJsoup(){
Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","");
html = new Html("<html><base href=''><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
......@@ -19,12 +19,12 @@ public class SpiderTest {
@Ignore("long time")
public void testStartAndStop() throws InterruptedException {
Spider spider = Spider.create(new SimplePageProcessor("", "*")).addPipeline(new Pipeline() {
Spider spider = Spider.create(new SimplePageProcessor( "*")).addPipeline(new Pipeline() {
public void process(ResultItems resultItems, Task task) {
......@@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
* @author May 30, 2014
......@@ -27,30 +24,6 @@ public class ProxyTest {
public void testProxy() {
SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false);
for (int i = 0; i < 2; i++) {
List<Fetch> fetchList = new ArrayList<Fetch>();
while (proxyPool.getIdleNum() != 0) {
Proxy proxy = proxyPool.getProxy();
HttpHost httphost = proxy.getHttpHost();
// httphostList.add(httphost);
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
Fetch tmp = new Fetch(httphost);
for (Fetch fetch : fetchList) {
proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
class Fetch extends Thread {
HttpHost hp;
package us.codecraft.webmagic.selector;
import org.junit.Test;
import java.util.List;
* @author
* Date: 17/4/8
* Time: 下午9:41
public class LinksSelectorTest {
private String html = "<div><a href=''></a></div><div><a href=''></a></div>";
public void testLinks() throws Exception {
List<String> links = new LinksSelector().selectList(html);
......@@ -3,7 +3,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="" xmlns:xsi="" xsi:schemaLocation="">
\ No newline at end of file
package us.codecraft.webmagic.downloader;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.UrlUtils;
* Download file and saved to file for cache.<br>
* @author
* @since 0.2.1
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
private Downloader downloaderWhenFileMiss;
private final PageProcessor pageProcessor;
private Logger logger = LoggerFactory.getLogger(getClass());
public FileCache(String startUrl, String urlPattern) {
this(startUrl, urlPattern, "/data/webmagic/temp/");
public FileCache(String startUrl, String urlPattern, String path) {
this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
downloaderWhenFileMiss = new HttpClientDownloader();
public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
return this;
public Page download(Request request, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
Page page = null;
try {
final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
String line = bufferedReader.readLine();
if (line.equals("url:\t" + request.getUrl())) {
final String html = getHtml(bufferedReader);
page = new Page();
page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
} catch (IOException e) {
if (e instanceof FileNotFoundException) {"File not exist for url " + request.getUrl());
} else {
logger.warn("File read error for url " + request.getUrl(), e);
if (page == null) {
page = downloadWhenMiss(request, task);
return page;
public void setThread(int thread) {
private String getHtml(BufferedReader bufferedReader) throws IOException {
String line;
StringBuilder htmlBuilder = new StringBuilder();
line = bufferedReader.readLine();
line = StringUtils.removeStart(line, "html:\t");
while ((line = bufferedReader.readLine()) != null) {
return htmlBuilder.toString();
private Page downloadWhenMiss(Request request, Task task) {
Page page = null;
if (downloaderWhenFileMiss != null) {
page =, task);
return page;
public void process(ResultItems resultItems, Task task) {
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
printWriter.println("html:\t" + resultItems.get("html"));
} catch (IOException e) {
logger.warn("write file error", e);
public void process(Page page) {
public Site getSite() {
return pageProcessor.getSite();
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
* @author <br>
public class FileCacheTest {
@Ignore("takes long")
public void test() {
FileCache fileCache = new FileCache("", "*");
......@@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor {
public Site getSite() {
......@@ -3,7 +3,7 @@
......@@ -21,7 +21,7 @@ public class DianpingFtlDataScanner implements AfterExtractor {
private List<String> data;
public static void main(String[] args) {
OOSpider.create("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
OOSpider.create(, DianpingFtlDataScanner.class)
......@@ -41,9 +41,10 @@ public class GithubRepo implements HasKey {
private String url;
public static void main(String[] args) {
new JsonFilePageModelPipeline(), GithubRepo.class)
.scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
.setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
......@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
public static void main(String[] args) {
OOSpider.create(""), IteyeBlog.class).run();
OOSpider.create(, IteyeBlog.class).addUrl("").run();
public String getTitle() {
......@@ -32,12 +32,12 @@ public class Kr36NewsModel {
public static void main(String[] args) throws IOException, JMException {
//Just for benchmark
Spider thread = OOSpider.create("").setSleepTime(0), new PageModelPipeline() {
Spider thread = OOSpider.create(, new PageModelPipeline() {
public void process(Object o, Task task) {
}, Kr36NewsModel.class).thread(20);
}, Kr36NewsModel.class).thread(20).addUrl("");
SpiderMonitor spiderMonitor = SpiderMonitor.instance();
......@@ -22,7 +22,7 @@ public class OschinaAnswer implements AfterExtractor{
private String content;
public static void main(String[] args) {
OOSpider.create(""), OschinaAnswer.class).run();
OOSpider.create(, OschinaAnswer.class).addUrl("").run();
......@@ -26,7 +26,7 @@ public class OschinaBlog{
public static void main(String[] args) {
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("")
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
,new PageModelPipeline() {
......@@ -34,7 +34,7 @@ public class OschinaBlog{
public void process(Object o, Task task) {
}, OschinaBlog.class).thread(10).run();
}, OschinaBlog.class).thread(10).addUrl("").run();
public String getTitle() {
......@@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor {
public Site getSite() {
if (site == null) {
site ="").addStartUrl("").
site ="").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return site;
......@@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor {
public Site getSite() {
if (site==null){
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
return site;
public static void main(String[] args) {
Spider.create(new DiaoyuwengProcessor()).run();
Spider.create(new DiaoyuwengProcessor()).addUrl("").run();
......@@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor {
public Site getSite() {
return"").addStartUrl("").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
return"").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
public static void main(String[] args) {
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("").run();
......@@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor {
public Site getSite() {
public static void main(String[] args) {
Spider.create(new HuxiuProcessor()).run();
Spider.create(new HuxiuProcessor()).addUrl("").run();
......@@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public Site getSite() {
if (site == null) {
site ="").addStartUrl("").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
site ="").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return site;
......@@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
......@@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor {
public Site getSite() {
if (site == null) {
site ="").addStartUrl("");
site ="");
return site;
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).run();
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("").run();
......@@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor {
public Site getSite() {
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
public static void main(String[] args) {
Spider.create(new KaichibaProcessor()).run();
Spider.create(new KaichibaProcessor()).addUrl("").run();
......@@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor {
public Site getSite() {
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
public static void main(String[] args) {
Spider.create(new MeicanProcessor()).run();
Spider.create(new MeicanProcessor()).addUrl("").run();
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor {
public Site getSite() {
public static void main(String[] args) {
Spider.create(new NjuBBSProcessor()).addUrl("").run();
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import java.util.List;
* @author <br>
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site ="").addStartUrl("");
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
public Site getSite() {
return site;
public static void main(String[] args) throws JMException {
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
* @author <br>
public class OschinaPageProcesser implements PageProcessor {
public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
public Site getSite() {
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
......@@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor {
public Site getSite() {
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
......@@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor {
public Site getSite() {
return"").addStartUrl(""); //To change body of implemented methods use File | Settings | File Templates.
return""); //To change body of implemented methods use File | Settings | File Templates.
......@@ -28,10 +28,10 @@ public class SpiderTest {
// PageProcessor pageProcessor = new MeicanProcessor();
// FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("", "*-1-1.html");
SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "*-1-1.html");
Spider.create(pageProcessor2).addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
Spider.create(pageProcessor2).addUrl("").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
......@@ -17,7 +17,7 @@ public class ProcessorBenchmark {
public void test() {
ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(""), OschinaBlog.class);
ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(, OschinaBlog.class);
Page page = new Page();
page.setRequest(new Request(""));
page.setUrl(new PlainText(""));
......@@ -3,7 +3,7 @@
......@@ -3,7 +3,7 @@
......@@ -3,7 +3,7 @@
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment