Commit edfc319c authored by yihua.huang

update httpclient to 4.3.1

parent 160a149b
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.2.4</version> <version>4.3.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.HttpClient; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
...@@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
private HttpClientPool httpClientPool; private volatile CloseableHttpClient httpClient;
private int poolSize = 1; private int poolSize = 1;
...@@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader { ...@@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
return (Html) page.getHtml(); return (Html) page.getHtml();
} }
private HttpClientPool getHttpClientPool(){ private CloseableHttpClient getHttpClient(Site site) {
if (httpClientPool==null){ if (httpClient == null) {
httpClientPool = new HttpClientPool(poolSize); synchronized (this) {
if (httpClient == null) {
httpClient = new HttpClientPool(poolSize).getClient(site);
}
}
} }
return httpClientPool; return httpClient;
} }
@Override @Override
...@@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader { ...@@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
if (task != null) { if (task != null) {
site = task.getSite(); site = task.getSite();
} }
int retryTimes = 0;
Set<Integer> acceptStatCode; Set<Integer> acceptStatCode;
String charset = null; String charset = null;
Map<String,String> headers = null; Map<String, String> headers = null;
if (site != null) { if (site != null) {
retryTimes = site.getRetryTimes();
acceptStatCode = site.getAcceptStatCode(); acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset(); charset = site.getCharset();
headers = site.getHeaders(); headers = site.getHeaders();
...@@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader { ...@@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader {
acceptStatCode.add(200); acceptStatCode.add(200);
} }
logger.info("downloading page " + request.getUrl()); logger.info("downloading page " + request.getUrl());
HttpClient httpClient = getHttpClientPool().getClient(site); HttpGet httpGet = new HttpGet(request.getUrl());
try { if (headers != null) {
HttpGet httpGet = new HttpGet(request.getUrl()); for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
if (headers!=null){
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
}
}
if (!httpGet.containsHeader("Accept-Encoding")) {
httpGet.addHeader("Accept-Encoding", "gzip");
} }
HttpResponse httpResponse = null; }
int tried = 0; CloseableHttpResponse httpResponse = null;
boolean retry; try {
do { httpResponse = getHttpClient(site).execute(httpGet);
try {
httpResponse = httpClient.execute(httpGet);
retry = false;
} catch (IOException e) {
tried++;
if (tried > retryTimes) {
logger.warn("download page " + request.getUrl() + " error", e);
if (site.getCycleRetryTimes() > 0) {
Page page = new Page();
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
if (cycleTriedTimesObject == null) {
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
} else {
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
cycleTriedTimes++;
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
return null;
}
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
}
return page;
}
return null;
}
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
retry = true;
}
} while (retry);
int statusCode = httpResponse.getStatusLine().getStatusCode(); int statusCode = httpResponse.getStatusLine().getStatusCode();
if (acceptStatCode.contains(statusCode)) { if (acceptStatCode.contains(statusCode)) {
handleGzip(httpResponse);
//charset //charset
if (charset == null) { if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
...@@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader { ...@@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader {
return handleResponse(request, charset, httpResponse, task); return handleResponse(request, charset, httpResponse, task);
} else { } else {
logger.warn("code error " + statusCode + "\t" + request.getUrl()); logger.warn("code error " + statusCode + "\t" + request.getUrl());
return null;
} }
} catch (Exception e) { } catch (IOException e) {
logger.warn("download page " + request.getUrl() + " error", e); logger.warn("download page " + request.getUrl() + " error", e);
if (site.getCycleRetryTimes() > 0) {
return addToCycleRetry(request, site);
}
return null;
} finally {
try {
if (httpResponse != null) {
httpResponse.close();
}
} catch (IOException e) {
logger.warn("close response fail", e);
}
} }
return null; }
private Page addToCycleRetry(Request request, Site site) {
Page page = new Page();
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
if (cycleTriedTimesObject == null) {
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
} else {
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
cycleTriedTimes++;
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
return null;
}
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
}
return page;
} }
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), String content = EntityUtils.toString(httpResponse.getEntity(), charset);
charset);
Page page = new Page(); Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
...@@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader { ...@@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
@Override @Override
public void setThread(int thread) { public void setThread(int thread) {
poolSize = thread; poolSize = thread;
httpClientPool = new HttpClientPool(thread);
}
private void handleGzip(HttpResponse httpResponse) {
Header ceheader = httpResponse.getEntity().getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
for (HeaderElement codec : codecs) {
if (codec.getName().equalsIgnoreCase("gzip")) {
//todo bugfix
httpResponse.setEntity(
new GzipDecompressingEntity(httpResponse.getEntity()));
}
}
}
} }
} }
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion; import org.apache.http.*;
import org.apache.http.client.CookieStore; import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient; import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.params.ClientPNames; import org.apache.http.config.Registry;
import org.apache.http.client.params.CookiePolicy; import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.*;
import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.*; import org.apache.http.protocol.HttpContext;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import java.io.IOException;
import java.util.Map; import java.util.Map;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0 * @since 0.3.3
*/ */
public class HttpClientPool { public class HttpClientPool {
private int poolSize; private PoolingHttpClientConnectionManager connectionManager;
private PoolingClientConnectionManager connectionManager;
public HttpClientPool(int poolSize) { public HttpClientPool(int poolSize) {
this.poolSize = poolSize; Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
SchemeRegistry schemeRegistry = new SchemeRegistry(); .register("http", PlainConnectionSocketFactory.INSTANCE)
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); .register("https", SSLConnectionSocketFactory.getSocketFactory())
schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); .build();
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager = new PoolingClientConnectionManager(schemeRegistry);
connectionManager.setMaxTotal(poolSize); connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100); connectionManager.setDefaultMaxPerRoute(100);
} }
public HttpClient getClient(Site site) { public CloseableHttpClient getClient(Site site) {
return generateClient(site); return generateClient(site);
} }
private HttpClient generateClient(Site site) { private CloseableHttpClient generateClient(Site site) {
HttpParams params = new BasicHttpParams(); HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
if (site != null && site.getUserAgent() != null) { if (site != null && site.getUserAgent() != null) {
params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); httpClientBuilder.setUserAgent(site.getUserAgent());
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
} else { } else {
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000); httpClientBuilder.setUserAgent("");
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000);
} }
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); public void process(
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); final HttpRequest request,
paramsBean.setVersion(HttpVersion.HTTP_1_1); final HttpContext context) throws HttpException, IOException {
if (site != null && site.getCharset() != null) { if (!request.containsHeader("Accept-Encoding")) {
paramsBean.setContentCharset(site.getCharset()); request.addHeader("Accept-Encoding", "gzip");
} }
paramsBean.setUseExpectContinue(false);
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); }
if (site != null) { }).addInterceptorFirst(new HttpResponseInterceptor() {
generateCookie(httpClient, site);
} public void process(
return httpClient; final HttpResponse response,
final HttpContext context) throws HttpException, IOException {
HttpEntity entity = response.getEntity();
if (entity != null) {
Header ceheader = entity.getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
for (int i = 0; i < codecs.length; i++) {
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
response.setEntity(
new GzipDecompressingEntity(response.getEntity()));
return;
}
}
}
}
}
});
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
return httpClientBuilder.build();
} }
private void generateCookie(DefaultHttpClient httpClient, Site site) { private void generateCookie(DefaultHttpClient httpClient, Site site) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment