Commit ec446277 authored by yihua.huang

Some refactoring in HttpClientDownloader

parent 4a035e72
...@@ -7,6 +7,7 @@ import org.apache.http.annotation.ThreadSafe; ...@@ -7,6 +7,7 @@ import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig; import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
...@@ -75,26 +76,12 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -75,26 +76,12 @@ public class HttpClientDownloader extends AbstractDownloader {
acceptStatCode = Sets.newHashSet(200); acceptStatCode = Sets.newHashSet(200);
} }
logger.info("downloading page {}" , request.getUrl()); logger.info("downloading page {}" , request.getUrl());
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
}
}
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
if (site != null && site.getHttpProxy() != null) {
requestConfigBuilder.setProxy(site.getHttpProxy());
}
requestBuilder.setConfig(requestConfigBuilder.build());
CloseableHttpResponse httpResponse = null; CloseableHttpResponse httpResponse = null;
try { try {
httpResponse = getHttpClient(site).execute(requestBuilder.build()); HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
httpResponse = getHttpClient(site).execute(httpUriRequest);
int statusCode = httpResponse.getStatusLine().getStatusCode(); int statusCode = httpResponse.getStatusLine().getStatusCode();
if (acceptStatCode.contains(statusCode)) { if (statusAccept(acceptStatCode, statusCode)) {
//charset //charset
if (charset == null) { if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
...@@ -123,6 +110,34 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -123,6 +110,34 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
} }
/**
 * Forwards the requested thread count to the HTTP client generator,
 * which uses it as its pool size.
 *
 * @param thread value handed to {@code httpClientGenerator.setPoolSize}
 */
@Override
public void setThread(int thread) {
    final int poolSize = thread;
    this.httpClientGenerator.setPoolSize(poolSize);
}
/**
 * Decides whether a response status code is acceptable.
 *
 * @param acceptStatCode set of status codes considered acceptable
 * @param statusCode     status code to look up in the set
 * @return {@code true} if {@code acceptStatCode} contains {@code statusCode}
 */
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
    // Box explicitly; this is the same conversion autoboxing performs.
    final Integer boxedStatus = Integer.valueOf(statusCode);
    final boolean accepted = acceptStatCode.contains(boxedStatus);
    return accepted;
}
/**
 * Builds the HTTP GET request for the given crawl request.
 *
 * <p>The request targets {@code request.getUrl()}, carries every entry of
 * {@code headers} as a request header, and uses the
 * {@link CookieSpecs#BEST_MATCH} cookie spec. When a {@code site} is supplied,
 * its timeout is applied as the connection-request, socket and connect
 * timeout, and its HTTP proxy (if any) is configured.
 *
 * <p>When {@code site} is {@code null}, no timeout or proxy is set and the
 * {@link RequestConfig} builder keeps its own defaults for those values.
 *
 * @param request the crawl request supplying the target URL
 * @param site    site settings providing timeout and proxy; may be {@code null}
 * @param headers extra request headers; may be {@code null}
 * @return the fully configured request, ready to be executed
 */
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
    RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
    if (headers != null) {
        for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }
    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
            .setCookieSpec(CookieSpecs.BEST_MATCH);
    // Every site-derived setting sits behind one null guard. Previously the
    // timeouts dereferenced site before the null check and could throw a
    // NullPointerException when no site was available.
    if (site != null) {
        requestConfigBuilder
                .setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut());
        if (site.getHttpProxy() != null) {
            requestConfigBuilder.setProxy(site.getHttpProxy());
        }
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    return requestBuilder.build();
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page(); Page page = new Page();
...@@ -132,9 +147,4 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -132,9 +147,4 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
return page; return page;
} }
/**
 * Forwards the requested thread count to the HTTP client generator,
 * which uses it as its pool size.
 *
 * @param thread value handed to {@code httpClientGenerator.setPoolSize}
 */
@Override
public void setThread(int thread) {
    final int poolSize = thread;
    this.httpClientGenerator.setPoolSize(poolSize);
}
} }
...@@ -3,9 +3,11 @@ package us.codecraft.webmagic.model.samples; ...@@ -3,9 +3,11 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import us.codecraft.webmagic.samples.formatter.StringTemplateFormatter;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.util.List; import java.util.List;
...@@ -20,7 +22,6 @@ public class GithubRepo implements HasKey { ...@@ -20,7 +22,6 @@ public class GithubRepo implements HasKey {
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
private String name; private String name;
@Formatter(value = "author%s",formatter = StringTemplateFormatter.class)
@ExtractByUrl("https://github\\.com/(\\w+)/.*") @ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author; private String author;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment