Commit 09153ff7 authored by yihua.huang

#22 HTTP proxy support; #32 update HttpClient to 4.3.1

parent edfc319c
package us.codecraft.webmagic; package us.codecraft.webmagic;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*; import java.util.*;
...@@ -40,6 +41,8 @@ public class Site { ...@@ -40,6 +41,8 @@ public class Site {
private Map<String, String> headers = new HashMap<String, String>(); private Map<String, String> headers = new HashMap<String, String>();
private HttpHost httpProxy;
public static interface HeaderConst { public static interface HeaderConst {
public static final String REFERER = "Referer"; public static final String REFERER = "Referer";
...@@ -295,6 +298,20 @@ public class Site { ...@@ -295,6 +298,20 @@ public class Site {
return this; return this;
} }
/**
 * Returns the HTTP proxy currently stored on this site.
 *
 * @return the value last passed to {@link #setHttpProxy(HttpHost)};
 *         {@code null} when no proxy has been assigned
 */
public HttpHost getHttpProxy() {
    return this.httpProxy;
}
/**
 * Sets the HTTP proxy for this site.
 * The downloader reads this value and, when it is non-null, applies it to
 * the per-request configuration.
 *
 * @param httpProxy the proxy host to store; the value is stored as given,
 *                  without validation
 * @return this site, so that calls can be chained
 */
public Site setHttpProxy(HttpHost httpProxy) {
this.httpProxy = httpProxy;
return this;
}
public Task toTask() { public Task toTask() {
return new Task() { return new Task() {
@Override @Override
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
...@@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText; ...@@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
...@@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
private volatile CloseableHttpClient httpClient; private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private int poolSize = 1; private int poolSize = 1;
...@@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader { ...@@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader {
} }
private CloseableHttpClient getHttpClient(Site site) { private CloseableHttpClient getHttpClient(Site site) {
if (site == null) {
return new HttpClientPool(poolSize).getClient(null);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) { if (httpClient == null) {
synchronized (this) { synchronized (this) {
if (httpClient == null) { if (httpClient == null) {
httpClient = new HttpClientPool(poolSize).getClient(site); httpClient = new HttpClientPool(poolSize).getClient(site);
httpClients.put(domain, httpClient);
} }
} }
} }
...@@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader { ...@@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader {
charset = site.getCharset(); charset = site.getCharset();
headers = site.getHeaders(); headers = site.getHeaders();
} else { } else {
acceptStatCode = new HashSet<Integer>(); acceptStatCode = Sets.newHashSet(200);
acceptStatCode.add(200);
} }
logger.info("downloading page " + request.getUrl()); logger.info("downloading page " + request.getUrl());
HttpGet httpGet = new HttpGet(request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
if (headers != null) { if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue()); requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
}
} }
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
.setConnectionRequestTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut());
if (site.getHttpProxy()!=null){
requestConfigBuilder.setProxy(site.getHttpProxy());
} }
requestBuilder.setConfig(requestConfigBuilder.build());
CloseableHttpResponse httpResponse = null; CloseableHttpResponse httpResponse = null;
try { try {
httpResponse = getHttpClient(site).execute(httpGet); httpResponse = getHttpClient(site).execute(requestBuilder.build());
int statusCode = httpResponse.getStatusLine().getStatusCode(); int statusCode = httpResponse.getStatusLine().getStatusCode();
if (acceptStatCode.contains(statusCode)) { if (acceptStatCode.contains(statusCode)) {
//charset //charset
......
...@@ -78,7 +78,9 @@ public class HttpClientPool { ...@@ -78,7 +78,9 @@ public class HttpClientPool {
} }
}); });
if (site!=null){
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
}
return httpClientBuilder.build(); return httpClientBuilder.build();
} }
......
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.List; import java.util.List;
...@@ -24,8 +26,13 @@ public class OschinaBlog{ ...@@ -24,8 +26,13 @@ public class OschinaBlog{
private List<String> tags; private List<String> tags;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888))
,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); ,new PageModelPipeline() {
@Override
public void process(Object o, Task task) {
}
}, OschinaBlog.class).thread(10).run();
} }
public String getTitle() { public String getTitle() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment