Commit 09153ff7 authored by yihua.huang

#22 http proxy support #32 update httpclient to 4.3.1

parent edfc319c
package us.codecraft.webmagic;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
......@@ -40,6 +41,8 @@ public class Site {
private Map<String, String> headers = new HashMap<String, String>();
private HttpHost httpProxy;
public static interface HeaderConst {
public static final String REFERER = "Referer";
......@@ -295,6 +298,20 @@ public class Site {
return this;
}
/**
 * Returns the HTTP proxy configured for this site.
 *
 * @return the proxy host, or {@code null} when no proxy has been set
 */
public HttpHost getHttpProxy() {
    return this.httpProxy;
}
/**
 * Sets the HTTP proxy for this site.
 *
 * @param httpProxy the proxy host to store; it is kept as given, without validation
 * @return this {@code Site}, so that configuration calls can be chained
 */
public Site setHttpProxy(HttpHost httpProxy) {
this.httpProxy = httpProxy;
return this;
}
public Task toTask() {
return new Task() {
@Override
......
package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
......@@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
......@@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass());
private volatile CloseableHttpClient httpClient;
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private int poolSize = 1;
......@@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader {
}
/**
 * Looks up the HTTP client cached for the site's domain, creating and caching one on first use.
 * A {@code null} site is served by a freshly built client that is not cached.
 *
 * @param site the site whose domain keys the client cache; may be {@code null}
 */
private CloseableHttpClient getHttpClient(Site site) {
    if (site == null) {
        return new HttpClientPool(poolSize).getClient(null);
    }
    String domain = site.getDomain();
    // NOTE(review): this first lookup reads the plain HashMap without holding the lock;
    // consider making the httpClients field a ConcurrentHashMap.
    CloseableHttpClient httpClient = httpClients.get(domain);
    if (httpClient == null) {
        synchronized (this) {
            // Re-read the cache under the lock. Re-testing only the local variable would
            // always see null here, so two threads racing on the same domain would each
            // build a client and the later put would silently replace the earlier one.
            httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = new HttpClientPool(poolSize).getClient(site);
                httpClients.put(domain, httpClient);
            }
        }
    }
......@@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader {
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = new HashSet<Integer>();
acceptStatCode.add(200);
acceptStatCode = Sets.newHashSet(200);
}
logger.info("downloading page " + request.getUrl());
HttpGet httpGet = new HttpGet(request.getUrl());
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
}
}
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
.setConnectionRequestTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut());
if (site.getHttpProxy()!=null){
requestConfigBuilder.setProxy(site.getHttpProxy());
}
requestBuilder.setConfig(requestConfigBuilder.build());
CloseableHttpResponse httpResponse = null;
try {
httpResponse = getHttpClient(site).execute(httpGet);
httpResponse = getHttpClient(site).execute(requestBuilder.build());
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (acceptStatCode.contains(statusCode)) {
//charset
......
......@@ -78,7 +78,9 @@ public class HttpClientPool {
}
});
if (site!=null){
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
}
return httpClientBuilder.build();
}
......
package us.codecraft.webmagic.model.samples;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.List;
......@@ -24,8 +26,13 @@ public class OschinaBlog{
private List<String> tags;
/**
 * Sample entry point: builds an {@code OOSpider} for {@code OschinaBlog} starting at the
 * oschina blog URL and runs it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
// NOTE(review): two crawl invocations appear back to back. This looks like the removed and
// the added version of the same call shown together by the diff view - confirm which is current.
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
// This invocation sets a browser user agent, a sleep time of 0 and an HTTP proxy at
// 127.0.0.1:8888, passes a pipeline whose process method is empty, and calls thread(10).
OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888))
,new PageModelPipeline() {
@Override
public void process(Object o, Task task) {
// No-op: the extracted object is ignored.
}
}, OschinaBlog.class).thread(10).run();
}
public String getTitle() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment