Commit fd6d2fd6 authored by yihua.huang's avatar yihua.huang

Try to keep TCP connections alive

parent 425df085
......@@ -27,13 +27,13 @@ public class Site {
*/
private List<Request> startRequests = new ArrayList<Request>();
private int sleepTime = 3000;
private int sleepTime = 5000;
private int retryTimes = 0;
private int cycleRetryTimes = 0;
private int timeOut = 2000;
private int timeOut = 5000;
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
......
......@@ -428,7 +428,10 @@ public class Spider implements Runnable, Task {
public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false;
spawnUrl = false;
startRequests = UrlUtils.convertToRequests(urls);
startRequests.clear();
for (Request request : UrlUtils.convertToRequests(urls)) {
addRequest(request);
}
CollectorPipeline collectorPipeline = getCollectorPipeline();
pipelines.add(collectorPipeline);
run();
......
......@@ -37,7 +37,7 @@ public class HttpClientDownloader implements Downloader {
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private int poolSize = 1;
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
/**
* A simple method to download a url.
......@@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader {
private CloseableHttpClient getHttpClient(Site site) {
if (site == null) {
return new HttpClientGenerator(poolSize).getClient(null);
return httpClientGenerator.getClient(null);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
if (httpClient == null) {
httpClient = new HttpClientGenerator(poolSize).getClient(site);
httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient);
}
}
......@@ -105,7 +105,7 @@ public class HttpClientDownloader implements Downloader {
.setConnectionRequestTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
if (site.getHttpProxy() != null) {
if (site != null && site.getHttpProxy() != null) {
requestConfigBuilder.setProxy(site.getHttpProxy());
}
requestBuilder.setConfig(requestConfigBuilder.build());
......@@ -168,6 +168,6 @@ public class HttpClientDownloader implements Downloader {
/**
 * Sets the thread count used to size the HTTP connection pool.
 * <p>
 * NOTE(review): this span comes from a rendered diff without +/- markers; the
 * hunk header reports equal old/new line counts, so the two statements below
 * are presumably the removed and the added version of the same line. Confirm
 * against the repository before treating both as live code.
 *
 * @param thread value stored in {@code poolSize} and passed to
 *               {@code httpClientGenerator.setPoolSize}
 */
@Override
public void setThread(int thread) {
poolSize = thread;
httpClientGenerator.setPoolSize(thread);
}
}
......@@ -5,6 +5,7 @@ import org.apache.http.client.CookieStore;
import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
......@@ -25,16 +26,20 @@ public class HttpClientGenerator {
private PoolingHttpClientConnectionManager connectionManager;
/**
 * Builds this generator's pooling connection manager with socket factories
 * registered for the "http" and "https" schemes and a default per-route
 * connection limit of 100.
 * <p>
 * NOTE(review): rendered diff without +/- markers. Two constructor headers
 * appear back to back, and {@code setMaxTotal(poolSize)} only resolves with
 * the {@code int poolSize} signature; presumably the parameterized header and
 * that call are the removed side of the commit. Confirm against the repository.
 */
public HttpClientGenerator(int poolSize) {
public HttpClientGenerator() {
// Map each URL scheme to the socket factory used to open its connections.
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", SSLConnectionSocketFactory.getSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100);
}
/**
 * Sets the maximum total number of connections on this generator's
 * connection manager.
 *
 * @param poolSize value passed to {@code connectionManager.setMaxTotal}
 * @return this generator, so calls can be chained
 */
public HttpClientGenerator setPoolSize(int poolSize){
connectionManager.setMaxTotal(poolSize);
return this;
}
/**
 * Returns an HTTP client for the given site by delegating to
 * {@code generateClient}.
 *
 * @param site site to build the client for; how {@code null} is handled is
 *             decided inside {@code generateClient}, which is not fully
 *             visible in this view
 * @return the client produced by {@code generateClient(site)}
 */
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
......@@ -59,6 +64,8 @@ public class HttpClientGenerator {
}
});
}
SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
// HttpClient has a problem handling compressed entities on redirects,
// so that handling is disabled here and done manually instead.
// https://issues.apache.org/jira/browse/HTTPCLIENT-1432
......
......@@ -30,10 +30,14 @@ public class BaiduBaikePageProcesser implements PageProcessor {
}
public static void main(String[] args) {
//single download
Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
List<String> list = new ArrayList<String>();
String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
list.add(String.format(urlTemplate,"水力发电"));
ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
System.out.println(resultItems);
//multidownload
List<String> list = new ArrayList<String>();
list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
......
......@@ -13,6 +13,11 @@
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
......
......@@ -28,14 +28,22 @@ public class BaiduBaike{
}
/**
 * Demo entry point: fetches one page with {@code get}, then several pages
 * with {@code getAll}, and prints the results.
 * <p>
 * NOTE(review): rendered diff without +/- markers. Removed and added lines
 * are interleaved (for example, {@code list} is declared on two separate
 * lines), so this span is not compilable as shown. Confirm which lines are
 * live against the repository.
 */
public static void main(String[] args) {
List<String> list = new ArrayList<String>();
OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class);
//single download
String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
list.add(String.format(urlTemplate,"水力发电"));
BaiduBaike baike = ooSpider.<BaiduBaike>get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8");
System.out.println(baike);
//multidownload
List<String> list = new ArrayList<String>();
list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate, "地热发电"));
List<BaiduBaike> baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).<BaiduBaike>getAll(list);
System.out.println(baiduBaikes);
list.add(String.format(urlTemplate,"地热发电"));
List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list);
for (BaiduBaike resultItemse : resultItemses) {
System.out.println(resultItemse);
}
// Close the spider that was reused for both the single and the multi download.
ooSpider.close();
}
}
......@@ -31,7 +31,7 @@ public class OschinaBlog {
private Date date;
/**
 * Demo entry point: creates an {@code OOSpider} for {@code OschinaBlog} with a
 * {@code JsonFilePageModelPipeline} constructed with "/data/webmagic/", adds
 * the start URL and runs it.
 * <p>
 * NOTE(review): rendered diff without +/- markers. The two
 * {@code OOSpider.create(...} lines are presumably the removed and the added
 * version of the same line (the latter adding {@code setSleepTime(0)}).
 * Confirm against the repository.
 */
public static void main(String[] args) {
OOSpider.create(Site.me()
OOSpider.create(Site.me().setSleepTime(0)
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment