Commit fd6d2fd6 authored by yihua.huang

try to keepalive TCP connection

parent 425df085
...@@ -27,13 +27,13 @@ public class Site { ...@@ -27,13 +27,13 @@ public class Site {
*/ */
private List<Request> startRequests = new ArrayList<Request>(); private List<Request> startRequests = new ArrayList<Request>();
private int sleepTime = 3000; private int sleepTime = 5000;
private int retryTimes = 0; private int retryTimes = 0;
private int cycleRetryTimes = 0; private int cycleRetryTimes = 0;
private int timeOut = 2000; private int timeOut = 5000;
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>(); private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
......
...@@ -428,7 +428,10 @@ public class Spider implements Runnable, Task { ...@@ -428,7 +428,10 @@ public class Spider implements Runnable, Task {
public <T> List<T> getAll(Collection<String> urls) { public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false; destroyWhenExit = false;
spawnUrl = false; spawnUrl = false;
startRequests = UrlUtils.convertToRequests(urls); startRequests.clear();
for (Request request : UrlUtils.convertToRequests(urls)) {
addRequest(request);
}
CollectorPipeline collectorPipeline = getCollectorPipeline(); CollectorPipeline collectorPipeline = getCollectorPipeline();
pipelines.add(collectorPipeline); pipelines.add(collectorPipeline);
run(); run();
......
...@@ -37,7 +37,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -37,7 +37,7 @@ public class HttpClientDownloader implements Downloader {
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>(); private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private int poolSize = 1; private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
/** /**
* A simple method to download a url. * A simple method to download a url.
...@@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader { ...@@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader {
private CloseableHttpClient getHttpClient(Site site) { private CloseableHttpClient getHttpClient(Site site) {
if (site == null) { if (site == null) {
return new HttpClientGenerator(poolSize).getClient(null); return httpClientGenerator.getClient(null);
} }
String domain = site.getDomain(); String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain); CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) { if (httpClient == null) {
synchronized (this) { synchronized (this) {
if (httpClient == null) { if (httpClient == null) {
httpClient = new HttpClientGenerator(poolSize).getClient(site); httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient); httpClients.put(domain, httpClient);
} }
} }
...@@ -105,7 +105,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -105,7 +105,7 @@ public class HttpClientDownloader implements Downloader {
.setConnectionRequestTimeout(site.getTimeOut()) .setConnectionRequestTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH); .setCookieSpec(CookieSpecs.BEST_MATCH);
if (site.getHttpProxy() != null) { if (site != null && site.getHttpProxy() != null) {
requestConfigBuilder.setProxy(site.getHttpProxy()); requestConfigBuilder.setProxy(site.getHttpProxy());
} }
requestBuilder.setConfig(requestConfigBuilder.build()); requestBuilder.setConfig(requestConfigBuilder.build());
...@@ -168,6 +168,6 @@ public class HttpClientDownloader implements Downloader { ...@@ -168,6 +168,6 @@ public class HttpClientDownloader implements Downloader {
@Override @Override
public void setThread(int thread) { public void setThread(int thread) {
poolSize = thread; httpClientGenerator.setPoolSize(thread);
} }
} }
...@@ -5,6 +5,7 @@ import org.apache.http.client.CookieStore; ...@@ -5,6 +5,7 @@ import org.apache.http.client.CookieStore;
import org.apache.http.client.protocol.ResponseContentEncoding; import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.config.Registry; import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder; import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
...@@ -25,16 +26,20 @@ public class HttpClientGenerator { ...@@ -25,16 +26,20 @@ public class HttpClientGenerator {
private PoolingHttpClientConnectionManager connectionManager; private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator(int poolSize) { public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create() Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE) .register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", SSLConnectionSocketFactory.getSocketFactory()) .register("https", SSLConnectionSocketFactory.getSocketFactory())
.build(); .build();
connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100); connectionManager.setDefaultMaxPerRoute(100);
} }
/**
 * Forwards the given pool size to the connection manager as its maximum
 * total connection count.
 *
 * @param poolSize value passed to {@code connectionManager.setMaxTotal}
 * @return this generator, so the call can be chained
 */
public HttpClientGenerator setPoolSize(int poolSize){
connectionManager.setMaxTotal(poolSize);
return this;
}
public CloseableHttpClient getClient(Site site) { public CloseableHttpClient getClient(Site site) {
return generateClient(site); return generateClient(site);
} }
...@@ -59,6 +64,8 @@ public class HttpClientGenerator { ...@@ -59,6 +64,8 @@ public class HttpClientGenerator {
} }
}); });
} }
SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
// Http client has some problem handling compressing entity for redirect // Http client has some problem handling compressing entity for redirect
// So I disable it and do it manually // So I disable it and do it manually
// https://issues.apache.org/jira/browse/HTTPCLIENT-1432 // https://issues.apache.org/jira/browse/HTTPCLIENT-1432
......
...@@ -30,10 +30,14 @@ public class BaiduBaikePageProcesser implements PageProcessor { ...@@ -30,10 +30,14 @@ public class BaiduBaikePageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) {
//single download
Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2); Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
List<String> list = new ArrayList<String>();
String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
list.add(String.format(urlTemplate,"水力发电")); ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
System.out.println(resultItems);
//multidownload
List<String> list = new ArrayList<String>();
list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电"));
......
...@@ -13,6 +13,11 @@ ...@@ -13,6 +13,11 @@
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
</logger> </logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false"> <logger name="net.sf.ehcache" additivity="false">
<level value="warn" /> <level value="warn" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
......
...@@ -28,14 +28,22 @@ public class BaiduBaike{ ...@@ -28,14 +28,22 @@ public class BaiduBaike{
} }
public static void main(String[] args) { public static void main(String[] args) {
List<String> list = new ArrayList<String>(); OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class);
//single download
String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
list.add(String.format(urlTemplate,"水力发电")); BaiduBaike baike = ooSpider.<BaiduBaike>get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8");
System.out.println(baike);
//multidownload
List<String> list = new ArrayList<String>();
list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"风力发电"));
list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate, "地热发电")); list.add(String.format(urlTemplate,"地热发电"));
List<BaiduBaike> baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).<BaiduBaike>getAll(list); List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list);
System.out.println(baiduBaikes); for (BaiduBaike resultItemse : resultItemses) {
System.out.println(resultItemse);
}
ooSpider.close();
} }
} }
...@@ -31,7 +31,7 @@ public class OschinaBlog { ...@@ -31,7 +31,7 @@ public class OschinaBlog {
private Date date; private Date date;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me() OOSpider.create(Site.me().setSleepTime(0)
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run(); .addUrl("http://my.oschina.net/flashsword/blog").run();
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment