Commit 86a20eab authored by yihua.huang

Fix an HttpClient pool size bug

parent fcb09f2e
...@@ -39,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger; ...@@ -39,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
*/ */
public class Spider implements Runnable, Task { public class Spider implements Runnable, Task {
private Downloader downloader = new HttpClientDownloader(); private Downloader downloader;
private List<Pipeline> pipelines = new ArrayList<Pipeline>(); private List<Pipeline> pipelines = new ArrayList<Pipeline>();
...@@ -139,12 +139,18 @@ public class Spider implements Runnable, Task { ...@@ -139,12 +139,18 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
/**
 * Fills in components that have not been configured before the spider runs.
 * Only the downloader is checked here: when none has been set, a default
 * {@link HttpClientDownloader} is created.
 */
protected void checkComponent() {
if (downloader == null) {
// No downloader was assigned, so fall back to the default implementation.
this.downloader = new HttpClientDownloader();
}
}
@Override @Override
public void run() { public void run() {
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
throw new IllegalStateException("Spider is already running!"); throw new IllegalStateException("Spider is already running!");
} }
checkComponent();
if (startUrls != null) { if (startUrls != null) {
for (String startUrl : startUrls) { for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this); scheduler.push(new Request(startUrl), this);
...@@ -247,6 +253,7 @@ public class Spider implements Runnable, Task { ...@@ -247,6 +253,7 @@ public class Spider implements Runnable, Task {
if (threadNum <= 0) { if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!"); throw new IllegalArgumentException("threadNum should be more than one!");
} }
downloader = new HttpClientDownloader(threadNum);
if (threadNum == 1) { if (threadNum == 1) {
return this; return this;
} }
......
...@@ -21,6 +21,7 @@ import java.io.IOException; ...@@ -21,6 +21,7 @@ import java.io.IOException;
/** /**
* 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br> * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:15 * Time: 下午12:15
...@@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader { ...@@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
/** Pool size passed to HttpClientPool.getInstance(int) when a page is downloaded. */
private int poolSize;
/**
 * Creates a downloader that uses the given pool size.
 *
 * @param poolSize value stored and later handed to HttpClientPool.getInstance(int)
 */
public HttpClientDownloader(int poolSize) {
this.poolSize = poolSize;
}
/**
 * Creates a downloader with a pool size of 5.
 */
public HttpClientDownloader() {
this(5);
}
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
Site site = task.getSite(); Site site = task.getSite();
logger.info("downloading page " + request.getUrl()); logger.info("downloading page " + request.getUrl());
HttpClient httpClient = HttpClientPool.getInstance().getClient(site); HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
String charset = site.getCharset(); String charset = site.getCharset();
try { try {
HttpGet httpGet = new HttpGet(request.getUrl()); HttpGet httpGet = new HttpGet(request.getUrl());
...@@ -50,7 +61,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -50,7 +61,7 @@ public class HttpClientDownloader implements Downloader {
logger.warn("download page " + request.getUrl() + " error", e); logger.warn("download page " + request.getUrl() + " error", e);
return null; return null;
} }
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!"); logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
retry = true; retry = true;
} }
} while (retry); } while (retry);
......
...@@ -19,14 +19,21 @@ import java.util.Map; ...@@ -19,14 +19,21 @@ import java.util.Map;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:29 * Time: 下午12:29
*/ */
public class HttpClientPool { public class HttpClientPool {
public static final HttpClientPool INSTANCE = new HttpClientPool(5); public static volatile HttpClientPool INSTANCE;
public static HttpClientPool getInstance() { public static HttpClientPool getInstance(int poolSize) {
if (INSTANCE == null) {
synchronized (HttpClientPool.class) {
if (INSTANCE == null) {
INSTANCE = new HttpClientPool(poolSize);
}
}
}
return INSTANCE; return INSTANCE;
} }
...@@ -48,7 +55,7 @@ public class HttpClientPool { ...@@ -48,7 +55,7 @@ public class HttpClientPool {
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setVersion(HttpVersion.HTTP_1_1);
paramsBean.setContentCharset("UTF-8"); paramsBean.setContentCharset(site.getCharset());
paramsBean.setUseExpectContinue(false); paramsBean.setUseExpectContinue(false);
SchemeRegistry schemeRegistry = new SchemeRegistry(); SchemeRegistry schemeRegistry = new SchemeRegistry();
......
...@@ -27,8 +27,8 @@ public class GlobalProcessor implements PageProcessor { ...@@ -27,8 +27,8 @@ public class GlobalProcessor implements PageProcessor {
@Override @Override
public Site getSite() { public Site getSite() {
if (site==null){ if (site == null) {
site = Site.me().setDomain("www.2345.com") site = Site.me().setDomain("www.2345.com").setSleepTime(0)
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment