Commit 1d86f7c0 authored by yihua.huang

compile passed in httpclientDownloader

parent b71f3795
...@@ -18,7 +18,6 @@ public class Request implements Serializable { ...@@ -18,7 +18,6 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L; private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
public static final String STATUS_CODE = "statusCode";
public static final String PROXY = "proxy"; public static final String PROXY = "proxy";
private String url; private String url;
......
...@@ -419,8 +419,6 @@ public class Spider implements Runnable, Task { ...@@ -419,8 +419,6 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this); pipeline.process(page.getResultItems(), this);
} }
} }
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime()); sleep(site.getSleepTime());
} }
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.auth.AuthState; import org.apache.http.auth.AuthState;
...@@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task; ...@@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Set;
/** /**
...@@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpResponse httpResponse = null; CloseableHttpResponse httpResponse = null;
int statusCode = 0; int statusCode = 0;
Site site = task.getSite(); Site site = task.getSite();
try { Proxy proxy = null;
Proxy proxy = null; HttpContext httpContext = new BasicHttpContext();
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
proxy = site.getHttpProxyFromPool(); proxy = site.getHttpProxyFromPool();
} else if (site != null && site.getHttpProxy() != null){
proxy = site.getHttpProxy();
request.putExtra(Request.PROXY, site.getHttpProxy());
}
request.putExtra(Request.PROXY, proxy); request.putExtra(Request.PROXY, proxy);
HttpContext httpContext = new BasicHttpContext();
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site);
AuthState authState = new AuthState(); AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password")); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
CloseableHttpClient httpClient = getHttpClient(site, proxy); }
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site);
CloseableHttpClient httpClient = getHttpClient(site);
try {
httpResponse = httpClient.execute(httpUriRequest, httpContext); httpResponse = httpClient.execute(httpUriRequest, httpContext);
statusCode = httpResponse.getStatusLine().getStatusCode(); statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode); if (site.getAcceptStatCode().contains(statusCode)) {
if (statusAccept(acceptStatCode, statusCode)) { Page page = handleResponse(request, site.getCharset(), httpResponse, task);
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request); onSuccess(request);
return page; return page;
} else { } else {
...@@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader {
//ensure the connection is released back to pool //ensure the connection is released back to pool
EntityUtils.consumeQuietly(httpResponse.getEntity()); EntityUtils.consumeQuietly(httpResponse.getEntity());
} }
request.putExtra(Request.STATUS_CODE, statusCode); if (proxy != null) {
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { site.getHttpProxyPool().returnProxy(proxy, statusCode);
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
.getExtra(Request.STATUS_CODE));
} }
} }
} }
...@@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader {
httpClientGenerator.setPoolSize(thread); httpClientGenerator.setPoolSize(thread);
} }
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
return acceptStatCode.contains(statusCode);
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getContent(charset, httpResponse); String content = getContent(charset, httpResponse);
Page page = new Page(); Page page = new Page();
......
...@@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder; ...@@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.message.BasicNameValuePair; import org.apache.http.message.BasicNameValuePair;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.HttpConstant;
import java.nio.charset.Charset; import java.nio.charset.Charset;
...@@ -26,7 +25,7 @@ import java.util.Map; ...@@ -26,7 +25,7 @@ import java.util.Map;
*/ */
public class HttpUriRequestConverter { public class HttpUriRequestConverter {
public HttpUriRequest convert(Request request, Site site, Proxy proxy) { public HttpUriRequest convert(Request request, Site site) {
return null; return null;
} }
......
...@@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy; ...@@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy;
public class Proxy { public class Proxy {
private ProxyHost proxyHost; private ProxyHost proxyHost;
private String user; private String username;
private String password; private String password;
public Proxy(ProxyHost proxyHost, String user, String password) { public Proxy(ProxyHost proxyHost, String username, String password) {
this.proxyHost = proxyHost; this.proxyHost = proxyHost;
this.user = user; this.username = username;
this.password = password; this.password = password;
} }
...@@ -28,12 +28,12 @@ public class Proxy { ...@@ -28,12 +28,12 @@ public class Proxy {
this.proxyHost = proxyHost; this.proxyHost = proxyHost;
} }
public String getUser() { public String getUsername() {
return user; return username;
} }
public void setUser(String user) { public void setUsername(String username) {
this.user = user; this.username = username;
} }
public String getPassword() { public String getPassword() {
......
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
/** /**
* Created by edwardsbean on 15-2-28. * Created by edwardsbean on 15-2-28.
*/ */
public interface ProxyPool { public interface ProxyPool {
void returnProxy(HttpHost host, int statusCode); void returnProxy(Proxy proxy, int statusCode);
Proxy getProxy(); Proxy getProxy();
......
...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase; ...@@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.ProxyUtils; import us.codecraft.webmagic.utils.ProxyUtils;
import java.io.*; import java.io.*;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*; import java.util.*;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
...@@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool { ...@@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool {
isEnable = true; isEnable = true;
for (Proxy proxy : httpProxyList) { for (Proxy proxy : httpProxyList) {
if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval); TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
proxyQueue.add(p); proxyQueue.add(p);
allProxy.put(p.getProxyHost().getHost(), p); allProxy.put(p.getProxyHost().getHost(), p);
} }
...@@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool { ...@@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool {
return proxy; return proxy;
} }
public void returnProxy(HttpHost host, int statusCode) { public void returnProxy(Proxy proxy, int statusCode) {
TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress()); TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
if (p == null) { if (p == null) {
return; return;
} }
...@@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool { ...@@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool {
// banned,try longer interval // banned,try longer interval
p.fail(TimerReuseProxy.ERROR_403); p.fail(TimerReuseProxy.ERROR_403);
p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break; break;
case TimerReuseProxy.ERROR_BANNED: case TimerReuseProxy.ERROR_BANNED:
p.fail(TimerReuseProxy.ERROR_BANNED); p.fail(TimerReuseProxy.ERROR_BANNED);
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
logger.warn("this proxy is banned >>>> " + p.getHttpHost()); logger.warn("this proxy is banned >>>> " + p.getHttpHost());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break; break;
case TimerReuseProxy.ERROR_404: case TimerReuseProxy.ERROR_404:
// p.fail(Proxy.ERROR_404); // p.fail(Proxy.ERROR_404);
...@@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool { ...@@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool {
} }
if (p.getFailedNum() > 20) { if (p.getFailedNum() > 20) {
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
} }
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
if (!ProxyUtils.validateProxy(host)) { if (!ProxyUtils.validateProxy(proxy)) {
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment