Commit 16e12e3b authored by yihua.huang

#27 customize http header for downloader

parent 1a2c84ea
...@@ -8,8 +8,8 @@ import java.util.*; ...@@ -8,8 +8,8 @@ import java.util.*;
* Object contains setting for crawler.<br> * Object contains setting for crawler.<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor * @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/ */
public class Site { public class Site {
...@@ -38,6 +38,14 @@ public class Site { ...@@ -38,6 +38,14 @@ public class Site {
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET; private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String,String> headers = new HashMap<String, String>();
public static interface HeaderConst {
public static final String REFERER = "Referer";
}
static { static {
DEFAULT_STATUS_CODE_SET.add(200); DEFAULT_STATUS_CODE_SET.add(200);
} }
...@@ -139,10 +147,12 @@ public class Site { ...@@ -139,10 +147,12 @@ public class Site {
/** /**
* set timeout for downloader in ms * set timeout for downloader in ms
*
* @param timeOut * @param timeOut
*/ */
public void setTimeOut(int timeOut) { public Site setTimeOut(int timeOut) {
this.timeOut = timeOut; this.timeOut = timeOut;
return this;
} }
/** /**
...@@ -216,7 +226,7 @@ public class Site { ...@@ -216,7 +226,7 @@ public class Site {
} }
/** /**
* Get retry times when download fail immediately, 0 by default.<br> * Get retry times immediately when download fail, 0 by default.<br>
* *
* @return retry times when download fail * @return retry times when download fail
*/ */
...@@ -224,6 +234,22 @@ public class Site { ...@@ -224,6 +234,22 @@ public class Site {
return retryTimes; return retryTimes;
} }
/**
 * Get the custom HTTP headers configured for this site.
 * <p>
 * The returned map is the live internal map, not a copy: changes made to it
 * are visible to this object.
 *
 * @return map from header name to header value
 */
public Map<String, String> getHeaders() {
    return this.headers;
}
/**
 * Put an HTTP header for the downloader.<br>
 * Use {@link #addCookie(String, String)} for cookies and
 * {@link #setUserAgent(String)} for the user-agent instead of this method.<br>
 * A value previously stored under the same key is replaced.
 *
 * @param key   name of the HTTP header; some common names are available as
 *              constants in {@link HeaderConst}
 * @param value value of the header
 * @return this object, so that calls can be chained
 */
public Site addHeader(String key, String value) {
    this.headers.put(key, value);
    return this;
}
/** /**
* Set retry times when download fail, 0 by default.<br> * Set retry times when download fail, 0 by default.<br>
* *
......
...@@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils; ...@@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map;
import java.util.Set; import java.util.Set;
...@@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader { ...@@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader {
int retryTimes = 0; int retryTimes = 0;
Set<Integer> acceptStatCode; Set<Integer> acceptStatCode;
String charset = null; String charset = null;
Map<String,String> headers = null;
if (site != null) { if (site != null) {
retryTimes = site.getRetryTimes(); retryTimes = site.getRetryTimes();
acceptStatCode = site.getAcceptStatCode(); acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset(); charset = site.getCharset();
headers = site.getHeaders();
} else { } else {
acceptStatCode = new HashSet<Integer>(); acceptStatCode = new HashSet<Integer>();
acceptStatCode.add(200); acceptStatCode.add(200);
...@@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader { ...@@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader {
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
try { try {
HttpGet httpGet = new HttpGet(request.getUrl()); HttpGet httpGet = new HttpGet(request.getUrl());
if (headers!=null){
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
}
}
HttpResponse httpResponse = null; HttpResponse httpResponse = null;
int tried = 0; int tried = 0;
boolean retry; boolean retry;
......
...@@ -54,7 +54,7 @@ public class HttpClientPool { ...@@ -54,7 +54,7 @@ public class HttpClientPool {
} }
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setVersion(HttpVersion.HTTP_1_1);
if (site != null && site.getCharset() != null) { if (site != null && site.getCharset() != null) {
...@@ -73,8 +73,7 @@ public class HttpClientPool { ...@@ -73,8 +73,7 @@ public class HttpClientPool {
if (site != null) { if (site != null) {
generateCookie(httpClient, site); generateCookie(httpClient, site);
} }
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient; return httpClient;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment