Commit 16e12e3b authored by yihua.huang

#27 customize http header for downloader

parent 1a2c84ea
......@@ -8,8 +8,8 @@ import java.util.*;
* Object contains setting for crawler.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Site {
......@@ -38,6 +38,14 @@ public class Site {
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String,String> headers = new HashMap<String, String>();
public static interface HeaderConst {
public static final String REFERER = "Referer";
}
// Adds status code 200 to DEFAULT_STATUS_CODE_SET once, when the class is initialized.
// NOTE(review): the acceptStatCode field is initialised to this same set instance, so an
// in-place change to it would be visible through every Site using the default — confirm
// that is intended.
static {
DEFAULT_STATUS_CODE_SET.add(200);
}
......@@ -139,10 +147,12 @@ public class Site {
/**
* set timeout for downloader in ms
*
* @param timeOut timeout for downloader in ms
*/
public void setTimeOut(int timeOut) {
public Site setTimeOut(int timeOut) {
this.timeOut = timeOut;
return this;
}
/**
......@@ -216,7 +226,7 @@ public class Site {
}
/**
* Get retry times when download fail immediately, 0 by default.<br>
* Get retry times immediately when download fail, 0 by default.<br>
*
* @return retry times when download fail
*/
......@@ -224,6 +234,22 @@ public class Site {
return retryTimes;
}
/**
 * Get the custom http headers configured on this site.
 *
 * @return the map of header name to header value held by this site
 */
public Map<String, String> getHeaders() {
    return this.headers;
}
/**
 * Register a custom http header for the downloader under the given name. <br>
 * Cookies and the user-agent have dedicated methods: see {@link #addCookie(String, String)}
 * and {@link #setUserAgent(String)}. <br>
 *
 * @param key   name of the http header; {@link HeaderConst} offers constants for some names
 * @param value value of the http header
 * @return this site, so that calls can be chained
 */
public Site addHeader(String key, String value) {
    this.headers.put(key, value);
    return this;
}
/**
* Set retry times when download fail, 0 by default.<br>
*
......
......@@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
......@@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader {
int retryTimes = 0;
Set<Integer> acceptStatCode;
String charset = null;
Map<String,String> headers = null;
if (site != null) {
retryTimes = site.getRetryTimes();
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = new HashSet<Integer>();
acceptStatCode.add(200);
......@@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader {
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
try {
HttpGet httpGet = new HttpGet(request.getUrl());
if (headers!=null){
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
}
}
HttpResponse httpResponse = null;
int tried = 0;
boolean retry;
......
......@@ -54,7 +54,7 @@ public class HttpClientPool {
}
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
paramsBean.setVersion(HttpVersion.HTTP_1_1);
if (site != null && site.getCharset() != null) {
......@@ -73,8 +73,7 @@ public class HttpClientPool {
if (site != null) {
generateCookie(httpClient, site);
}
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment