Commit 8ba2da14 authored by yihua.huang's avatar yihua.huang

request method #108 and more cookie #109 config

parent b06aa489
......@@ -21,6 +21,8 @@ public class Request implements Serializable {
private String url;
private String method;
/**
* Store additional information in extras.
*/
......@@ -106,10 +108,25 @@ public class Request implements Serializable {
this.url = url;
}
/**
 * Returns the HTTP method this request should be sent with.
 * <p>
 * The value is {@code null} until {@link #setMethod(String)} is called;
 * HttpClientDownloader treats {@code null} as GET.
 *
 * @return httpMethod, possibly {@code null}
 * @see us.codecraft.webmagic.constant.HttpConstant.Method
 * @since 0.5.0
 */
public String getMethod() {
    return this.method;
}
/**
 * Sets the HTTP method of this request.
 *
 * @param method the method name, e.g. one of the constants in
 *               {@code us.codecraft.webmagic.constant.HttpConstant.Method}
 */
public void setMethod(String method) {
this.method = method;
}
@Override
public String toString() {
return "Request{" +
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
", priority=" + priority +
'}';
......
package us.codecraft.webmagic;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.utils.UrlUtils;
......@@ -18,7 +20,9 @@ public class Site {
private String userAgent;
private Map<String, String> cookies = new LinkedHashMap<String, String>();
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
private Table<String, String, String> cookies = HashBasedTable.create();
private String charset;
......@@ -45,6 +49,10 @@ public class Site {
private boolean useGzip = true;
/**
* @see us.codecraft.webmagic.constant.HttpConstant.Header
* @deprecated
*/
public static interface HeaderConst {
public static final String REFERER = "Referer";
......@@ -72,7 +80,20 @@ public class Site {
* @return this
*/
public Site addCookie(String name, String value) {
    // Cookies added without an explicit domain live in defaultCookies.
    // The domain-aware table `cookies` needs a (domain, name, value) triple
    // and is filled by addCookie(String, String, String) instead.
    defaultCookies.put(name, value);
    return this;
}
/**
 * Add a cookie bound to a specific domain.
 *
 * @param domain the domain the cookie belongs to (row key of the cookie table)
 * @param name   the cookie name (column key of the cookie table)
 * @param value  the cookie value
 * @return this
 */
public Site addCookie(String domain, String name, String value) {
    this.cookies.put(domain, name, value);
    return this;
}
......@@ -93,6 +114,25 @@ public class Site {
* @return get cookies
*/
public Map<String, String> getCookies() {
    // Only the cookies added without an explicit domain are exposed here.
    return this.defaultCookies;
}
/**
 * Get the cookies that were added with an explicit domain, grouped by domain.
 *
 * @return a map from domain to that domain's cookie name/value pairs
 */
public Map<String, Map<String, String>> getAllCookies() {
    // addCookie(domain, name, value) stores the domain as the ROW key and the
    // cookie name as the COLUMN key, so the per-domain view is rowMap().
    // columnMap() would be keyed by cookie name and make callers treat cookie
    // names as domains.
    return cookies.rowMap();
}
/**
 * Get the domain-specific cookies as the underlying table
 * (row = domain, column = cookie name, value = cookie value).
 *
 * @return the cookie table
 */
public Table<String, String, String> getaCookies() {
    // The former bare cookies.columnMap() call discarded its result and was removed.
    return cookies;
}
......@@ -203,10 +243,10 @@ public class Site {
* Add a url to start url.<br>
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
*
* @deprecated
* @see Spider#addUrl(String...)
* @param startUrl
* @return this
* @see Spider#addUrl(String...)
* @deprecated
*/
public Site addStartUrl(String startUrl) {
return addStartRequest(new Request(startUrl));
......@@ -216,10 +256,10 @@ public class Site {
* Add a url to start url.<br>
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
*
* @deprecated
* @see Spider#addRequest(Request...)
* @param startRequest
* @return this
* @see Spider#addRequest(Request...)
* @deprecated
*/
public Site addStartRequest(Request startRequest) {
this.startRequests.add(startRequest);
......@@ -312,6 +352,7 @@ public class Site {
/**
* set up httpProxy for this site
*
* @param httpProxy
* @return
*/
......@@ -364,7 +405,8 @@ public class Site {
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false;
if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
......@@ -378,7 +420,7 @@ public class Site {
public int hashCode() {
int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
result = 31 * result + sleepTime;
......@@ -395,7 +437,7 @@ public class Site {
return "Site{" +
"domain='" + domain + '\'' +
", userAgent='" + userAgent + '\'' +
", cookies=" + cookies +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
", startRequests=" + startRequests +
", sleepTime=" + sleepTime +
......
package us.codecraft.webmagic.constant;
/**
 * Constants of the HTTP protocol: request method names and header names.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public abstract class HttpConstant {

    /** Names of HTTP request methods. */
    public abstract static class Method {

        public static final String GET = "GET";

        public static final String HEAD = "HEAD";

        public static final String POST = "POST";

        public static final String PUT = "PUT";

        public static final String DELETE = "DELETE";

        public static final String TRACE = "TRACE";

        public static final String CONNECT = "CONNECT";

    }

    /** Names of HTTP headers. */
    public abstract static class Header {

        public static final String REFERER = "Referer";

        public static final String USER_AGENT = "User-Agent";

    }

}
......@@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.constant.HttpConstant;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
......@@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader {
} else {
acceptStatCode = Sets.newHashSet(200);
}
logger.info("downloading page {}" , request.getUrl());
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
try {
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
......@@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
RequestBuilder requestBuilder = selectRequestMethod(request.getMethod()).setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
......@@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader {
return requestBuilder.build();
}
/**
 * Picks the {@code RequestBuilder} factory matching an HTTP method name.
 * The comparison is case-insensitive; {@code null} falls back to GET.
 *
 * @param method HTTP method name, may be {@code null}
 * @return a fresh builder for that method
 * @throws IllegalArgumentException if the method is not GET, POST, HEAD, PUT, DELETE or TRACE
 */
protected RequestBuilder selectRequestMethod(String method) {
    if (method == null) {
        //default get
        return RequestBuilder.get();
    }
    if (HttpConstant.Method.GET.equalsIgnoreCase(method)) {
        return RequestBuilder.get();
    }
    if (HttpConstant.Method.POST.equalsIgnoreCase(method)) {
        return RequestBuilder.post();
    }
    if (HttpConstant.Method.HEAD.equalsIgnoreCase(method)) {
        return RequestBuilder.head();
    }
    if (HttpConstant.Method.PUT.equalsIgnoreCase(method)) {
        return RequestBuilder.put();
    }
    if (HttpConstant.Method.DELETE.equalsIgnoreCase(method)) {
        return RequestBuilder.delete();
    }
    if (HttpConstant.Method.TRACE.equalsIgnoreCase(method)) {
        return RequestBuilder.trace();
    }
    throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page();
......
......@@ -36,7 +36,7 @@ public class HttpClientGenerator {
connectionManager.setDefaultMaxPerRoute(100);
}
public HttpClientGenerator setPoolSize(int poolSize){
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
}
......@@ -76,12 +76,17 @@ public class HttpClientGenerator {
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
CookieStore cookieStore = new BasicCookieStore();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(domainEntry.getKey());
cookieStore.addCookie(cookie);
}
}
httpClientBuilder.setDefaultCookieStore(cookieStore);
}
......
......@@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche
@Override
public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl());
if (urls.add(request.getUrl()) || shouldReserved(request)) {
if (isDuplicate(request) || shouldReserved(request)) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task);
}
}
/**
 * Records the request's URL in {@code urls} and returns the result of that add.
 * <p>
 * NOTE(review): assuming {@code urls} is a {@code java.util.Set}, the result
 * is {@code true} when the URL was NOT seen before, the opposite of what the
 * method name suggests. {@code push} relies on this meaning; confirm before
 * overriding.
 *
 * @param request the candidate request
 * @return whatever {@code urls.add(url)} returns
 */
protected boolean isDuplicate(Request request) {
    final String url = request.getUrl();
    return urls.add(url);
}
/**
 * Tells whether the request carries the {@code Request.CYCLE_TRIED_TIMES}
 * extra; {@code push} enqueues such requests regardless of the URL check.
 *
 * @param request the candidate request
 * @return {@code true} if the extra is present (non-null)
 */
protected boolean shouldReserved(Request request) {
    final Object triedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
    return triedTimes != null;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment