Commit 25df6650 authored by Yihua Huang, committed by GitHub

Merge pull request #513 from xbynet/master

Request支持设置header与cookie、新增POST请求时,XML、JSON参数支持、Page支持获取响应header
parents 221c1550 c93a8a27
package us.codecraft.webmagic; package us.codecraft.webmagic;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
/** /**
* Object storing extracted result and urls to fetch.<br> * Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br> * Not thread safe.<br>
...@@ -43,6 +45,11 @@ public class Page { ...@@ -43,6 +45,11 @@ public class Page {
private boolean needCycleRetry; private boolean needCycleRetry;
private List<Request> targetRequests = new ArrayList<Request>(); private List<Request> targetRequests = new ArrayList<Request>();
/**
* HTTP response headers.
*/
private Header[] headers=null;
public Page() { public Page() {
} }
...@@ -210,6 +217,14 @@ public class Page { ...@@ -210,6 +217,14 @@ public class Page {
return this; return this;
} }
/**
 * Returns the HTTP response headers stored on this page.
 *
 * @return the header array, or {@code null} when none has been set
 */
public Header[] getHeaders() {
    return this.headers;
}
/**
 * Stores the HTTP response headers on this page.
 *
 * @param headers the header array to keep; the reference is stored as-is, not copied
 */
public void setHeaders(Header[] headers) {
this.headers = headers;
}
@Override @Override
public String toString() { public String toString() {
return "Page{" + return "Page{" +
...@@ -219,6 +234,11 @@ public class Page { ...@@ -219,6 +234,11 @@ public class Page {
", url=" + url + ", url=" + url +
", statusCode=" + statusCode + ", statusCode=" + statusCode +
", targetRequests=" + targetRequests + ", targetRequests=" + targetRequests +
", headers=" + headers+
'}'; '}';
} }
} }
package us.codecraft.webmagic; package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicHeader;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.UrlUtils;
/** /**
* Object contains url to crawl.<br> * Object contains url to crawl.<br>
* It contains some additional information.<br> * It contains some additional information.<br>
...@@ -33,6 +43,18 @@ public class Request implements Serializable { ...@@ -33,6 +43,18 @@ public class Request implements Serializable {
* POST/GET param set * POST/GET param set
* */ * */
private Map<String,String> params=new HashMap<String, String>(); private Map<String,String> params=new HashMap<String, String>();
/**
* Support for json, xml or more. On POST, setting this makes the params map and the
* "nameValuePair" extra ineffective.
*/
// NOTE(review): Request implements Serializable; HttpEntity is presumably not serializable - confirm before serializing requests that carry an entity.
private HttpEntity entity;
/**
* cookies for current url, if not set use Site's cookies
*/
private List<Cookie> cookies=new ArrayList<Cookie>();
/**
* Headers attached to this request; filled through addHeader.
*/
private List<Header> headers=new ArrayList<Header>();
/** /**
* Priority of the request.<br> * Priority of the request.<br>
...@@ -145,12 +167,59 @@ public class Request implements Serializable { ...@@ -145,12 +167,59 @@ public class Request implements Serializable {
if (method != null ? !method.equals(request.method) : request.method != null) return false; if (method != null ? !method.equals(request.method) : request.method != null) return false;
return params != null ? params.equals(request.params) : request.params == null; return params != null ? params.equals(request.params) : request.params == null;
} }
/**
 * Appends a header to this request's header list.
 *
 * @param name  header name
 * @param value header value
 */
public void addHeader(String name, String value) {
    headers.add(new BasicHeader(name, value));
}
/**
 * Returns the headers attached to this request.
 *
 * @return the internal header list (not a copy)
 */
public List<Header> getHeaders() {
    return this.headers;
}
/**
 * Creates a cookie scoped to the domain of this request's url and appends it
 * to the request's cookie list.
 *
 * @param key   cookie name
 * @param value cookie value
 */
public void addCookie(String key, String value) {
    final String domain = UrlUtils.getDomain(url);
    final BasicClientCookie cookie = new BasicClientCookie(key, value);
    cookie.setDomain(domain);
    cookies.add(cookie);
}
/**
 * Returns the cookies attached to this request.
 *
 * @return the internal cookie list (not a copy)
 */
public List<Cookie> getCookies() {
    return this.cookies;
}
/**
 * Replaces the cookie list of this request.
 *
 * @param cookies the list to use from now on; the reference is stored as-is, not copied
 */
public void setCookies(List<Cookie> cookies) {
this.cookies = cookies;
}
/**
 * Sets a JSON request body.
 * <p>
 * The body is stored as this request's entity. The charset is advertised in the
 * Content-Type header; Content-Encoding is left unset because that header names
 * content codings such as gzip, not character sets.
 *
 * @param jsonStr  the JSON text to send
 * @param encoding charset name used to encode the body; {@code null} means UTF-8
 */
public void setJsonParam(String jsonStr, String encoding) {
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity e = new StringEntity(jsonStr, charset);
    // Keep the charset parameter so the receiver can decode the body.
    e.setContentType("application/json; charset=" + charset);
    entity = e;
}
/**
 * Sets an XML request body.
 * <p>
 * The body is stored as this request's entity. The charset is advertised in the
 * Content-Type header; Content-Encoding is left unset because that header names
 * content codings such as gzip, not character sets.
 *
 * @param xmlStr   the XML text to send
 * @param encoding charset name used to encode the body; {@code null} means UTF-8
 */
public void setXmlParam(String xmlStr, String encoding) {
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity e = new StringEntity(xmlStr, charset);
    // Keep the charset parameter so the receiver can decode the body.
    e.setContentType("text/xml; charset=" + charset);
    entity = e;
}
/**
 * Returns the request body entity.
 *
 * @return the entity, or {@code null} when none has been set
 */
public HttpEntity getEntity() {
    return this.entity;
}
/**
 * Sets the request body entity.
 *
 * @param entity the entity to store on this request
 */
public void setEntity(HttpEntity entity) {
this.entity = entity;
}
@Override @Override
public int hashCode() { public int hashCode() {
int result = url != null ? url.hashCode() : 0; int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0); result = 31 * result + (method != null ? method.hashCode() : 0);
result = 31 * result + (params != null ? params.hashCode() : 0); result = 31 * result + (params != null ? params.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
result = 31 * result + (entity != null ? entity.hashCode() : 0);
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
return result; return result;
} }
...@@ -162,6 +231,10 @@ public class Request implements Serializable { ...@@ -162,6 +231,10 @@ public class Request implements Serializable {
", extras=" + extras + ", extras=" + extras +
", params=" + params + ", params=" + params +
", priority=" + priority + ", priority=" + priority +
", headers=" + headers +
", entity=" + entity +
", cookies="+ cookies+
'}'; '}';
} }
} }
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig; import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair; import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
...@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils; ...@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.WMCollections; import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
/** /**
* The http downloader based on HttpClient. * The http downloader based on HttpClient.
...@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); HttpClientContext context=null;
if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
context=new HttpClientContext();
CookieStore cookieStore=new BasicCookieStore();
for(Cookie c:request.getCookies()){
cookieStore.addCookie(c);
}
context.setCookieStore(cookieStore);
}
if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
for(Header h:request.getHeaders()){
httpUriRequest.setHeader(h);
}
}
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
statusCode = httpResponse.getStatusLine().getStatusCode(); statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode); request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) { if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task); Page page = handleResponse(request, charset, httpResponse, task);
page.setHeaders(httpResponse.getAllHeaders());
onSuccess(request); onSuccess(request);
return page; return page;
} else { } else {
...@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
//default get //default get
return addQueryParams(RequestBuilder.get(),request.getParams()); return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); if(request.getEntity()!=null){
return RequestBuilder.post().setEntity(request.getEntity());
}else{
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
}
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams()); return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
......
...@@ -26,7 +26,7 @@ public abstract class CharsetUtils { ...@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
// charset // charset
// 1、encoding in http header Content-Type // 1、encoding in http header Content-Type
charset = UrlUtils.getCharset(contentType); charset = UrlUtils.getCharset(contentType);
if (StringUtils.isNotBlank(contentType)) { if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset); logger.debug("Auto get charset: {}", charset);
return charset; return charset;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment