Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
25df6650
Commit
25df6650
authored
Apr 02, 2017
by
Yihua Huang
Committed by
GitHub
Apr 02, 2017
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #513 from xbynet/master
Request支持设置header与cookie、新增POST请求时,XML、JSON参数支持、Page支持获取响应header
parents
221c1550
c93a8a27
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
136 additions
and
12 deletions
+136
-12
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+23
-3
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+75
-2
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+37
-6
CharsetUtils.java
...c/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
25df6650
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
/**
* Object storing extracted result and urls to fetch.<br>
* Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
* Not thread safe.<br>
...
@@ -43,6 +45,11 @@ public class Page {
...
@@ -43,6 +45,11 @@ public class Page {
private
boolean
needCycleRetry
;
private
boolean
needCycleRetry
;
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
/**
* Http响应头
*/
private
Header
[]
headers
=
null
;
public
Page
()
{
public
Page
()
{
}
}
...
@@ -210,6 +217,14 @@ public class Page {
...
@@ -210,6 +217,14 @@ public class Page {
return
this
;
return
this
;
}
}
public
Header
[]
getHeaders
()
{
return
headers
;
}
public
void
setHeaders
(
Header
[]
headers
)
{
this
.
headers
=
headers
;
}
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
return
"Page{"
+
return
"Page{"
+
...
@@ -219,6 +234,11 @@ public class Page {
...
@@ -219,6 +234,11 @@ public class Page {
", url="
+
url
+
", url="
+
url
+
", statusCode="
+
statusCode
+
", statusCode="
+
statusCode
+
", targetRequests="
+
targetRequests
+
", targetRequests="
+
targetRequests
+
", headers="
+
headers
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
25df6650
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
us.codecraft.webmagic.utils.Experimental
;
import
java.io.Serializable
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.entity.StringEntity
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.message.BasicHeader
;
import
us.codecraft.webmagic.utils.Experimental
;
import
us.codecraft.webmagic.utils.UrlUtils
;
/**
/**
* Object contains url to crawl.<br>
* Object contains url to crawl.<br>
* It contains some additional information.<br>
* It contains some additional information.<br>
...
@@ -33,6 +43,18 @@ public class Request implements Serializable {
...
@@ -33,6 +43,18 @@ public class Request implements Serializable {
* POST/GET param set
* POST/GET param set
* */
* */
private
Map
<
String
,
String
>
params
=
new
HashMap
<
String
,
String
>();
private
Map
<
String
,
String
>
params
=
new
HashMap
<
String
,
String
>();
/**
* support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。
*/
private
HttpEntity
entity
;
/**
* cookies for current url, if not set use Site's cookies
*/
private
List
<
Cookie
>
cookies
=
new
ArrayList
<
Cookie
>();
private
List
<
Header
>
headers
=
new
ArrayList
<
Header
>();
/**
/**
* Priority of the request.<br>
* Priority of the request.<br>
...
@@ -145,12 +167,59 @@ public class Request implements Serializable {
...
@@ -145,12 +167,59 @@ public class Request implements Serializable {
if
(
method
!=
null
?
!
method
.
equals
(
request
.
method
)
:
request
.
method
!=
null
)
return
false
;
if
(
method
!=
null
?
!
method
.
equals
(
request
.
method
)
:
request
.
method
!=
null
)
return
false
;
return
params
!=
null
?
params
.
equals
(
request
.
params
)
:
request
.
params
==
null
;
return
params
!=
null
?
params
.
equals
(
request
.
params
)
:
request
.
params
==
null
;
}
}
public
void
addHeader
(
String
name
,
String
value
){
Header
header
=
new
BasicHeader
(
name
,
value
);
headers
.
add
(
header
);
}
public
List
<
Header
>
getHeaders
(){
return
headers
;
}
public
void
addCookie
(
String
key
,
String
value
){
BasicClientCookie
c
=
new
BasicClientCookie
(
key
,
value
);
c
.
setDomain
(
UrlUtils
.
getDomain
(
url
));
cookies
.
add
(
c
);
}
public
List
<
Cookie
>
getCookies
()
{
return
cookies
;
}
public
void
setCookies
(
List
<
Cookie
>
cookies
)
{
this
.
cookies
=
cookies
;
}
/**
* 设置json参数
*/
public
void
setJsonParam
(
String
jsonStr
,
String
encoding
){
StringEntity
e
=
new
StringEntity
(
jsonStr
,
encoding
==
null
?
"UTF-8"
:
encoding
);
e
.
setContentEncoding
(
encoding
==
null
?
"UTF-8"
:
encoding
);
e
.
setContentType
(
"application/json"
);
entity
=
e
;
}
/**
* 设置xml参数
*/
public
void
setXmlParam
(
String
xmlStr
,
String
encoding
){
StringEntity
e
=
new
StringEntity
(
xmlStr
,
encoding
==
null
?
"UTF-8"
:
encoding
);
e
.
setContentEncoding
(
encoding
==
null
?
"UTF-8"
:
encoding
);
e
.
setContentType
(
"text/xml"
);
entity
=
e
;
}
public
HttpEntity
getEntity
()
{
return
entity
;
}
public
void
setEntity
(
HttpEntity
entity
)
{
this
.
entity
=
entity
;
}
@Override
@Override
public
int
hashCode
()
{
public
int
hashCode
()
{
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
params
!=
null
?
params
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
params
!=
null
?
params
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
headers
!=
null
?
headers
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
entity
!=
null
?
entity
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
cookies
!=
null
?
cookies
.
hashCode
()
:
0
);
return
result
;
return
result
;
}
}
...
@@ -162,6 +231,10 @@ public class Request implements Serializable {
...
@@ -162,6 +231,10 @@ public class Request implements Serializable {
", extras="
+
extras
+
", extras="
+
extras
+
", params="
+
params
+
", params="
+
params
+
", priority="
+
priority
+
", priority="
+
priority
+
", headers="
+
headers
+
", entity="
+
entity
+
", cookies="
+
cookies
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
25df6650
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Set
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.protocol.HttpClientContext
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.impl.client.BasicCookieStore
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.util.EntityUtils
;
import
org.apache.http.util.EntityUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
...
@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
...
@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.*
;
/**
/**
* The http downloader based on HttpClient.
* The http downloader based on HttpClient.
...
@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
);
HttpClientContext
context
=
null
;
if
(
request
.
getCookies
()!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getCookies
())){
context
=
new
HttpClientContext
();
CookieStore
cookieStore
=
new
BasicCookieStore
();
for
(
Cookie
c:
request
.
getCookies
()){
cookieStore
.
addCookie
(
c
);
}
context
.
setCookieStore
(
cookieStore
);
}
if
(
request
.
getHeaders
()!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getHeaders
())){
for
(
Header
h:
request
.
getHeaders
()){
httpUriRequest
.
setHeader
(
h
);
}
}
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
,
context
);
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
page
.
setHeaders
(
httpResponse
.
getAllHeaders
());
onSuccess
(
request
);
onSuccess
(
request
);
return
page
;
return
page
;
}
else
{
}
else
{
...
@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
//default get
//default get
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
());
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
if
(
request
.
getEntity
()!=
null
){
return
RequestBuilder
.
post
().
setEntity
(
request
.
getEntity
());
}
else
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
}
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
());
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
View file @
25df6650
...
@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
...
@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
// charset
// charset
// 1、encoding in http header Content-Type
// 1、encoding in http header Content-Type
charset
=
UrlUtils
.
getCharset
(
contentType
);
charset
=
UrlUtils
.
getCharset
(
contentType
);
if
(
StringUtils
.
isNotBlank
(
contentType
))
{
if
(
StringUtils
.
isNotBlank
(
contentType
)
&&
StringUtils
.
isNotBlank
(
charset
)
)
{
logger
.
debug
(
"Auto get charset: {}"
,
charset
);
logger
.
debug
(
"Auto get charset: {}"
,
charset
);
return
charset
;
return
charset
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment