Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
1c24baa8
Commit
1c24baa8
authored
Mar 28, 2017
by
xbynet@outlook.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Request支持设置header与cookie
新增POST请求时,XML、JSON参数支持 Page支持获取响应header
parent
5824e951
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
135 additions
and
11 deletions
+135
-11
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+23
-3
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+75
-2
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+37
-6
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
1c24baa8
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
/**
* Object storing extracted result and urls to fetch.<br>
* Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
* Not thread safe.<br>
...
@@ -43,6 +45,11 @@ public class Page {
...
@@ -43,6 +45,11 @@ public class Page {
private
boolean
needCycleRetry
;
private
boolean
needCycleRetry
;
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
/**
 * HTTP response headers.
 */
private
Header
[]
headers
=
null
;
public
Page
()
{
public
Page
()
{
}
}
...
@@ -210,6 +217,14 @@ public class Page {
...
@@ -210,6 +217,14 @@ public class Page {
return
this
;
return
this
;
}
}
/**
 * Returns the HTTP response headers stored on this page.
 *
 * @return the header array exactly as stored (not copied)
 */
public Header[] getHeaders() {
    return this.headers;
}
/**
 * Stores the HTTP response headers on this page.
 *
 * @param headers the response headers; the array is kept by reference, not copied
 */
public void setHeaders(Header[] headers) {
    this.headers = headers;
}
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
return
"Page{"
+
return
"Page{"
+
...
@@ -219,6 +234,11 @@ public class Page {
...
@@ -219,6 +234,11 @@ public class Page {
", url="
+
url
+
", url="
+
url
+
", statusCode="
+
statusCode
+
", statusCode="
+
statusCode
+
", targetRequests="
+
targetRequests
+
", targetRequests="
+
targetRequests
+
", headers="
+
headers
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
1c24baa8
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
us.codecraft.webmagic.utils.Experimental
;
import
java.io.Serializable
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.entity.StringEntity
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.message.BasicHeader
;
import
us.codecraft.webmagic.utils.Experimental
;
import
us.codecraft.webmagic.utils.UrlUtils
;
/**
/**
* Object contains url to crawl.<br>
* Object contains url to crawl.<br>
* It contains some additional information.<br>
* It contains some additional information.<br>
...
@@ -33,6 +43,18 @@ public class Request implements Serializable {
...
@@ -33,6 +43,18 @@ public class Request implements Serializable {
* POST/GET param set
* POST/GET param set
* */
* */
private
Map
<
String
,
String
>
params
=
new
HashMap
<
String
,
String
>();
private
Map
<
String
,
String
>
params
=
new
HashMap
<
String
,
String
>();
/**
 * Request body entity, for JSON, XML or other payloads. When this is set on a POST request,
 * the params map and the "nameValuePair" extra are ignored.
 * NOTE(review): the enclosing class is Serializable -- confirm the entity implementations
 * stored here serialize as intended.
 */
private
HttpEntity
entity
;
/**
 * cookies for current url, if not set use Site's cookies
 */
private
List
<
Cookie
>
cookies
=
new
ArrayList
<
Cookie
>();
/**
 * Headers attached to this request; starts out as an empty list.
 */
private
List
<
Header
>
headers
=
new
ArrayList
<
Header
>();
/**
/**
* Priority of the request.<br>
* Priority of the request.<br>
...
@@ -145,12 +167,59 @@ public class Request implements Serializable {
...
@@ -145,12 +167,59 @@ public class Request implements Serializable {
if
(
method
!=
null
?
!
method
.
equals
(
request
.
method
)
:
request
.
method
!=
null
)
return
false
;
if
(
method
!=
null
?
!
method
.
equals
(
request
.
method
)
:
request
.
method
!=
null
)
return
false
;
return
params
!=
null
?
params
.
equals
(
request
.
params
)
:
request
.
params
==
null
;
return
params
!=
null
?
params
.
equals
(
request
.
params
)
:
request
.
params
==
null
;
}
}
/**
 * Appends a header with the given name and value to this request's header list.
 *
 * @param name  header name
 * @param value header value
 */
public void addHeader(String name, String value) {
    headers.add(new BasicHeader(name, value));
}
/**
 * Returns the headers attached to this request.
 *
 * @return the internal header list itself (not a copy)
 */
public List<Header> getHeaders() {
    return this.headers;
}
/**
 * Adds a cookie to this request. The cookie's domain is set to the value
 * {@code UrlUtils.getDomain} returns for this request's url.
 *
 * @param key   cookie name
 * @param value cookie value
 */
public void addCookie(String key, String value) {
    BasicClientCookie cookie = new BasicClientCookie(key, value);
    String domain = UrlUtils.getDomain(url);
    cookie.setDomain(domain);
    cookies.add(cookie);
}
/**
 * Returns the cookies attached to this request.
 *
 * @return the cookie list exactly as stored (not a copy)
 */
public List<Cookie> getCookies() {
    return this.cookies;
}
/**
 * Replaces the cookie list of this request.
 *
 * @param cookies the new cookie list; kept by reference, not copied
 */
public void setCookies(List<Cookie> cookies) {
    this.cookies = cookies;
}
/**
 * Sets a JSON string as the request body.
 * <p>
 * The charset is declared through the {@code Content-Type} header
 * ({@code application/json; charset=...}). {@code Content-Encoding} is deliberately not set:
 * that header names content codings such as gzip, not character sets, and sending a charset
 * name there can make servers reject the request.
 *
 * @param jsonStr  JSON text to send as the body
 * @param encoding charset name used to encode the body; {@code null} selects UTF-8
 */
public void setJsonParam(String jsonStr, String encoding) {
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity e = new StringEntity(jsonStr, charset);
    // Replaces the default text/plain content type chosen by the StringEntity constructor.
    e.setContentType("application/json; charset=" + charset);
    entity = e;
}
/**
 * Sets an XML string as the request body.
 * <p>
 * The charset is declared through the {@code Content-Type} header
 * ({@code text/xml; charset=...}). {@code Content-Encoding} is deliberately not set:
 * that header names content codings such as gzip, not character sets, and sending a charset
 * name there can make servers reject the request.
 *
 * @param xmlStr   XML text to send as the body
 * @param encoding charset name used to encode the body; {@code null} selects UTF-8
 */
public void setXmlParam(String xmlStr, String encoding) {
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity e = new StringEntity(xmlStr, charset);
    // Replaces the default text/plain content type chosen by the StringEntity constructor.
    e.setContentType("text/xml; charset=" + charset);
    entity = e;
}
/**
 * Returns the request body entity.
 *
 * @return the entity exactly as stored
 */
public HttpEntity getEntity() {
    return this.entity;
}
/**
 * Sets the request body entity.
 *
 * @param entity the body entity to store on this request
 */
public void setEntity(HttpEntity entity) {
    this.entity = entity;
}
@Override
@Override
public
int
hashCode
()
{
public
int
hashCode
()
{
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
params
!=
null
?
params
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
params
!=
null
?
params
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
headers
!=
null
?
headers
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
entity
!=
null
?
entity
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
cookies
!=
null
?
cookies
.
hashCode
()
:
0
);
return
result
;
return
result
;
}
}
...
@@ -162,6 +231,10 @@ public class Request implements Serializable {
...
@@ -162,6 +231,10 @@ public class Request implements Serializable {
", extras="
+
extras
+
", extras="
+
extras
+
", params="
+
params
+
", params="
+
params
+
", priority="
+
priority
+
", priority="
+
priority
+
", headers="
+
headers
+
", entity="
+
entity
+
", cookies="
+
cookies
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
1c24baa8
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Set
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.protocol.HttpClientContext
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.impl.client.BasicCookieStore
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.util.EntityUtils
;
import
org.apache.http.util.EntityUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
...
@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
...
@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.*
;
/**
/**
* The http downloader based on HttpClient.
* The http downloader based on HttpClient.
...
@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
);
HttpClientContext
context
=
null
;
if
(
request
.
getCookies
()!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getCookies
())){
context
=
new
HttpClientContext
();
CookieStore
cookieStore
=
new
BasicCookieStore
();
for
(
Cookie
c:
request
.
getCookies
()){
cookieStore
.
addCookie
(
c
);
}
context
.
setCookieStore
(
cookieStore
);
}
if
(
request
.
getHeaders
()!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getHeaders
())){
for
(
Header
h:
request
.
getHeaders
()){
httpUriRequest
.
setHeader
(
h
);
}
}
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
,
context
);
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
page
.
setHeaders
(
httpResponse
.
getAllHeaders
());
onSuccess
(
request
);
onSuccess
(
request
);
return
page
;
return
page
;
}
else
{
}
else
{
...
@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
//default get
//default get
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
());
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
if
(
request
.
getEntity
()!=
null
){
return
RequestBuilder
.
post
().
setEntity
(
request
.
getEntity
());
}
else
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
}
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
());
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment