Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
fe95a684
Commit
fe95a684
authored
Apr 08, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Plain Diff
Request再次重构:去掉params,仅保留HttpRequestBody
parents
74110e6e
395396c6
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
153 additions
and
70 deletions
+153
-70
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+11
-3
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+31
-40
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+26
-3
HttpUriRequestConverter.java
...odecraft/webmagic/downloader/HttpUriRequestConverter.java
+12
-23
HttpRequestBody.java
...ain/java/us/codecraft/webmagic/model/HttpRequestBody.java
+72
-0
CharsetUtils.java
...c/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
fe95a684
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
...
@@ -46,7 +49,7 @@ public class Page {
...
@@ -46,7 +49,7 @@ public class Page {
private
boolean
needCycleRetry
;
private
boolean
needCycleRetry
;
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
public
Page
()
{
public
Page
()
{
}
}
...
@@ -232,6 +235,11 @@ public class Page {
...
@@ -232,6 +235,11 @@ public class Page {
", statusCode="
+
statusCode
+
", statusCode="
+
statusCode
+
", needCycleRetry="
+
needCycleRetry
+
", needCycleRetry="
+
needCycleRetry
+
", targetRequests="
+
targetRequests
+
", targetRequests="
+
targetRequests
+
", headers="
+
headers
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
fe95a684
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
org.apache.http.Header
;
import
org.apache.http.cookie.Cookie
;
import
us.codecraft.webmagic.model.HttpRequestBody
;
import
us.codecraft.webmagic.utils.Experimental
;
import
us.codecraft.webmagic.utils.Experimental
;
import
java.io.Serializable
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
...
@@ -23,14 +28,19 @@ public class Request implements Serializable {
...
@@ -23,14 +28,19 @@ public class Request implements Serializable {
private
String
method
;
private
String
method
;
private
HttpRequestBody
requestBody
;
/**
/**
* Store additional information in extras.
* Store additional information in extras.
*/
*/
private
Map
<
String
,
Object
>
extras
;
private
Map
<
String
,
Object
>
extras
;
/**
/**
* POST/GET param set
* cookies for current url, if not set use Site's cookies
* */
*/
private
Map
<
String
,
String
>
params
=
new
HashMap
<
String
,
String
>();
private
List
<
Cookie
>
cookies
=
new
ArrayList
<
Cookie
>();
private
List
<
Header
>
headers
=
new
ArrayList
<
Header
>();
/**
/**
* Priority of the request.<br>
* Priority of the request.<br>
...
@@ -109,57 +119,38 @@ public class Request implements Serializable {
...
@@ -109,57 +119,38 @@ public class Request implements Serializable {
this
.
method
=
method
;
this
.
method
=
method
;
}
}
public
Map
<
String
,
String
>
getParams
()
{
return
params
;
}
/**
* set params for request
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* @param params params
* */
public
void
setParams
(
Map
<
String
,
String
>
params
)
{
this
.
params
=
params
;
}
/**
* set params for request
* <br>
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
* @param key key
* @param value value
* */
public
void
putParams
(
String
key
,
String
value
)
{
params
.
put
(
key
,
value
);
}
@Override
public
boolean
equals
(
Object
o
)
{
if
(
this
==
o
)
return
true
;
if
(
o
==
null
||
getClass
()
!=
o
.
getClass
())
return
false
;
Request
request
=
(
Request
)
o
;
if
(
url
!=
null
?
!
url
.
equals
(
request
.
url
)
:
request
.
url
!=
null
)
return
false
;
if
(
method
!=
null
?
!
method
.
equals
(
request
.
method
)
:
request
.
method
!=
null
)
return
false
;
return
params
!=
null
?
params
.
equals
(
request
.
params
)
:
request
.
params
==
null
;
}
@Override
@Override
public
int
hashCode
()
{
public
int
hashCode
()
{
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
int
result
=
url
!=
null
?
url
.
hashCode
()
:
0
;
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
method
!=
null
?
method
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
params
!=
null
?
params
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
headers
!=
null
?
headers
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
cookies
!=
null
?
cookies
.
hashCode
()
:
0
);
return
result
;
return
result
;
}
}
public
List
<
Cookie
>
getCookies
()
{
return
cookies
;
}
public
List
<
Header
>
getHeaders
()
{
return
headers
;
}
public
HttpRequestBody
getRequestBody
()
{
return
requestBody
;
}
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
return
"Request{"
+
return
"Request{"
+
"url='"
+
url
+
'\''
+
"url='"
+
url
+
'\''
+
", method='"
+
method
+
'\''
+
", method='"
+
method
+
'\''
+
", extras="
+
extras
+
", extras="
+
extras
+
", params="
+
params
+
", priority="
+
priority
+
", priority="
+
priority
+
", headers="
+
headers
+
", cookies="
+
cookies
+
'}'
;
'}'
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
fe95a684
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.auth.AuthState
;
import
org.apache.http.auth.AuthState
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.protocol.HttpClientContext
;
import
org.apache.http.client.protocol.HttpClientContext
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.impl.auth.BasicScheme
;
import
org.apache.http.impl.auth.BasicScheme
;
import
org.apache.http.impl.client.BasicCookieStore
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.protocol.BasicHttpContext
;
import
org.apache.http.protocol.BasicHttpContext
;
import
org.apache.http.protocol.HttpContext
;
import
org.apache.http.protocol.HttpContext
;
import
org.apache.http.util.EntityUtils
;
import
org.apache.http.util.EntityUtils
;
...
@@ -24,11 +35,11 @@ import us.codecraft.webmagic.proxy.ProxyProvider;
...
@@ -24,11 +35,11 @@ import us.codecraft.webmagic.proxy.ProxyProvider;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.CharsetUtils
;
import
us.codecraft.webmagic.utils.CharsetUtils
;
import
us.codecraft.webmagic.utils.HttpClientUtils
;
import
us.codecraft.webmagic.utils.HttpClientUtils
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.nio.charset.Charset
;
import
java.util.HashMap
;
import
java.util.*
;
import
java.util.Map
;
/**
/**
...
@@ -88,7 +99,7 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -88,7 +99,7 @@ public class HttpClientDownloader extends AbstractDownloader {
int
statusCode
=
0
;
int
statusCode
=
0
;
Site
site
=
task
.
getSite
();
Site
site
=
task
.
getSite
();
Proxy
proxy
=
null
;
Proxy
proxy
=
null
;
HttpC
ontext
httpContext
=
new
BasicHttp
Context
();
HttpC
lientContext
httpContext
=
new
HttpClient
Context
();
if
(
proxyProvider
!=
null
)
{
if
(
proxyProvider
!=
null
)
{
proxy
=
proxyProvider
.
getProxy
(
task
);
proxy
=
proxyProvider
.
getProxy
(
task
);
AuthState
authState
=
new
AuthState
();
AuthState
authState
=
new
AuthState
();
...
@@ -97,6 +108,18 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -97,6 +108,18 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
CloseableHttpClient
httpClient
=
getHttpClient
(
site
);
CloseableHttpClient
httpClient
=
getHttpClient
(
site
);
HttpUriRequest
httpUriRequest
=
httpUriRequestConverter
.
convert
(
request
,
site
,
proxy
);
HttpUriRequest
httpUriRequest
=
httpUriRequestConverter
.
convert
(
request
,
site
,
proxy
);
if
(
request
.
getCookies
()
!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getCookies
()))
{
CookieStore
cookieStore
=
new
BasicCookieStore
();
for
(
Cookie
c
:
request
.
getCookies
())
{
cookieStore
.
addCookie
(
c
);
}
httpContext
.
setCookieStore
(
cookieStore
);
}
if
(
request
.
getHeaders
()
!=
null
&&
CollectionUtils
.
isNotEmpty
(
request
.
getHeaders
()))
{
for
(
Header
h
:
request
.
getHeaders
())
{
httpUriRequest
.
setHeader
(
h
);
}
}
try
{
try
{
httpResponse
=
httpClient
.
execute
(
httpUriRequest
,
httpContext
);
httpResponse
=
httpClient
.
execute
(
httpUriRequest
,
httpContext
);
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
View file @
fe95a684
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpHost
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.
message.BasicNameValuePair
;
import
org.apache.http.
entity.ByteArrayEntity
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
...
@@ -53,32 +47,27 @@ public class HttpUriRequestConverter {
...
@@ -53,32 +47,27 @@ public class HttpUriRequestConverter {
String
method
=
request
.
getMethod
();
String
method
=
request
.
getMethod
();
if
(
method
==
null
||
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
GET
))
{
if
(
method
==
null
||
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
GET
))
{
//default get
//default get
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
()
);
return
RequestBuilder
.
get
(
);
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
()
);
return
addFormParams
(
RequestBuilder
.
post
(),
request
);
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
()
);
return
RequestBuilder
.
head
(
);
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
return
addFormParams
(
RequestBuilder
.
put
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
()
);
return
addFormParams
(
RequestBuilder
.
put
(),
request
);
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
DELETE
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
DELETE
))
{
return
addQueryParams
(
RequestBuilder
.
delete
(),
request
.
getParams
()
);
return
RequestBuilder
.
delete
(
);
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
TRACE
))
{
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
TRACE
))
{
return
addQueryParams
(
RequestBuilder
.
trace
(),
request
.
getParams
()
);
return
RequestBuilder
.
trace
(
);
}
}
throw
new
IllegalArgumentException
(
"Illegal HTTP Method "
+
method
);
throw
new
IllegalArgumentException
(
"Illegal HTTP Method "
+
method
);
}
}
private
RequestBuilder
addFormParams
(
RequestBuilder
requestBuilder
,
NameValuePair
[]
nameValuePair
,
Map
<
String
,
String
>
params
)
{
private
RequestBuilder
addFormParams
(
RequestBuilder
requestBuilder
,
Request
request
)
{
List
<
NameValuePair
>
allNameValuePair
=
new
ArrayList
<
NameValuePair
>();
if
(
request
.
getRequestBody
()
!=
null
)
{
if
(
nameValuePair
!=
null
&&
nameValuePair
.
length
>
0
)
{
ByteArrayEntity
entity
=
new
ByteArrayEntity
(
request
.
getRequestBody
().
getBody
());
allNameValuePair
=
Arrays
.
asList
(
nameValuePair
);
entity
.
setContentType
(
request
.
getRequestBody
().
getContentType
());
requestBuilder
.
setEntity
(
entity
);
}
}
if
(
params
!=
null
)
{
for
(
String
key
:
params
.
keySet
())
{
allNameValuePair
.
add
(
new
BasicNameValuePair
(
key
,
params
.
get
(
key
)));
}
}
requestBuilder
.
setEntity
(
new
UrlEncodedFormEntity
(
allNameValuePair
,
Charset
.
forName
(
"utf8"
)));
return
requestBuilder
;
return
requestBuilder
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
0 → 100644
View file @
fe95a684
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.client.utils.URLEncodedUtils
;
import
org.apache.http.message.BasicNameValuePair
;
import
java.io.UnsupportedEncodingException
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
/**
 * Holder for the entity of an HTTP request: the raw body bytes, the content
 * type they are meant to be sent with, and the name of the charset that was
 * used to produce them.
 *
 * <p>All fields are final, but the byte array is stored and returned without
 * copying, so callers must not modify it after handing it over or after
 * obtaining it from {@link #getBody()}.
 *
 * <p>Use the static factories {@link #json}, {@link #xml}, {@link #form} and
 * {@link #custom} for the common cases.
 *
 * @author code4crafter@gmail.com
 * Date: 17/4/8
 */
public class HttpRequestBody {

    /**
     * Content-type values used by the factories of {@link HttpRequestBody}.
     */
    public static abstract class ContentType {

        public static final String JSON = "application/json";

        public static final String XML = "text/xml";

        public static final String FORM = "application/x-www-form-urlencoded";

        public static final String MULTIPART = "multipart/form-data";
    }

    /** Raw body bytes; held by reference, never copied. */
    private final byte[] body;

    /** Content type associated with {@link #body}. */
    private final String contentType;

    /** Charset name that was used to turn text into {@link #body}. */
    private final String encoding;

    /**
     * Creates a body from already-encoded bytes.
     *
     * @param body        the bytes of the request entity (stored as-is, not copied)
     * @param contentType the content type of the entity
     * @param encoding    the charset name the bytes were encoded with
     */
    public HttpRequestBody(byte[] body, String contentType, String encoding) {
        this.body = body;
        this.contentType = contentType;
        this.encoding = encoding;
    }

    /**
     * @return the content type given at construction time
     */
    public String getContentType() {
        return contentType;
    }

    /**
     * @return the charset name given at construction time
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Builds a body with content type {@link ContentType#JSON}.
     *
     * @param json     the JSON text
     * @param encoding the charset name used to encode {@code json}
     * @return a body holding the encoded text
     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset name
     */
    public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException {
        return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
    }

    /**
     * Builds a body with content type {@link ContentType#XML}.
     *
     * @param xml      the XML text
     * @param encoding the charset name used to encode {@code xml}
     * @return a body holding the encoded text
     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset name
     */
    public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException {
        return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
    }

    /**
     * Builds a body from caller-supplied bytes and content type.
     *
     * @param body        the bytes of the request entity (stored as-is, not copied)
     * @param contentType the content type of the entity
     * @param encoding    the charset name the bytes were encoded with
     * @return a body wrapping the given values
     * @throws UnsupportedEncodingException never thrown by this method itself; the clause is
     *                                      kept so existing callers that catch it still compile
     */
    public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException {
        return new HttpRequestBody(body, contentType, encoding);
    }

    /**
     * Builds a URL-encoded form body with content type {@link ContentType#FORM}.
     *
     * <p>Each value is rendered with {@link String#valueOf(Object)}, so a
     * {@code null} value is sent as the literal text {@code null}.
     *
     * @param params   the form fields; {@code null} is treated as an empty form
     * @param encoding the charset name used both for URL-encoding and for the final bytes
     * @return a body holding the encoded form
     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset name
     */
    public static HttpRequestBody form(Map<String, Object> params, String encoding) throws UnsupportedEncodingException {
        // A null map used to fail with a bare NullPointerException on params.size();
        // treat it like an empty form instead.
        int expectedSize = params == null ? 0 : params.size();
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(expectedSize);
        if (params != null) {
            for (Map.Entry<String, Object> entry : params.entrySet()) {
                nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
            }
        }
        String encodedForm = URLEncodedUtils.format(nameValuePairs, encoding);
        return new HttpRequestBody(encodedForm.getBytes(encoding), ContentType.FORM, encoding);
    }

    /**
     * @return the body bytes given at construction time (the same array, not a copy)
     */
    public byte[] getBody() {
        return body;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
View file @
fe95a684
...
@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
...
@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
// charset
// charset
// 1、encoding in http header Content-Type
// 1、encoding in http header Content-Type
charset
=
UrlUtils
.
getCharset
(
contentType
);
charset
=
UrlUtils
.
getCharset
(
contentType
);
if
(
StringUtils
.
isNotBlank
(
contentType
))
{
if
(
StringUtils
.
isNotBlank
(
contentType
)
&&
StringUtils
.
isNotBlank
(
charset
)
)
{
logger
.
debug
(
"Auto get charset: {}"
,
charset
);
logger
.
debug
(
"Auto get charset: {}"
,
charset
);
return
charset
;
return
charset
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment