Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
8ba2da14
Commit
8ba2da14
authored
Apr 24, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
request method #108 and more cookie #109 config
parent
b06aa489
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
138 additions
and
16 deletions
+138
-16
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+17
-0
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+51
-9
HttpConstant.java
...ain/java/us/codecraft/webmagic/constant/HttpConstant.java
+35
-0
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+21
-2
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+9
-4
LocalDuplicatedRemovedScheduler.java
...t/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java
+5
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
8ba2da14
...
@@ -21,6 +21,8 @@ public class Request implements Serializable {
...
@@ -21,6 +21,8 @@ public class Request implements Serializable {
private
String
url
;
private
String
url
;
private
String
method
;
/**
/**
* Store additional information in extras.
* Store additional information in extras.
*/
*/
...
@@ -106,10 +108,25 @@ public class Request implements Serializable {
...
@@ -106,10 +108,25 @@ public class Request implements Serializable {
this
.
url
=
url
;
this
.
url
=
url
;
}
}
/**
* The HTTP method of the request. Defaults to GET.
* @return httpMethod
* @see us.codecraft.webmagic.constant.HttpConstant.Method
* @since 0.5.0
*/
public
String
getMethod
()
{
return
method
;
}
public
void
setMethod
(
String
method
)
{
this
.
method
=
method
;
}
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
return
"Request{"
+
return
"Request{"
+
"url='"
+
url
+
'\''
+
"url='"
+
url
+
'\''
+
", method='"
+
method
+
'\''
+
", extras="
+
extras
+
", extras="
+
extras
+
", priority="
+
priority
+
", priority="
+
priority
+
'}'
;
'}'
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
8ba2da14
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
com.google.common.collect.HashBasedTable
;
import
com.google.common.collect.Table
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpHost
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
@@ -18,7 +20,9 @@ public class Site {
...
@@ -18,7 +20,9 @@ public class Site {
private
String
userAgent
;
private
String
userAgent
;
private
Map
<
String
,
String
>
cookies
=
new
LinkedHashMap
<
String
,
String
>();
private
Map
<
String
,
String
>
defaultCookies
=
new
LinkedHashMap
<
String
,
String
>();
private
Table
<
String
,
String
,
String
>
cookies
=
HashBasedTable
.
create
();
private
String
charset
;
private
String
charset
;
...
@@ -45,6 +49,10 @@ public class Site {
...
@@ -45,6 +49,10 @@ public class Site {
private
boolean
useGzip
=
true
;
private
boolean
useGzip
=
true
;
/**
* @see us.codecraft.webmagic.constant.HttpConstant.Header
* @deprecated
*/
public
static
interface
HeaderConst
{
public
static
interface
HeaderConst
{
public
static
final
String
REFERER
=
"Referer"
;
public
static
final
String
REFERER
=
"Referer"
;
...
@@ -72,7 +80,20 @@ public class Site {
...
@@ -72,7 +80,20 @@ public class Site {
* @return this
* @return this
*/
*/
public
Site
addCookie
(
String
name
,
String
value
)
{
public
Site
addCookie
(
String
name
,
String
value
)
{
cookies
.
put
(
name
,
value
);
defaultCookies
.
put
(
name
,
value
);
return
this
;
}
/**
* Add a cookie with specific domain.
*
* @param domain
* @param name
* @param value
* @return
*/
public
Site
addCookie
(
String
domain
,
String
name
,
String
value
)
{
cookies
.
put
(
domain
,
name
,
value
);
return
this
;
return
this
;
}
}
...
@@ -93,6 +114,25 @@ public class Site {
...
@@ -93,6 +114,25 @@ public class Site {
* @return get cookies
* @return get cookies
*/
*/
public
Map
<
String
,
String
>
getCookies
()
{
public
Map
<
String
,
String
>
getCookies
()
{
return
defaultCookies
;
}
/**
* get cookies of all domains
*
* @return get cookies
*/
public
Map
<
String
,
Map
<
String
,
String
>>
getAllCookies
()
{
return
cookies
.
columnMap
();
}
/**
* get cookies
*
* @return get cookies
*/
public
Table
<
String
,
String
,
String
>
getaCookies
()
{
cookies
.
columnMap
();
return
cookies
;
return
cookies
;
}
}
...
@@ -203,10 +243,10 @@ public class Site {
...
@@ -203,10 +243,10 @@ public class Site {
* Add a url to start url.<br>
* Add a url to start url.<br>
* Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addUrl(String...)}.
* Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addUrl(String...)}.
*
*
* @deprecated
* @see Spider#addUrl(String...)
* @param startUrl
* @param startUrl
* @return this
* @return this
* @see Spider#addUrl(String...)
* @deprecated
*/
*/
public
Site
addStartUrl
(
String
startUrl
)
{
public
Site
addStartUrl
(
String
startUrl
)
{
return
addStartRequest
(
new
Request
(
startUrl
));
return
addStartRequest
(
new
Request
(
startUrl
));
...
@@ -216,10 +256,10 @@ public class Site {
...
@@ -216,10 +256,10 @@ public class Site {
* Add a url to start url.<br>
* Add a url to start url.<br>
* Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addRequest(Request...)}.
* Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addRequest(Request...)}.
*
*
* @deprecated
* @see Spider#addRequest(Request...)
* @param startRequest
* @param startRequest
* @return this
* @return this
* @see Spider#addRequest(Request...)
* @deprecated
*/
*/
public
Site
addStartRequest
(
Request
startRequest
)
{
public
Site
addStartRequest
(
Request
startRequest
)
{
this
.
startRequests
.
add
(
startRequest
);
this
.
startRequests
.
add
(
startRequest
);
...
@@ -312,6 +352,7 @@ public class Site {
...
@@ -312,6 +352,7 @@ public class Site {
/**
/**
* set up httpProxy for this site
* set up httpProxy for this site
*
* @param httpProxy
* @param httpProxy
* @return
* @return
*/
*/
...
@@ -364,7 +405,8 @@ public class Site {
...
@@ -364,7 +405,8 @@ public class Site {
if
(
acceptStatCode
!=
null
?
!
acceptStatCode
.
equals
(
site
.
acceptStatCode
)
:
site
.
acceptStatCode
!=
null
)
if
(
acceptStatCode
!=
null
?
!
acceptStatCode
.
equals
(
site
.
acceptStatCode
)
:
site
.
acceptStatCode
!=
null
)
return
false
;
return
false
;
if
(
charset
!=
null
?
!
charset
.
equals
(
site
.
charset
)
:
site
.
charset
!=
null
)
return
false
;
if
(
charset
!=
null
?
!
charset
.
equals
(
site
.
charset
)
:
site
.
charset
!=
null
)
return
false
;
if
(
cookies
!=
null
?
!
cookies
.
equals
(
site
.
cookies
)
:
site
.
cookies
!=
null
)
return
false
;
if
(
defaultCookies
!=
null
?
!
defaultCookies
.
equals
(
site
.
defaultCookies
)
:
site
.
defaultCookies
!=
null
)
return
false
;
if
(
domain
!=
null
?
!
domain
.
equals
(
site
.
domain
)
:
site
.
domain
!=
null
)
return
false
;
if
(
domain
!=
null
?
!
domain
.
equals
(
site
.
domain
)
:
site
.
domain
!=
null
)
return
false
;
if
(
headers
!=
null
?
!
headers
.
equals
(
site
.
headers
)
:
site
.
headers
!=
null
)
return
false
;
if
(
headers
!=
null
?
!
headers
.
equals
(
site
.
headers
)
:
site
.
headers
!=
null
)
return
false
;
if
(
startRequests
!=
null
?
!
startRequests
.
equals
(
site
.
startRequests
)
:
site
.
startRequests
!=
null
)
if
(
startRequests
!=
null
?
!
startRequests
.
equals
(
site
.
startRequests
)
:
site
.
startRequests
!=
null
)
...
@@ -378,7 +420,7 @@ public class Site {
...
@@ -378,7 +420,7 @@ public class Site {
public
int
hashCode
()
{
public
int
hashCode
()
{
int
result
=
domain
!=
null
?
domain
.
hashCode
()
:
0
;
int
result
=
domain
!=
null
?
domain
.
hashCode
()
:
0
;
result
=
31
*
result
+
(
userAgent
!=
null
?
userAgent
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
userAgent
!=
null
?
userAgent
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
cookies
!=
null
?
c
ookies
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
defaultCookies
!=
null
?
defaultC
ookies
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
charset
!=
null
?
charset
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
charset
!=
null
?
charset
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
startRequests
!=
null
?
startRequests
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
startRequests
!=
null
?
startRequests
.
hashCode
()
:
0
);
result
=
31
*
result
+
sleepTime
;
result
=
31
*
result
+
sleepTime
;
...
@@ -395,7 +437,7 @@ public class Site {
...
@@ -395,7 +437,7 @@ public class Site {
return
"Site{"
+
return
"Site{"
+
"domain='"
+
domain
+
'\''
+
"domain='"
+
domain
+
'\''
+
", userAgent='"
+
userAgent
+
'\''
+
", userAgent='"
+
userAgent
+
'\''
+
", cookies="
+
c
ookies
+
", cookies="
+
defaultC
ookies
+
", charset='"
+
charset
+
'\''
+
", charset='"
+
charset
+
'\''
+
", startRequests="
+
startRequests
+
", startRequests="
+
startRequests
+
", sleepTime="
+
sleepTime
+
", sleepTime="
+
sleepTime
+
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java
0 → 100644
View file @
8ba2da14
package
us
.
codecraft
.
webmagic
.
constant
;
/**
* Some constants of the HTTP protocol.
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public
abstract
class
HttpConstant
{
public
static
abstract
class
Method
{
public
static
final
String
GET
=
"GET"
;
public
static
final
String
HEAD
=
"HEAD"
;
public
static
final
String
POST
=
"POST"
;
public
static
final
String
PUT
=
"PUT"
;
public
static
final
String
DELETE
=
"DELETE"
;
public
static
final
String
TRACE
=
"TRACE"
;
public
static
final
String
CONNECT
=
"CONNECT"
;
}
public
static
abstract
class
Header
{
public
static
final
String
REFERER
=
"Referer"
;
public
static
final
String
USER_AGENT
=
"User-Agent"
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
8ba2da14
...
@@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page;
...
@@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.constant.HttpConstant
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
@@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
else
{
}
else
{
acceptStatCode
=
Sets
.
newHashSet
(
200
);
acceptStatCode
=
Sets
.
newHashSet
(
200
);
}
}
logger
.
info
(
"downloading page {}"
,
request
.
getUrl
());
logger
.
info
(
"downloading page {}"
,
request
.
getUrl
());
CloseableHttpResponse
httpResponse
=
null
;
CloseableHttpResponse
httpResponse
=
null
;
try
{
try
{
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
);
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
);
...
@@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
protected
HttpUriRequest
getHttpUriRequest
(
Request
request
,
Site
site
,
Map
<
String
,
String
>
headers
)
{
protected
HttpUriRequest
getHttpUriRequest
(
Request
request
,
Site
site
,
Map
<
String
,
String
>
headers
)
{
RequestBuilder
requestBuilder
=
RequestBuilder
.
get
(
).
setUri
(
request
.
getUrl
());
RequestBuilder
requestBuilder
=
selectRequestMethod
(
request
.
getMethod
()
).
setUri
(
request
.
getUrl
());
if
(
headers
!=
null
)
{
if
(
headers
!=
null
)
{
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
requestBuilder
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
requestBuilder
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
...
@@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader {
return
requestBuilder
.
build
();
return
requestBuilder
.
build
();
}
}
protected
RequestBuilder
selectRequestMethod
(
String
method
)
{
if
(
method
==
null
||
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
GET
))
{
//default get
return
RequestBuilder
.
get
();
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
return
RequestBuilder
.
post
();
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
return
RequestBuilder
.
head
();
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
return
RequestBuilder
.
put
();
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
DELETE
))
{
return
RequestBuilder
.
delete
();
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
TRACE
))
{
return
RequestBuilder
.
trace
();
}
throw
new
IllegalArgumentException
(
"Illegal HTTP Method "
+
method
);
}
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
(),
charset
);
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
(),
charset
);
Page
page
=
new
Page
();
Page
page
=
new
Page
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
8ba2da14
...
@@ -36,7 +36,7 @@ public class HttpClientGenerator {
...
@@ -36,7 +36,7 @@ public class HttpClientGenerator {
connectionManager
.
setDefaultMaxPerRoute
(
100
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
}
}
public
HttpClientGenerator
setPoolSize
(
int
poolSize
){
public
HttpClientGenerator
setPoolSize
(
int
poolSize
)
{
connectionManager
.
setMaxTotal
(
poolSize
);
connectionManager
.
setMaxTotal
(
poolSize
);
return
this
;
return
this
;
}
}
...
@@ -76,10 +76,15 @@ public class HttpClientGenerator {
...
@@ -76,10 +76,15 @@ public class HttpClientGenerator {
private
void
generateCookie
(
HttpClientBuilder
httpClientBuilder
,
Site
site
)
{
private
void
generateCookie
(
HttpClientBuilder
httpClientBuilder
,
Site
site
)
{
CookieStore
cookieStore
=
new
BasicCookieStore
();
CookieStore
cookieStore
=
new
BasicCookieStore
();
if
(
site
.
getCookies
()
!=
null
)
{
for
(
Map
.
Entry
<
String
,
String
>
cookieEntry
:
site
.
getCookies
().
entrySet
())
{
for
(
Map
.
Entry
<
String
,
String
>
cookieEntry
:
site
.
getCookies
().
entrySet
())
{
BasicClientCookie
cookie
=
new
BasicClientCookie
(
cookieEntry
.
getKey
(),
cookieEntry
.
getValue
());
cookie
.
setDomain
(
site
.
getDomain
());
cookieStore
.
addCookie
(
cookie
);
}
for
(
Map
.
Entry
<
String
,
Map
<
String
,
String
>>
domainEntry
:
site
.
getAllCookies
().
entrySet
())
{
for
(
Map
.
Entry
<
String
,
String
>
cookieEntry
:
domainEntry
.
getValue
().
entrySet
())
{
BasicClientCookie
cookie
=
new
BasicClientCookie
(
cookieEntry
.
getKey
(),
cookieEntry
.
getValue
());
BasicClientCookie
cookie
=
new
BasicClientCookie
(
cookieEntry
.
getKey
(),
cookieEntry
.
getValue
());
cookie
.
setDomain
(
site
.
getDomain
());
cookie
.
setDomain
(
domainEntry
.
getKey
());
cookieStore
.
addCookie
(
cookie
);
cookieStore
.
addCookie
(
cookie
);
}
}
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java
View file @
8ba2da14
...
@@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche
...
@@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche
@Override
@Override
public
void
push
(
Request
request
,
Task
task
)
{
public
void
push
(
Request
request
,
Task
task
)
{
logger
.
trace
(
"get a candidate url {}"
,
request
.
getUrl
());
logger
.
trace
(
"get a candidate url {}"
,
request
.
getUrl
());
if
(
urls
.
add
(
request
.
getUrl
()
)
||
shouldReserved
(
request
))
{
if
(
isDuplicate
(
request
)
||
shouldReserved
(
request
))
{
logger
.
debug
(
"push to queue {}"
,
request
.
getUrl
());
logger
.
debug
(
"push to queue {}"
,
request
.
getUrl
());
pushWhenNoDuplicate
(
request
,
task
);
pushWhenNoDuplicate
(
request
,
task
);
}
}
}
}
protected
boolean
isDuplicate
(
Request
request
)
{
return
urls
.
add
(
request
.
getUrl
());
}
protected
boolean
shouldReserved
(
Request
request
)
{
protected
boolean
shouldReserved
(
Request
request
)
{
return
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
)
!=
null
;
return
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
)
!=
null
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment