Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
16e12e3b
Commit
16e12e3b
authored
Oct 11, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#27 customize http header for downloader
parent
1a2c84ea
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
6 deletions
+39
-6
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+29
-3
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+8
-0
HttpClientPool.java
...java/us/codecraft/webmagic/downloader/HttpClientPool.java
+2
-3
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
16e12e3b
...
...
@@ -8,8 +8,8 @@ import java.util.*;
* Object contains setting for crawler.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public
class
Site
{
...
...
@@ -38,6 +38,14 @@ public class Site {
private
Set
<
Integer
>
acceptStatCode
=
DEFAULT_STATUS_CODE_SET
;
private
Map
<
String
,
String
>
headers
=
new
HashMap
<
String
,
String
>();
public
static
interface
HeaderConst
{
public
static
final
String
REFERER
=
"Referer"
;
}
static
{
DEFAULT_STATUS_CODE_SET
.
add
(
200
);
}
...
...
@@ -139,10 +147,12 @@ public class Site {
/**
* set timeout for downloader in ms
*
* @param timeOut
*/
public
void
setTimeOut
(
int
timeOut
)
{
public
Site
setTimeOut
(
int
timeOut
)
{
this
.
timeOut
=
timeOut
;
return
this
;
}
/**
...
...
@@ -216,7 +226,7 @@ public class Site {
}
/**
* Get retry times
when download fail immediately
, 0 by default.<br>
* Get retry times
immediately when download fails
, 0 by default.<br>
*
* @return retry times when download fails
*/
...
...
@@ -224,6 +234,22 @@ public class Site {
return
retryTimes
;
}
/**
 * Returns the HTTP headers configured on this site.
 *
 * @return the map of header name to header value held by this site; this is
 *         the site's own map, not a copy
 */
public Map<String, String> getHeaders() {
    return this.headers;
}
/**
 * Registers an HTTP header for the downloader to send with its requests.
 * Adding a header under a key that is already present replaces the earlier value.<br>
 * Cookies and the user agent have dedicated methods: use
 * {@link #addCookie(String, String)} and {@link #setUserAgent(String)} for those.
 *
 * @param key   name of the HTTP header; {@link HeaderConst} holds constants for some names
 * @param value value of the header
 * @return this site, so that calls can be chained
 */
public Site addHeader(String key, String value) {
    this.headers.put(key, value);
    return this;
}
/**
* Set retry times when download fails, 0 by default.<br>
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
16e12e3b
...
...
@@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import
java.io.IOException
;
import
java.util.HashSet
;
import
java.util.Map
;
import
java.util.Set
;
...
...
@@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader {
int
retryTimes
=
0
;
Set
<
Integer
>
acceptStatCode
;
String
charset
=
null
;
Map
<
String
,
String
>
headers
=
null
;
if
(
site
!=
null
)
{
retryTimes
=
site
.
getRetryTimes
();
acceptStatCode
=
site
.
getAcceptStatCode
();
charset
=
site
.
getCharset
();
headers
=
site
.
getHeaders
();
}
else
{
acceptStatCode
=
new
HashSet
<
Integer
>();
acceptStatCode
.
add
(
200
);
...
...
@@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader {
HttpClient
httpClient
=
HttpClientPool
.
getInstance
(
poolSize
).
getClient
(
site
);
try
{
HttpGet
httpGet
=
new
HttpGet
(
request
.
getUrl
());
if
(
headers
!=
null
){
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
httpGet
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
}
}
HttpResponse
httpResponse
=
null
;
int
tried
=
0
;
boolean
retry
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
View file @
16e12e3b
...
...
@@ -54,7 +54,7 @@ public class HttpClientPool {
}
params
.
setIntParameter
(
CoreConnectionPNames
.
SO_TIMEOUT
,
site
.
getTimeOut
());
params
.
setIntParameter
(
CoreConnectionPNames
.
CONNECTION_TIMEOUT
,
site
.
getTimeOut
());
params
.
setParameter
(
ClientPNames
.
COOKIE_POLICY
,
CookiePolicy
.
BEST_MATCH
);
HttpProtocolParamBean
paramsBean
=
new
HttpProtocolParamBean
(
params
);
paramsBean
.
setVersion
(
HttpVersion
.
HTTP_1_1
);
if
(
site
!=
null
&&
site
.
getCharset
()
!=
null
)
{
...
...
@@ -73,8 +73,7 @@ public class HttpClientPool {
if
(
site
!=
null
)
{
generateCookie
(
httpClient
,
site
);
}
httpClient
.
getParams
().
setIntParameter
(
"http.socket.timeout"
,
60000
);
httpClient
.
getParams
().
setParameter
(
ClientPNames
.
COOKIE_POLICY
,
CookiePolicy
.
BEST_MATCH
);
return
httpClient
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment