Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
edfc319c
Commit
edfc319c
authored
Nov 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update httpclient to 4.3.1
parent
160a149b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
108 additions
and
120 deletions
+108
-120
pom.xml
pom.xml
+1
-1
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+52
-77
HttpClientPool.java
...java/us/codecraft/webmagic/downloader/HttpClientPool.java
+55
-42
No files found.
pom.xml
View file @
edfc319c
...
@@ -61,7 +61,7 @@
...
@@ -61,7 +61,7 @@
<dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
<artifactId>
httpclient
</artifactId>
<version>
4.
2.4
</version>
<version>
4.
3.1
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.google.guava
</groupId>
<groupId>
com.google.guava
</groupId>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
edfc319c
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HeaderElement
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.util.EntityUtils
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
...
@@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
...
@@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
HttpClientPool
httpClientPool
;
private
volatile
CloseableHttpClient
httpClient
;
private
int
poolSize
=
1
;
private
int
poolSize
=
1
;
...
@@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
...
@@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
return
(
Html
)
page
.
getHtml
();
return
(
Html
)
page
.
getHtml
();
}
}
private
HttpClientPool
getHttpClientPool
(){
private
CloseableHttpClient
getHttpClient
(
Site
site
)
{
if
(
httpClientPool
==
null
){
if
(
httpClient
==
null
)
{
httpClientPool
=
new
HttpClientPool
(
poolSize
);
synchronized
(
this
)
{
if
(
httpClient
==
null
)
{
httpClient
=
new
HttpClientPool
(
poolSize
).
getClient
(
site
);
}
}
return
httpClientPool
;
}
}
return
httpClient
;
}
}
@Override
@Override
...
@@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
...
@@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
if
(
task
!=
null
)
{
if
(
task
!=
null
)
{
site
=
task
.
getSite
();
site
=
task
.
getSite
();
}
}
int
retryTimes
=
0
;
Set
<
Integer
>
acceptStatCode
;
Set
<
Integer
>
acceptStatCode
;
String
charset
=
null
;
String
charset
=
null
;
Map
<
String
,
String
>
headers
=
null
;
Map
<
String
,
String
>
headers
=
null
;
if
(
site
!=
null
)
{
if
(
site
!=
null
)
{
retryTimes
=
site
.
getRetryTimes
();
acceptStatCode
=
site
.
getAcceptStatCode
();
acceptStatCode
=
site
.
getAcceptStatCode
();
charset
=
site
.
getCharset
();
charset
=
site
.
getCharset
();
headers
=
site
.
getHeaders
();
headers
=
site
.
getHeaders
();
...
@@ -87,31 +87,45 @@ public class HttpClientDownloader implements Downloader {
...
@@ -87,31 +87,45 @@ public class HttpClientDownloader implements Downloader {
acceptStatCode
.
add
(
200
);
acceptStatCode
.
add
(
200
);
}
}
logger
.
info
(
"downloading page "
+
request
.
getUrl
());
logger
.
info
(
"downloading page "
+
request
.
getUrl
());
HttpClient
httpClient
=
getHttpClientPool
().
getClient
(
site
);
try
{
HttpGet
httpGet
=
new
HttpGet
(
request
.
getUrl
());
HttpGet
httpGet
=
new
HttpGet
(
request
.
getUrl
());
if
(
headers
!=
null
)
{
if
(
headers
!=
null
){
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
httpGet
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
httpGet
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
}
}
}
}
if
(!
httpGet
.
containsHeader
(
"Accept-Encoding"
))
{
CloseableHttpResponse
httpResponse
=
null
;
httpGet
.
addHeader
(
"Accept-Encoding"
,
"gzip"
);
}
HttpResponse
httpResponse
=
null
;
int
tried
=
0
;
boolean
retry
;
do
{
try
{
try
{
httpResponse
=
httpClient
.
execute
(
httpGet
);
httpResponse
=
getHttpClient
(
site
).
execute
(
httpGet
);
retry
=
false
;
int
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
if
(
acceptStatCode
.
contains
(
statusCode
))
{
//charset
if
(
charset
==
null
)
{
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
charset
=
UrlUtils
.
getCharset
(
value
);
}
return
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
}
else
{
logger
.
warn
(
"code error "
+
statusCode
+
"\t"
+
request
.
getUrl
());
return
null
;
}
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
tried
++;
if
(
tried
>
retryTimes
)
{
logger
.
warn
(
"download page "
+
request
.
getUrl
()
+
" error"
,
e
);
logger
.
warn
(
"download page "
+
request
.
getUrl
()
+
" error"
,
e
);
if
(
site
.
getCycleRetryTimes
()
>
0
)
{
if
(
site
.
getCycleRetryTimes
()
>
0
)
{
return
addToCycleRetry
(
request
,
site
);
}
return
null
;
}
finally
{
try
{
if
(
httpResponse
!=
null
)
{
httpResponse
.
close
();
}
}
catch
(
IOException
e
)
{
logger
.
warn
(
"close response fail"
,
e
);
}
}
}
private
Page
addToCycleRetry
(
Request
request
,
Site
site
)
{
Page
page
=
new
Page
();
Page
page
=
new
Page
();
Object
cycleTriedTimesObject
=
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
);
Object
cycleTriedTimesObject
=
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
);
if
(
cycleTriedTimesObject
==
null
)
{
if
(
cycleTriedTimesObject
==
null
)
{
...
@@ -126,33 +140,9 @@ public class HttpClientDownloader implements Downloader {
...
@@ -126,33 +140,9 @@ public class HttpClientDownloader implements Downloader {
}
}
return
page
;
return
page
;
}
}
return
null
;
}
logger
.
info
(
"download page "
+
request
.
getUrl
()
+
" error, retry the "
+
tried
+
" time!"
);
retry
=
true
;
}
}
while
(
retry
);
int
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
if
(
acceptStatCode
.
contains
(
statusCode
))
{
handleGzip
(
httpResponse
);
//charset
if
(
charset
==
null
)
{
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
charset
=
UrlUtils
.
getCharset
(
value
);
}
return
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
}
else
{
logger
.
warn
(
"code error "
+
statusCode
+
"\t"
+
request
.
getUrl
());
}
}
catch
(
Exception
e
)
{
logger
.
warn
(
"download page "
+
request
.
getUrl
()
+
" error"
,
e
);
}
return
null
;
}
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
(),
String
content
=
EntityUtils
.
toString
(
httpResponse
.
getEntity
(),
charset
);
charset
);
Page
page
=
new
Page
();
Page
page
=
new
Page
();
page
.
setHtml
(
new
Html
(
UrlUtils
.
fixAllRelativeHrefs
(
content
,
request
.
getUrl
())));
page
.
setHtml
(
new
Html
(
UrlUtils
.
fixAllRelativeHrefs
(
content
,
request
.
getUrl
())));
page
.
setUrl
(
new
PlainText
(
request
.
getUrl
()));
page
.
setUrl
(
new
PlainText
(
request
.
getUrl
()));
...
@@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
...
@@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
@Override
@Override
public
void
setThread
(
int
thread
)
{
public
void
setThread
(
int
thread
)
{
poolSize
=
thread
;
poolSize
=
thread
;
httpClientPool
=
new
HttpClientPool
(
thread
);
}
private
void
handleGzip
(
HttpResponse
httpResponse
)
{
Header
ceheader
=
httpResponse
.
getEntity
().
getContentEncoding
();
if
(
ceheader
!=
null
)
{
HeaderElement
[]
codecs
=
ceheader
.
getElements
();
for
(
HeaderElement
codec
:
codecs
)
{
if
(
codec
.
getName
().
equalsIgnoreCase
(
"gzip"
))
{
//todo bugfix
httpResponse
.
setEntity
(
new
GzipDecompressingEntity
(
httpResponse
.
getEntity
()));
}
}
}
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
View file @
edfc319c
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.
HttpVersion
;
import
org.apache.http.
*
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.params.ClientPNames
;
import
org.apache.http.config.Registry
;
import
org.apache.http.client.params.CookiePolicy
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.conn.scheme.PlainSocketFactory
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
import
org.apache.http.conn.scheme.Scheme
;
import
org.apache.http.conn.socket.PlainConnectionSocketFactory
;
import
org.apache.http.conn.scheme.SchemeRegistry
;
import
org.apache.http.conn.ssl.SSLConnectionSocketFactory
;
import
org.apache.http.conn.ssl.SSLSocketFactory
;
import
org.apache.http.impl.client.*
;
import
org.apache.http.impl.client.BasicCookieStore
;
import
org.apache.http.impl.conn.PoolingHttpClientConnectionManager
;
import
org.apache.http.impl.client.DefaultHttpClient
;
import
org.apache.http.impl.conn.PoolingClientConnectionManager
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.p
arams.*
;
import
org.apache.http.p
rotocol.HttpContext
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
java.io.IOException
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.
1.0
* @since 0.
3.3
*/
*/
public
class
HttpClientPool
{
public
class
HttpClientPool
{
private
int
poolSize
;
private
PoolingHttpClientConnectionManager
connectionManager
;
private
PoolingClientConnectionManager
connectionManager
;
public
HttpClientPool
(
int
poolSize
)
{
public
HttpClientPool
(
int
poolSize
)
{
this
.
poolSize
=
poolSize
;
Registry
<
ConnectionSocketFactory
>
reg
=
RegistryBuilder
.<
ConnectionSocketFactory
>
create
()
SchemeRegistry
schemeRegistry
=
new
SchemeRegistry
();
.
register
(
"http"
,
PlainConnectionSocketFactory
.
INSTANCE
)
schemeRegistry
.
register
(
new
Scheme
(
"http"
,
80
,
PlainSocketFactory
.
getSocketFactory
()));
.
register
(
"https"
,
SSLConnectionSocketFactory
.
getSocketFactory
())
schemeRegistry
.
register
(
new
Scheme
(
"https"
,
443
,
SSLSocketFactory
.
getSocketFactory
()));
.
build
();
PoolingHttpClientConnectionManager
connectionManager
=
new
PoolingHttpClientConnectionManager
(
reg
);
connectionManager
=
new
PoolingClientConnectionManager
(
schemeRegistry
);
connectionManager
.
setMaxTotal
(
poolSize
);
connectionManager
.
setMaxTotal
(
poolSize
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
}
}
public
HttpClient
getClient
(
Site
site
)
{
public
Closeable
HttpClient
getClient
(
Site
site
)
{
return
generateClient
(
site
);
return
generateClient
(
site
);
}
}
private
HttpClient
generateClient
(
Site
site
)
{
private
Closeable
HttpClient
generateClient
(
Site
site
)
{
Http
Params
params
=
new
BasicHttpParams
(
);
Http
ClientBuilder
httpClientBuilder
=
HttpClients
.
custom
().
setConnectionManager
(
connectionManager
);
if
(
site
!=
null
&&
site
.
getUserAgent
()
!=
null
)
{
if
(
site
!=
null
&&
site
.
getUserAgent
()
!=
null
)
{
params
.
setParameter
(
CoreProtocolPNames
.
USER_AGENT
,
site
.
getUserAgent
());
httpClientBuilder
.
setUserAgent
(
site
.
getUserAgent
());
params
.
setIntParameter
(
CoreConnectionPNames
.
SO_TIMEOUT
,
site
.
getTimeOut
());
params
.
setIntParameter
(
CoreConnectionPNames
.
CONNECTION_TIMEOUT
,
site
.
getTimeOut
());
}
else
{
}
else
{
params
.
setIntParameter
(
CoreConnectionPNames
.
SO_TIMEOUT
,
3000
);
httpClientBuilder
.
setUserAgent
(
""
);
params
.
setIntParameter
(
CoreConnectionPNames
.
CONNECTION_TIMEOUT
,
3000
);
}
}
httpClientBuilder
.
addInterceptorFirst
(
new
HttpRequestInterceptor
()
{
params
.
setParameter
(
ClientPNames
.
COOKIE_POLICY
,
CookiePolicy
.
BEST_MATCH
);
public
void
process
(
HttpProtocolParamBean
paramsBean
=
new
HttpProtocolParamBean
(
params
);
final
HttpRequest
request
,
paramsBean
.
setVersion
(
HttpVersion
.
HTTP_1_1
);
final
HttpContext
context
)
throws
HttpException
,
IOException
{
if
(
site
!=
null
&&
site
.
getCharset
()
!=
null
)
{
if
(!
request
.
containsHeader
(
"Accept-Encoding"
)
)
{
paramsBean
.
setContentCharset
(
site
.
getCharset
()
);
request
.
addHeader
(
"Accept-Encoding"
,
"gzip"
);
}
}
paramsBean
.
setUseExpectContinue
(
false
);
DefaultHttpClient
httpClient
=
new
DefaultHttpClient
(
connectionManager
,
params
);
if
(
site
!=
null
)
{
generateCookie
(
httpClient
,
site
);
}
}
return
httpClient
;
}).
addInterceptorFirst
(
new
HttpResponseInterceptor
()
{
public
void
process
(
final
HttpResponse
response
,
final
HttpContext
context
)
throws
HttpException
,
IOException
{
HttpEntity
entity
=
response
.
getEntity
();
if
(
entity
!=
null
)
{
Header
ceheader
=
entity
.
getContentEncoding
();
if
(
ceheader
!=
null
)
{
HeaderElement
[]
codecs
=
ceheader
.
getElements
();
for
(
int
i
=
0
;
i
<
codecs
.
length
;
i
++)
{
if
(
codecs
[
i
].
getName
().
equalsIgnoreCase
(
"gzip"
))
{
response
.
setEntity
(
new
GzipDecompressingEntity
(
response
.
getEntity
()));
return
;
}
}
}
}
}
});
httpClientBuilder
.
setRetryHandler
(
new
DefaultHttpRequestRetryHandler
(
site
.
getRetryTimes
(),
true
));
return
httpClientBuilder
.
build
();
}
}
private
void
generateCookie
(
DefaultHttpClient
httpClient
,
Site
site
)
{
private
void
generateCookie
(
DefaultHttpClient
httpClient
,
Site
site
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment