Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
a7f9e7ca
Commit
a7f9e7ca
authored
Mar 18, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
重构一部分httpclient
parent
221c1550
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
391 additions
and
342 deletions
+391
-342
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+3
-3
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+36
-99
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+3
-26
HttpUriRequestConverter.java
...odecraft/webmagic/downloader/HttpUriRequestConverter.java
+98
-0
Proxy.java
...core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+17
-169
ProxyHost.java
.../src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java
+34
-0
ProxyPool.java
.../src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+6
-3
TimerReuseProxy.java
...ain/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
+163
-0
TimerReuseProxyPool.java
...java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
+18
-28
ProxyUtils.java
...src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
+12
-13
ProxyTest.java
.../src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
a7f9e7ca
...
...
@@ -4,7 +4,7 @@ import org.apache.http.HttpHost;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.ProxyPool
;
import
us.codecraft.webmagic.proxy.
Simpl
eProxyPool
;
import
us.codecraft.webmagic.proxy.
TimerReus
eProxyPool
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.*
;
...
...
@@ -487,12 +487,12 @@ public class Site {
* @return this
*/
public
Site
setHttpProxyPool
(
List
<
String
[]>
httpProxyList
,
boolean
isUseLastProxy
)
{
this
.
httpProxyPool
=
new
Simpl
eProxyPool
(
httpProxyList
,
isUseLastProxy
);
this
.
httpProxyPool
=
new
TimerReus
eProxyPool
(
httpProxyList
,
isUseLastProxy
);
return
this
;
}
public
Site
enableHttpProxyPool
()
{
this
.
httpProxyPool
=
new
Simpl
eProxyPool
();
this
.
httpProxyPool
=
new
TimerReus
eProxyPool
();
return
this
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
a7f9e7ca
...
...
@@ -3,16 +3,16 @@ package us.codecraft.webmagic.downloader;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.auth.AuthState
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.client.protocol.HttpClientContext
;
import
org.apache.http.impl.auth.BasicScheme
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.message.BasicNameValuePair
;
import
org.apache.http.protocol.BasicHttpContext
;
import
org.apache.http.protocol.HttpContext
;
import
org.apache.http.util.EntityUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -23,12 +23,13 @@ import us.codecraft.webmagic.Task;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.CharsetUtils
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.*
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Set
;
/**
...
...
@@ -46,9 +47,15 @@ public class HttpClientDownloader extends AbstractDownloader {
private
HttpClientGenerator
httpClientGenerator
=
new
HttpClientGenerator
();
private
CloseableHttpClient
getHttpClient
(
Site
site
,
Proxy
proxy
)
{
private
HttpUriRequestConverter
httpUriRequestConverter
=
new
HttpUriRequestConverter
();
public
void
setHttpUriRequestConverter
(
HttpUriRequestConverter
httpUriRequestConverter
)
{
this
.
httpUriRequestConverter
=
httpUriRequestConverter
;
}
private
CloseableHttpClient
getHttpClient
(
Site
site
)
{
if
(
site
==
null
)
{
return
httpClientGenerator
.
getClient
(
null
,
proxy
);
return
httpClientGenerator
.
getClient
(
null
);
}
String
domain
=
site
.
getDomain
();
CloseableHttpClient
httpClient
=
httpClients
.
get
(
domain
);
...
...
@@ -56,7 +63,7 @@ public class HttpClientDownloader extends AbstractDownloader {
synchronized
(
this
)
{
httpClient
=
httpClients
.
get
(
domain
);
if
(
httpClient
==
null
)
{
httpClient
=
httpClientGenerator
.
getClient
(
site
,
proxy
);
httpClient
=
httpClientGenerator
.
getClient
(
site
);
httpClients
.
put
(
domain
,
httpClient
);
}
}
...
...
@@ -66,35 +73,31 @@ public class HttpClientDownloader extends AbstractDownloader {
@Override
public
Page
download
(
Request
request
,
Task
task
)
{
Site
site
=
null
;
if
(
task
!=
null
)
{
site
=
task
.
getSite
();
}
Set
<
Integer
>
acceptStatCode
;
String
charset
=
null
;
Map
<
String
,
String
>
headers
=
null
;
if
(
site
!=
null
)
{
acceptStatCode
=
site
.
getAcceptStatCode
();
charset
=
site
.
getCharset
();
headers
=
site
.
getHeaders
();
}
else
{
acceptStatCode
=
WMCollections
.
newHashSet
(
200
);
if
(
task
==
null
||
task
.
getSite
()
==
null
)
{
throw
new
NullPointerException
(
"task or site can not be null"
);
}
logger
.
info
(
"downloading page {}"
,
request
.
getUrl
());
logger
.
debug
(
"downloading page {}"
,
request
.
getUrl
());
CloseableHttpResponse
httpResponse
=
null
;
int
statusCode
=
0
;
Site
site
=
task
.
getSite
();
try
{
HttpHost
proxyHost
=
null
;
Proxy
proxy
=
null
;
//TODO
if
(
site
!=
null
&&
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
Proxy
proxy
=
null
;
if
(
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
proxy
=
site
.
getHttpProxyFromPool
();
proxyHost
=
proxy
.
getHttpHost
();
}
else
if
(
site
!=
null
&&
site
.
getHttpProxy
()
!=
null
){
proxyHost
=
site
.
getHttpProxy
();
proxy
=
site
.
getHttpProxy
();
request
.
putExtra
(
Request
.
PROXY
,
site
.
getHttpProxy
());
}
request
.
putExtra
(
Request
.
PROXY
,
proxy
);
HttpContext
httpContext
=
new
BasicHttpContext
();
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
);
HttpUriRequest
httpUriRequest
=
httpUriRequestConverter
.
convert
(
request
,
site
);
AuthState
authState
=
new
AuthState
();
authState
.
update
(
new
BasicScheme
(),
new
UsernamePasswordCredentials
(
"userName"
,
"password"
));
httpContext
.
setAttribute
(
HttpClientContext
.
PROXY_AUTH_STATE
,
authState
);
CloseableHttpClient
httpClient
=
getHttpClient
(
site
,
proxy
);
httpResponse
=
httpClient
.
execute
(
httpUriRequest
,
httpContext
);
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
...
...
@@ -134,72 +137,6 @@ public class HttpClientDownloader extends AbstractDownloader {
return
acceptStatCode
.
contains
(
statusCode
);
}
protected
HttpUriRequest
getHttpUriRequest
(
Request
request
,
Site
site
,
Map
<
String
,
String
>
headers
,
HttpHost
proxy
)
{
RequestBuilder
requestBuilder
=
selectRequestMethod
(
request
).
setUri
(
request
.
getUrl
());
if
(
headers
!=
null
)
{
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
requestBuilder
.
addHeader
(
headerEntry
.
getKey
(),
headerEntry
.
getValue
());
}
}
RequestConfig
.
Builder
requestConfigBuilder
=
RequestConfig
.
custom
();
if
(
site
!=
null
)
{
requestConfigBuilder
.
setConnectionRequestTimeout
(
site
.
getTimeOut
())
.
setSocketTimeout
(
site
.
getTimeOut
())
.
setConnectTimeout
(
site
.
getTimeOut
())
.
setCookieSpec
(
CookieSpecs
.
BEST_MATCH
);
}
if
(
proxy
!=
null
)
{
requestConfigBuilder
.
setProxy
(
proxy
);
request
.
putExtra
(
Request
.
PROXY
,
proxy
);
}
requestBuilder
.
setConfig
(
requestConfigBuilder
.
build
());
return
requestBuilder
.
build
();
}
protected
RequestBuilder
selectRequestMethod
(
Request
request
)
{
String
method
=
request
.
getMethod
();
if
(
method
==
null
||
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
GET
))
{
//default get
return
addQueryParams
(
RequestBuilder
.
get
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
POST
))
{
return
addFormParams
(
RequestBuilder
.
post
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
HEAD
))
{
return
addQueryParams
(
RequestBuilder
.
head
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
PUT
))
{
return
addFormParams
(
RequestBuilder
.
put
(),
(
NameValuePair
[])
request
.
getExtra
(
"nameValuePair"
),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
DELETE
))
{
return
addQueryParams
(
RequestBuilder
.
delete
(),
request
.
getParams
());
}
else
if
(
method
.
equalsIgnoreCase
(
HttpConstant
.
Method
.
TRACE
))
{
return
addQueryParams
(
RequestBuilder
.
trace
(),
request
.
getParams
());
}
throw
new
IllegalArgumentException
(
"Illegal HTTP Method "
+
method
);
}
private
RequestBuilder
addFormParams
(
RequestBuilder
requestBuilder
,
NameValuePair
[]
nameValuePair
,
Map
<
String
,
String
>
params
)
{
List
<
NameValuePair
>
allNameValuePair
=
new
ArrayList
<
NameValuePair
>();
if
(
nameValuePair
!=
null
&&
nameValuePair
.
length
>
0
)
{
allNameValuePair
=
Arrays
.
asList
(
nameValuePair
);
}
if
(
params
!=
null
)
{
for
(
String
key
:
params
.
keySet
())
{
allNameValuePair
.
add
(
new
BasicNameValuePair
(
key
,
params
.
get
(
key
)));
}
}
requestBuilder
.
setEntity
(
new
UrlEncodedFormEntity
(
allNameValuePair
,
Charset
.
forName
(
"utf8"
)));
return
requestBuilder
;
}
private
RequestBuilder
addQueryParams
(
RequestBuilder
requestBuilder
,
Map
<
String
,
String
>
params
)
{
if
(
params
!=
null
)
{
for
(
Map
.
Entry
<
String
,
String
>
entry
:
params
.
entrySet
())
{
requestBuilder
.
addParameter
(
entry
.
getKey
(),
entry
.
getValue
());
}
}
return
requestBuilder
;
}
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
String
content
=
getContent
(
charset
,
httpResponse
);
Page
page
=
new
Page
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpException
;
import
org.apache.http.HttpRequest
;
import
org.apache.http.HttpRequestInterceptor
;
import
org.apache.http.auth.AuthScope
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.CredentialsProvider
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.config.SocketConfig
;
...
...
@@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
javax.net.ssl.SSLContext
;
import
javax.net.ssl.TrustManager
;
...
...
@@ -92,31 +87,13 @@ public class HttpClientGenerator {
return
this
;
}
public
CloseableHttpClient
getClient
(
Site
site
,
Proxy
proxy
)
{
return
generateClient
(
site
,
proxy
);
public
CloseableHttpClient
getClient
(
Site
site
)
{
return
generateClient
(
site
);
}
private
CloseableHttpClient
generateClient
(
Site
site
,
Proxy
proxy
)
{
CredentialsProvider
credsProvider
=
null
;
private
CloseableHttpClient
generateClient
(
Site
site
)
{
HttpClientBuilder
httpClientBuilder
=
HttpClients
.
custom
();
if
(
proxy
!=
null
&&
StringUtils
.
isNotBlank
(
proxy
.
getUser
())
&&
StringUtils
.
isNotBlank
(
proxy
.
getPassword
()))
{
credsProvider
=
new
BasicCredentialsProvider
();
credsProvider
.
setCredentials
(
new
AuthScope
(
proxy
.
getHttpHost
().
getAddress
().
getHostAddress
(),
proxy
.
getHttpHost
().
getPort
()),
new
UsernamePasswordCredentials
(
proxy
.
getUser
(),
proxy
.
getPassword
()));
httpClientBuilder
.
setDefaultCredentialsProvider
(
credsProvider
);
}
if
(
site
!=
null
&&
site
.
getHttpProxy
()!=
null
&&
site
.
getUsernamePasswordCredentials
()
!=
null
){
credsProvider
=
new
BasicCredentialsProvider
();
credsProvider
.
setCredentials
(
new
AuthScope
(
site
.
getHttpProxy
()),
//可以访问的范围
site
.
getUsernamePasswordCredentials
());
//用户名和密码
httpClientBuilder
.
setDefaultCredentialsProvider
(
credsProvider
);
}
httpClientBuilder
.
setConnectionManager
(
connectionManager
);
if
(
site
!=
null
&&
site
.
getUserAgent
()
!=
null
)
{
httpClientBuilder
.
setUserAgent
(
site
.
getUserAgent
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
0 → 100644
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.HttpHost
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.UrlEncodedFormEntity
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.message.BasicNameValuePair
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
/**
 * Converts a WebMagic {@link Request} (plus the settings of its {@link Site}) into an
 * Apache HttpClient {@link HttpUriRequest}.
 *
 * <p>The HTTP method is chosen from {@link Request#getMethod()}; GET is used when no
 * method is set. Request params are sent as query parameters for GET/HEAD/DELETE/TRACE
 * and as a URL-encoded form body for POST/PUT.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 11:28 AM
 */
public class HttpUriRequestConverter {

    /** Key of the request extra that may carry an explicit {@code NameValuePair[]} form body. */
    private static final String NAME_VALUE_PAIR_EXTRA = "nameValuePair";

    /**
     * Converts a request that is not routed through a proxy.
     *
     * @param request the request to convert
     * @param site    site settings (headers, timeouts); may be {@code null}
     * @return the HttpClient request, never {@code null}
     * @throws IllegalArgumentException if the request's HTTP method is not supported
     */
    public HttpUriRequest convert(Request request, Site site) {
        return convert(request, site, null);
    }

    /**
     * Converts a request, optionally routing it through the given proxy.
     *
     * <p>Previously this method was a stub returning {@code null}; it now delegates to
     * the request-building logic in this class.
     *
     * @param request the request to convert
     * @param site    site settings (headers, timeouts); may be {@code null}
     * @param proxy   proxy to route through; {@code null} (or a proxy without a host) means no proxy
     * @return the HttpClient request, never {@code null}
     * @throws IllegalArgumentException if the request's HTTP method is not supported
     */
    public HttpUriRequest convert(Request request, Site site, Proxy proxy) {
        Map<String, String> headers = site != null ? site.getHeaders() : null;
        HttpHost proxyHost = null;
        if (proxy != null && proxy.getProxyHost() != null) {
            proxyHost = new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort());
        }
        return getHttpUriRequest(request, site, headers, proxyHost);
    }

    /**
     * Builds the HttpClient request: method + URI, headers, timeouts/cookie spec and proxy.
     *
     * @param request the request to convert
     * @param site    source of the timeout settings; may be {@code null}
     * @param headers headers to add to the request; may be {@code null}
     * @param proxy   proxy host to set on the request config; may be {@code null}
     * @return the built request
     */
    private HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
        if (headers != null) {
            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        if (site != null) {
            // The single site timeout is applied to all three HttpClient timeouts.
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut())
                    .setCookieSpec(CookieSpecs.BEST_MATCH);
        }
        if (proxy != null) {
            requestConfigBuilder.setProxy(proxy);
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        return requestBuilder.build();
    }

    /**
     * Picks the {@link RequestBuilder} matching the request's HTTP method and attaches
     * the request parameters to it.
     *
     * @param request the request whose method and params are read
     * @return a builder for the selected method
     * @throws IllegalArgumentException if the method is none of GET, POST, HEAD, PUT, DELETE, TRACE
     */
    private RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return addQueryParams(RequestBuilder.get(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            return addFormParams(RequestBuilder.post(),
                    (NameValuePair[]) request.getExtra(NAME_VALUE_PAIR_EXTRA), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return addQueryParams(RequestBuilder.head(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return addFormParams(RequestBuilder.put(),
                    (NameValuePair[]) request.getExtra(NAME_VALUE_PAIR_EXTRA), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return addQueryParams(RequestBuilder.delete(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return addQueryParams(RequestBuilder.trace(), request.getParams());
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Sets a URL-encoded form entity made of the explicit name/value pairs followed by
     * the request params.
     *
     * @param requestBuilder builder to set the entity on
     * @param nameValuePair  explicit pairs; may be {@code null} or empty
     * @param params         request params appended after the explicit pairs; may be {@code null}
     * @return the same builder, for chaining
     */
    private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
        List<NameValuePair> allNameValuePair = new ArrayList<NameValuePair>();
        if (nameValuePair != null && nameValuePair.length > 0) {
            // Copy into the growable list: Arrays.asList alone is fixed-size, so the
            // add() calls below would throw UnsupportedOperationException.
            allNameValuePair.addAll(Arrays.asList(nameValuePair));
        }
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                allNameValuePair.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
        return requestBuilder;
    }

    /**
     * Adds every request param to the builder as a request parameter.
     *
     * @param requestBuilder builder to add the parameters to
     * @param params         request params; may be {@code null}
     * @return the same builder, for chaining
     */
    private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                requestBuilder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        return requestBuilder;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
proxy
;
import
org.apache.http.HttpHost
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.Delayed
;
import
java.util.concurrent.TimeUnit
;
/**
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--+---+
|
v
+--------+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Respone Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
+--------+
*/
/**
* Object has these status of lifecycle above.<br>
*
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see SimpleProxyPool
*/
public
class
Proxy
implements
Delayed
,
Serializable
{
private
static
final
long
serialVersionUID
=
228939737383625551L
;
public
static
final
int
ERROR_403
=
403
;
public
static
final
int
ERROR_404
=
404
;
public
static
final
int
ERROR_BANNED
=
10000
;
// banned by website
public
static
final
int
ERROR_Proxy
=
10001
;
// the proxy itself failed
public
static
final
int
SUCCESS
=
200
;
public
class
Proxy
{
private
final
HttpHost
http
Host
;
private
ProxyHost
proxy
Host
;
private
String
user
;
private
String
password
;
private
int
reuseTimeInterval
=
1500
;
// ms
private
Long
canReuseTime
=
0L
;
private
Long
lastBorrowTime
=
System
.
currentTimeMillis
();
private
Long
responseTime
=
0L
;
private
int
failedNum
=
0
;
private
int
successNum
=
0
;
private
int
borrowNum
=
0
;
private
List
<
Integer
>
failedErrorType
=
new
ArrayList
<
Integer
>();
public
Proxy
(
HttpHost
httpHost
,
String
user
,
String
password
)
{
this
.
httpHost
=
httpHost
;
this
.
user
=
user
;
this
.
password
=
password
;
this
.
canReuseTime
=
System
.
nanoTime
()
+
TimeUnit
.
NANOSECONDS
.
convert
(
reuseTimeInterval
,
TimeUnit
.
MILLISECONDS
);
}
public
Proxy
(
HttpHost
httpHost
,
int
reuseInterval
,
String
user
,
String
password
)
{
this
.
httpHost
=
httpHost
;
public
Proxy
(
ProxyHost
proxyHost
,
String
user
,
String
password
)
{
this
.
proxyHost
=
proxyHost
;
this
.
user
=
user
;
this
.
password
=
password
;
this
.
canReuseTime
=
System
.
nanoTime
()
+
TimeUnit
.
NANOSECONDS
.
convert
(
reuseInterval
,
TimeUnit
.
MILLISECONDS
);
}
public
int
getSuccessNum
()
{
return
successNum
;
}
public
void
successNumIncrement
(
int
increment
)
{
this
.
successNum
+=
increment
;
}
public
Long
getLastUseTime
()
{
return
lastBorrowTime
;
}
public
void
setLastBorrowTime
(
Long
lastBorrowTime
)
{
this
.
lastBorrowTime
=
lastBorrowTime
;
}
public
void
recordResponse
()
{
this
.
responseTime
=
(
System
.
currentTimeMillis
()
-
lastBorrowTime
+
responseTime
)
/
2
;
this
.
lastBorrowTime
=
System
.
currentTimeMillis
();
}
public
List
<
Integer
>
getFailedErrorType
()
{
return
failedErrorType
;
}
public
void
setFailedErrorType
(
List
<
Integer
>
failedErrorType
)
{
this
.
failedErrorType
=
failedErrorType
;
}
public
void
fail
(
int
failedErrorType
)
{
this
.
failedNum
++;
this
.
failedErrorType
.
add
(
failedErrorType
);
}
public
void
setFailedNum
(
int
failedNum
)
{
this
.
failedNum
=
failedNum
;
}
public
int
getFailedNum
()
{
return
failedNum
;
}
public
String
getFailedType
()
{
String
re
=
""
;
for
(
Integer
i
:
this
.
failedErrorType
)
{
re
+=
i
+
" . "
;
}
return
re
;
}
public
HttpHost
getHttpHost
()
{
return
httpHost
;
}
public
int
getReuseTimeInterval
(
)
{
return
reuseTimeInterval
;
public
Proxy
(
ProxyHost
proxyHost
)
{
this
.
proxyHost
=
proxyHost
;
}
public
void
setReuseTimeInterval
(
int
reuseTimeInterval
)
{
this
.
reuseTimeInterval
=
reuseTimeInterval
;
this
.
canReuseTime
=
System
.
nanoTime
()
+
TimeUnit
.
NANOSECONDS
.
convert
(
reuseTimeInterval
,
TimeUnit
.
MILLISECONDS
);
public
ProxyHost
getProxyHost
()
{
return
proxyHost
;
}
@Override
public
long
getDelay
(
TimeUnit
unit
)
{
return
unit
.
convert
(
canReuseTime
-
System
.
nanoTime
(),
TimeUnit
.
NANOSECONDS
);
public
void
setProxyHost
(
ProxyHost
proxyHost
)
{
this
.
proxyHost
=
proxyHost
;
}
@Override
public
int
compareTo
(
Delayed
o
)
{
Proxy
that
=
(
Proxy
)
o
;
return
canReuseTime
>
that
.
canReuseTime
?
1
:
(
canReuseTime
<
that
.
canReuseTime
?
-
1
:
0
);
public
String
getUser
()
{
return
user
;
}
@Override
public
String
toString
()
{
String
re
=
String
.
format
(
"host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d"
,
httpHost
.
getAddress
().
getHostAddress
(),
responseTime
,
successNum
*
100.0
/
borrowNum
,
borrowNum
);
return
re
;
public
void
setUser
(
String
user
)
{
this
.
user
=
user
;
}
public
String
getUser
()
{
return
user
;
}
public
String
getPassword
()
{
public
String
getPassword
()
{
return
password
;
}
public
void
borrowNumIncrement
(
int
increment
)
{
this
.
borrowNum
+=
increment
;
public
void
setPassword
(
String
password
)
{
this
.
password
=
password
;
}
public
int
getBorrowNum
()
{
return
borrowNum
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java
0 → 100644
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
proxy
;
/**
 * Mutable holder for the host name and port of a proxy endpoint.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 12:04 PM
 */
public class ProxyHost {

    /** Host name or address of the proxy. */
    private String host;

    /** Port of the proxy. */
    private int port;

    /**
     * Creates a proxy endpoint.
     *
     * @param host host name or address of the proxy
     * @param port port of the proxy
     */
    public ProxyHost(String host, int port) {
        this.port = port;
        this.host = host;
    }

    /**
     * @return the host name or address of the proxy
     */
    public String getHost() {
        return this.host;
    }

    /**
     * @param host the new host name or address of the proxy
     */
    public void setHost(String host) {
        this.host = host;
    }

    /**
     * @return the port of the proxy
     */
    public int getPort() {
        return this.port;
    }

    /**
     * @param port the new port of the proxy
     */
    public void setPort(int port) {
        this.port = port;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
View file @
a7f9e7ca
...
...
@@ -6,7 +6,10 @@ import org.apache.http.HttpHost;
* Created by edwardsbean on 15-2-28.
*/
public
interface
ProxyPool
{
public
void
returnProxy
(
HttpHost
host
,
int
statusCode
);
public
Proxy
getProxy
();
public
boolean
isEnable
();
void
returnProxy
(
HttpHost
host
,
int
statusCode
);
Proxy
getProxy
();
boolean
isEnable
();
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
0 → 100644
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
proxy
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.Delayed
;
import
java.util.concurrent.TimeUnit
;
/**
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--+---+
|
v
+--------+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Response Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
+--------+
*/
/**
 * A {@link Proxy} that carries usage statistics and a "can be reused at" deadline, so it
 * can be ordered by that deadline through the {@link Delayed} interface.
 *
 * <p>The deadline is expressed on the {@link System#nanoTime()} clock; borrow/response
 * timestamps are expressed on the {@link System#currentTimeMillis()} clock.
 *
 * <p>NOTE(review): this class is {@link Serializable} but its superclass appears not to be
 * and to have no no-arg constructor, which Java serialization requires for
 * deserialization to work — confirm against {@code Proxy}.
 *
 * @author yxssfxwzy@sina.com <br>
 * @since 0.5.1
 * @see TimerReuseProxyPool
 */
public class TimerReuseProxy extends Proxy implements Delayed, Serializable {

    private static final long serialVersionUID = 228939737383625551L;

    public static final int ERROR_403 = 403;
    public static final int ERROR_404 = 404;
    public static final int ERROR_BANNED = 10000; // banned by website
    public static final int ERROR_Proxy = 10001;  // the proxy itself failed
    public static final int SUCCESS = 200;

    /** Minimum pause between two uses of this proxy, in milliseconds. */
    private int reuseTimeInterval = 1500; // ms

    /** Earliest {@link System#nanoTime()} value at which this proxy may be reused. */
    private Long canReuseTime = 0L;

    /** {@link System#currentTimeMillis()} of the last borrow (or last recorded response). */
    private Long lastBorrowTime = System.currentTimeMillis();

    /** Running average of the response time, in milliseconds. */
    private Long responseTime = 0L;

    private int failedNum = 0;
    private int successNum = 0;
    private int borrowNum = 0;

    /** Error codes passed to {@link #fail(int)}, in call order. */
    private List<Integer> failedErrorType = new ArrayList<Integer>();

    /**
     * Creates a proxy with the default reuse interval (1500 ms).
     *
     * @param proxyHost host/port of the proxy
     * @param user      proxy user name; may be {@code null}
     * @param password  proxy password; may be {@code null}
     */
    public TimerReuseProxy(ProxyHost proxyHost, String user, String password) {
        super(proxyHost, user, password);
        // Start the reuse clock at construction time; otherwise the deadline stays at 0,
        // which is meaningless on the arbitrary-origin nanoTime clock.
        this.canReuseTime = deadlineFromNow(this.reuseTimeInterval);
    }

    /**
     * Creates a proxy with an explicit reuse interval.
     *
     * @param proxyHost         host/port of the proxy
     * @param user              proxy user name; may be {@code null}
     * @param password          proxy password; may be {@code null}
     * @param reuseTimeInterval minimum pause between two uses, in milliseconds
     */
    public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) {
        super(proxyHost, user, password);
        this.reuseTimeInterval = reuseTimeInterval;
        this.canReuseTime = deadlineFromNow(reuseTimeInterval);
    }

    /** Returns the nanoTime deadline that lies {@code intervalMillis} milliseconds from now. */
    private static long deadlineFromNow(int intervalMillis) {
        return System.nanoTime() + TimeUnit.NANOSECONDS.convert(intervalMillis, TimeUnit.MILLISECONDS);
    }

    public int getSuccessNum() {
        return successNum;
    }

    public void successNumIncrement(int increment) {
        this.successNum += increment;
    }

    /**
     * @return the timestamp stored by {@link #setLastBorrowTime(Long)} or
     *         {@link #recordResponse()}, in epoch milliseconds
     */
    public Long getLastUseTime() {
        return lastBorrowTime;
    }

    public void setLastBorrowTime(Long lastBorrowTime) {
        this.lastBorrowTime = lastBorrowTime;
    }

    /**
     * Folds the time elapsed since the last borrow into the running response-time
     * average (equal weight for old average and new sample) and resets the borrow timestamp.
     */
    public void recordResponse() {
        this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
        this.lastBorrowTime = System.currentTimeMillis();
    }

    public List<Integer> getFailedErrorType() {
        return failedErrorType;
    }

    public void setFailedErrorType(List<Integer> failedErrorType) {
        this.failedErrorType = failedErrorType;
    }

    /**
     * Records one failure.
     *
     * @param failedErrorType error code of the failure, e.g. {@link #ERROR_403}
     */
    public void fail(int failedErrorType) {
        this.failedNum++;
        this.failedErrorType.add(failedErrorType);
    }

    public void setFailedNum(int failedNum) {
        this.failedNum = failedNum;
    }

    public int getFailedNum() {
        return failedNum;
    }

    /**
     * @return all recorded error codes, each followed by {@code " . "}; empty string if none
     */
    public String getFailedType() {
        StringBuilder re = new StringBuilder();
        for (Integer i : this.failedErrorType) {
            re.append(i).append(" . ");
        }
        return re.toString();
    }

    public int getReuseTimeInterval() {
        return reuseTimeInterval;
    }

    /**
     * Sets the reuse interval and restarts the reuse clock from now.
     *
     * @param reuseTimeInterval minimum pause between two uses, in milliseconds
     */
    public void setReuseTimeInterval(int reuseTimeInterval) {
        this.reuseTimeInterval = reuseTimeInterval;
        this.canReuseTime = deadlineFromNow(reuseTimeInterval);
    }

    /**
     * @param unit unit of the returned delay
     * @return time remaining until this proxy may be reused; zero or negative once it may
     */
    @Override
    public long getDelay(TimeUnit unit) {
        return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
    }

    /**
     * Orders by reuse deadline, earliest first. Other {@link Delayed} implementations are
     * compared by their remaining delay instead of triggering a {@link ClassCastException}.
     */
    @Override
    public int compareTo(Delayed o) {
        if (o == this) {
            return 0;
        }
        if (o instanceof TimerReuseProxy) {
            long mine = canReuseTime;
            long theirs = ((TimerReuseProxy) o).canReuseTime;
            return mine > theirs ? 1 : (mine < theirs ? -1 : 0);
        }
        long diff = getDelay(TimeUnit.NANOSECONDS) - o.getDelay(TimeUnit.NANOSECONDS);
        return diff > 0 ? 1 : (diff < 0 ? -1 : 0);
    }

    public void borrowNumIncrement(int increment) {
        this.borrowNum += increment;
    }

    public int getBorrowNum() {
        return borrowNum;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/
Simpl
eProxyPool.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/
TimerReus
eProxyPool.java
View file @
a7f9e7ca
...
...
@@ -22,12 +22,12 @@ import java.util.concurrent.DelayQueue;
* @see Proxy
* @since 0.5.1
*/
public
class
Simpl
eProxyPool
implements
ProxyPool
{
public
class
TimerReus
eProxyPool
implements
ProxyPool
{
private
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
private
BlockingQueue
<
Proxy
>
proxyQueue
=
new
DelayQueue
<
Proxy
>();
private
Map
<
String
,
Proxy
>
allProxy
=
new
ConcurrentHashMap
<
String
,
Proxy
>();
private
BlockingQueue
<
TimerReuseProxy
>
proxyQueue
=
new
DelayQueue
<
TimerReuse
Proxy
>();
private
Map
<
String
,
TimerReuseProxy
>
allProxy
=
new
ConcurrentHashMap
<
String
,
TimerReuse
Proxy
>();
private
int
reuseInterval
=
1500
;
// ms
private
int
reviveTime
=
2
*
60
*
60
*
1000
;
// ms
...
...
@@ -50,15 +50,15 @@ public class SimpleProxyPool implements ProxyPool {
}
};
public
Simpl
eProxyPool
()
{
public
TimerReus
eProxyPool
()
{
this
(
null
,
true
);
}
public
Simpl
eProxyPool
(
List
<
String
[]>
httpProxyList
)
{
public
TimerReus
eProxyPool
(
List
<
String
[]>
httpProxyList
)
{
this
(
httpProxyList
,
true
);
}
public
Simpl
eProxyPool
(
List
<
String
[]>
httpProxyList
,
boolean
isUseLastProxy
)
{
public
TimerReus
eProxyPool
(
List
<
String
[]>
httpProxyList
,
boolean
isUseLastProxy
)
{
if
(
httpProxyList
!=
null
)
{
addProxy
(
httpProxyList
.
toArray
(
new
String
[
httpProxyList
.
size
()][]));
}
...
...
@@ -109,9 +109,9 @@ public class SimpleProxyPool implements ProxyPool {
}
private
Map
<
String
,
Proxy
>
prepareForSaving
()
{
Map
<
String
,
Proxy
>
tmp
=
new
HashMap
<
String
,
Proxy
>();
for
(
Entry
<
String
,
Proxy
>
e
:
allProxy
.
entrySet
())
{
Proxy
p
=
e
.
getValue
();
Map
<
String
,
TimerReuseProxy
>
tmp
=
new
HashMap
<
String
,
TimerReuse
Proxy
>();
for
(
Entry
<
String
,
TimerReuse
Proxy
>
e
:
allProxy
.
entrySet
())
{
TimerReuse
Proxy
p
=
e
.
getValue
();
p
.
setFailedNum
(
0
);
tmp
.
put
(
e
.
getKey
(),
p
);
}
...
...
@@ -152,30 +152,20 @@ public class SimpleProxyPool implements ProxyPool {
logger
.
info
(
"proxy pool size>>>>"
+
allProxy
.
size
());
}
public
void
addProxy
(
String
[]
...
httpProxyList
)
{
public
void
addProxy
(
Proxy
...
httpProxyList
)
{
isEnable
=
true
;
for
(
String
[]
s
:
httpProxyList
)
{
try
{
if
(
allProxy
.
containsKey
(
s
[
2
]))
{
continue
;
}
HttpHost
item
=
new
HttpHost
(
InetAddress
.
getByName
(
s
[
2
]),
Integer
.
valueOf
(
s
[
3
]));
if
(!
validateWhenInit
||
ProxyUtils
.
validateProxy
(
item
))
{
Proxy
p
=
new
Proxy
(
item
,
reuseInterval
,
s
[
0
],
s
[
1
]);
for
(
Proxy
proxy
:
httpProxyList
)
{
if
(!
validateWhenInit
||
ProxyUtils
.
validateProxy
(
proxy
.
getProxyHost
()))
{
TimerReuseProxy
p
=
new
TimerReuseProxy
(
proxy
.
getProxyHost
(),
proxy
.
getUser
(),
proxy
.
getPassword
(),
reuseInterval
);
proxyQueue
.
add
(
p
);
allProxy
.
put
(
s
[
2
],
p
);
}
}
catch
(
NumberFormatException
e
)
{
logger
.
error
(
"HttpHost init error:"
,
e
);
}
catch
(
UnknownHostException
e
)
{
logger
.
error
(
"HttpHost init error:"
,
e
);
allProxy
.
put
(
p
.
getProxyHost
().
getHost
(),
p
);
}
}
logger
.
info
(
"proxy pool size>>>>"
+
allProxy
.
size
());
}
public
Proxy
getProxy
()
{
Proxy
proxy
=
null
;
public
TimerReuse
Proxy
getProxy
()
{
TimerReuse
Proxy
proxy
=
null
;
try
{
Long
time
=
System
.
currentTimeMillis
();
proxy
=
proxyQueue
.
take
();
...
...
@@ -183,7 +173,7 @@ public class SimpleProxyPool implements ProxyPool {
if
(
costTime
>
reuseInterval
)
{
logger
.
info
(
"get proxy time >>>> "
+
costTime
);
}
Proxy
p
=
allProxy
.
get
(
proxy
.
getHttpHost
().
getAddress
().
getHostAddress
());
TimerReuseProxy
p
=
allProxy
.
get
(
proxy
.
getProxyHost
().
getHost
());
p
.
setLastBorrowTime
(
System
.
currentTimeMillis
());
p
.
borrowNumIncrement
(
1
);
}
catch
(
InterruptedException
e
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
View file @
a7f9e7ca
package
us
.
codecraft
.
webmagic
.
utils
;
import
java.io.IOException
;
import
java.net.Inet6Address
;
import
java.net.InetAddress
;
import
java.net.InetSocketAddress
;
import
java.net.NetworkInterface
;
import
java.net.Socket
;
import
java.net.SocketException
;
import
java.net.UnknownHostException
;
import
java.util.Enumeration
;
import
java.util.regex.Pattern
;
import
org.apache.http.HttpHost
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.proxy.ProxyHost
;
import
java.io.IOException
;
import
java.net.*
;
import
java.util.Enumeration
;
import
java.util.regex.Pattern
;
/**
* Pooled Proxy Object
...
...
@@ -69,7 +64,11 @@ public class ProxyUtils {
}
}
public
static
boolean
validateProxy
(
HttpHost
p
)
{
public
static
HttpHost
convert
(
ProxyHost
p
){
return
new
HttpHost
(
p
.
getHost
(),
p
.
getPort
());
}
public
static
boolean
validateProxy
(
ProxyHost
p
)
{
if
(
localAddr
==
null
)
{
logger
.
error
(
"cannot get local IP"
);
return
false
;
...
...
@@ -79,7 +78,7 @@ public class ProxyUtils {
try
{
socket
=
new
Socket
();
socket
.
bind
(
new
InetSocketAddress
(
localAddr
,
0
));
InetSocketAddress
endpointSocketAddr
=
new
InetSocketAddress
(
p
.
get
Address
().
getHostAddress
(),
p
.
getPort
());
InetSocketAddress
endpointSocketAddr
=
new
InetSocketAddress
(
p
.
get
Host
(),
p
.
getPort
());
socket
.
connect
(
endpointSocketAddr
,
3000
);
logger
.
debug
(
"SUCCESS - connection established! Local: "
+
localAddr
.
getHostAddress
()
+
" remote: "
+
p
);
isReachable
=
true
;
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
View file @
a7f9e7ca
...
...
@@ -29,7 +29,7 @@ public class ProxyTest {
@Test
public
void
testProxy
()
{
SimpleProxyPool
proxyPool
=
new
Simpl
eProxyPool
(
httpProxyList
,
false
);
TimerReuseProxyPool
proxyPool
=
new
TimerReus
eProxyPool
(
httpProxyList
,
false
);
proxyPool
.
setReuseInterval
(
500
);
assertThat
(
proxyPool
.
getIdleNum
()).
isEqualTo
(
4
);
for
(
int
i
=
0
;
i
<
2
;
i
++)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment