Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
1d86f7c0
Commit
1d86f7c0
authored
Mar 20, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
compile passed in httpclientDownloader
parent
b71f3795
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
30 additions
and
53 deletions
+30
-53
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+0
-1
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+0
-2
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+13
-28
HttpUriRequestConverter.java
...odecraft/webmagic/downloader/HttpUriRequestConverter.java
+1
-2
Proxy.java
...core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+7
-7
ProxyPool.java
.../src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+1
-3
TimerReuseProxyPool.java
...java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
+8
-10
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
1d86f7c0
...
...
@@ -18,7 +18,6 @@ public class Request implements Serializable {
private
static
final
long
serialVersionUID
=
2062192774891352043L
;
public
static
final
String
CYCLE_TRIED_TIMES
=
"_cycle_tried_times"
;
public
static
final
String
STATUS_CODE
=
"statusCode"
;
public
static
final
String
PROXY
=
"proxy"
;
private
String
url
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
1d86f7c0
...
...
@@ -419,8 +419,6 @@ public class Spider implements Runnable, Task {
pipeline
.
process
(
page
.
getResultItems
(),
this
);
}
}
//for proxy status management
request
.
putExtra
(
Request
.
STATUS_CODE
,
page
.
getStatusCode
());
sleep
(
site
.
getSleepTime
());
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
1d86f7c0
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.auth.AuthState
;
...
...
@@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.CharsetUtils
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Set
;
/**
...
...
@@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpResponse
httpResponse
=
null
;
int
statusCode
=
0
;
Site
site
=
task
.
getSite
();
try
{
Proxy
proxy
=
null
;
if
(
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
proxy
=
site
.
getHttpProxyFromPool
();
}
else
if
(
site
!=
null
&&
site
.
getHttpProxy
()
!=
null
){
proxy
=
site
.
getHttpProxy
();
request
.
putExtra
(
Request
.
PROXY
,
site
.
getHttpProxy
());
}
Proxy
proxy
=
null
;
HttpContext
httpContext
=
new
BasicHttpContext
();
if
(
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
proxy
=
site
.
getHttpProxyFromPool
();
request
.
putExtra
(
Request
.
PROXY
,
proxy
);
HttpContext
httpContext
=
new
BasicHttpContext
();
HttpUriRequest
httpUriRequest
=
httpUriRequestConverter
.
convert
(
request
,
site
);
AuthState
authState
=
new
AuthState
();
authState
.
update
(
new
BasicScheme
(),
new
UsernamePasswordCredentials
(
"userName"
,
"password"
));
authState
.
update
(
new
BasicScheme
(),
new
UsernamePasswordCredentials
(
proxy
.
getUsername
(),
proxy
.
getPassword
()
));
httpContext
.
setAttribute
(
HttpClientContext
.
PROXY_AUTH_STATE
,
authState
);
CloseableHttpClient
httpClient
=
getHttpClient
(
site
,
proxy
);
}
HttpUriRequest
httpUriRequest
=
httpUriRequestConverter
.
convert
(
request
,
site
);
CloseableHttpClient
httpClient
=
getHttpClient
(
site
);
try
{
httpResponse
=
httpClient
.
execute
(
httpUriRequest
,
httpContext
);
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
if
(
site
.
getAcceptStatCode
().
contains
(
statusCode
))
{
Page
page
=
handleResponse
(
request
,
site
.
getCharset
(),
httpResponse
,
task
);
onSuccess
(
request
);
return
page
;
}
else
{
...
...
@@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader {
//ensure the connection is released back to pool
EntityUtils
.
consumeQuietly
(
httpResponse
.
getEntity
());
}
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
site
!=
null
&&
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
site
.
returnHttpProxyToPool
((
HttpHost
)
request
.
getExtra
(
Request
.
PROXY
),
(
Integer
)
request
.
getExtra
(
Request
.
STATUS_CODE
));
if
(
proxy
!=
null
)
{
site
.
getHttpProxyPool
().
returnProxy
(
proxy
,
statusCode
);
}
}
}
...
...
@@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader {
httpClientGenerator
.
setPoolSize
(
thread
);
}
protected
boolean
statusAccept
(
Set
<
Integer
>
acceptStatCode
,
int
statusCode
)
{
return
acceptStatCode
.
contains
(
statusCode
);
}
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
String
content
=
getContent
(
charset
,
httpResponse
);
Page
page
=
new
Page
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
View file @
1d86f7c0
...
...
@@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder;
import
org.apache.http.message.BasicNameValuePair
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
java.nio.charset.Charset
;
...
...
@@ -26,7 +25,7 @@ import java.util.Map;
*/
public
class
HttpUriRequestConverter
{
public
HttpUriRequest
convert
(
Request
request
,
Site
site
,
Proxy
proxy
)
{
public
HttpUriRequest
convert
(
Request
request
,
Site
site
)
{
return
null
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
View file @
1d86f7c0
...
...
@@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy;
public
class
Proxy
{
private
ProxyHost
proxyHost
;
private
String
user
;
private
String
user
name
;
private
String
password
;
public
Proxy
(
ProxyHost
proxyHost
,
String
user
,
String
password
)
{
public
Proxy
(
ProxyHost
proxyHost
,
String
user
name
,
String
password
)
{
this
.
proxyHost
=
proxyHost
;
this
.
user
=
user
;
this
.
user
name
=
username
;
this
.
password
=
password
;
}
...
...
@@ -28,12 +28,12 @@ public class Proxy {
this
.
proxyHost
=
proxyHost
;
}
public
String
getUser
()
{
return
user
;
public
String
getUser
name
()
{
return
user
name
;
}
public
void
setUser
(
String
user
)
{
this
.
user
=
user
;
public
void
setUser
name
(
String
username
)
{
this
.
user
name
=
username
;
}
public
String
getPassword
()
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
View file @
1d86f7c0
package
us
.
codecraft
.
webmagic
.
proxy
;
import
org.apache.http.HttpHost
;
/**
* Created by edwardsbean on 15-2-28.
*/
public
interface
ProxyPool
{
void
returnProxy
(
HttpHost
host
,
int
statusCode
);
void
returnProxy
(
Proxy
proxy
,
int
statusCode
);
Proxy
getProxy
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
View file @
1d86f7c0
...
...
@@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase;
import
us.codecraft.webmagic.utils.ProxyUtils
;
import
java.io.*
;
import
java.net.InetAddress
;
import
java.net.UnknownHostException
;
import
java.util.*
;
import
java.util.Map.Entry
;
import
java.util.concurrent.BlockingQueue
;
...
...
@@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool {
isEnable
=
true
;
for
(
Proxy
proxy
:
httpProxyList
)
{
if
(!
validateWhenInit
||
ProxyUtils
.
validateProxy
(
proxy
.
getProxyHost
()))
{
TimerReuseProxy
p
=
new
TimerReuseProxy
(
proxy
.
getProxyHost
(),
proxy
.
getUser
(),
proxy
.
getPassword
(),
reuseInterval
);
TimerReuseProxy
p
=
new
TimerReuseProxy
(
proxy
.
getProxyHost
(),
proxy
.
getUser
name
(),
proxy
.
getPassword
(),
reuseInterval
);
proxyQueue
.
add
(
p
);
allProxy
.
put
(
p
.
getProxyHost
().
getHost
(),
p
);
}
...
...
@@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool {
return
proxy
;
}
public
void
returnProxy
(
HttpHost
host
,
int
statusCode
)
{
TimerReuseProxy
p
=
allProxy
.
get
(
host
.
getAddress
().
getHostAddress
());
public
void
returnProxy
(
Proxy
proxy
,
int
statusCode
)
{
TimerReuseProxy
p
=
allProxy
.
get
(
proxy
.
getProxyHost
());
if
(
p
==
null
)
{
return
;
}
...
...
@@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool {
// banned,try longer interval
p
.
fail
(
TimerReuseProxy
.
ERROR_403
);
p
.
setReuseTimeInterval
(
reuseInterval
*
p
.
getFailedNum
());
logger
.
info
(
host
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
logger
.
info
(
proxy
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
break
;
case
TimerReuseProxy
.
ERROR_BANNED
:
p
.
fail
(
TimerReuseProxy
.
ERROR_BANNED
);
p
.
setReuseTimeInterval
(
10
*
60
*
1000
*
p
.
getFailedNum
());
logger
.
warn
(
"this proxy is banned >>>> "
+
p
.
getHttpHost
());
logger
.
info
(
host
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
logger
.
info
(
proxy
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
break
;
case
TimerReuseProxy
.
ERROR_404
:
// p.fail(Proxy.ERROR_404);
...
...
@@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool {
}
if
(
p
.
getFailedNum
()
>
20
)
{
p
.
setReuseTimeInterval
(
reviveTime
);
logger
.
error
(
"remove proxy >>>> "
+
host
+
">>>>"
+
p
.
getFailedType
()
+
" >>>> remain proxy >>>> "
+
proxyQueue
.
size
());
logger
.
error
(
"remove proxy >>>> "
+
proxy
+
">>>>"
+
p
.
getFailedType
()
+
" >>>> remain proxy >>>> "
+
proxyQueue
.
size
());
return
;
}
if
(
p
.
getFailedNum
()
>
0
&&
p
.
getFailedNum
()
%
5
==
0
)
{
if
(!
ProxyUtils
.
validateProxy
(
host
))
{
if
(!
ProxyUtils
.
validateProxy
(
proxy
))
{
p
.
setReuseTimeInterval
(
reviveTime
);
logger
.
error
(
"remove proxy >>>> "
+
host
+
">>>>"
+
p
.
getFailedType
()
+
" >>>> remain proxy >>>> "
+
proxyQueue
.
size
());
logger
.
error
(
"remove proxy >>>> "
+
proxy
+
">>>>"
+
p
.
getFailedType
()
+
" >>>> remain proxy >>>> "
+
proxyQueue
.
size
());
return
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment