Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
a8c90053
Commit
a8c90053
authored
May 08, 2016
by
yihua.huang
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'oxf1-master'
parents
ca072c55
83c27ebb
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
82 additions
and
35 deletions
+82
-35
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+7
-1
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+19
-14
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+21
-5
Proxy.java
...core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+22
-4
ProxyPool.java
.../src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+6
-6
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+2
-2
ProxyTest.java
.../src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+5
-3
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
a8c90053
...
...
@@ -4,6 +4,7 @@ import com.google.common.collect.HashBasedTable;
import
com.google.common.collect.Table
;
import
org.apache.http.HttpHost
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.ProxyPool
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
...
@@ -474,6 +475,11 @@ public class Site {
return
this
;
}
public
Site
setHttpProxyPool
(
List
<
String
[]>
httpProxyList
,
boolean
isUseLastProxy
)
{
this
.
httpProxyPool
=
new
ProxyPool
(
httpProxyList
,
isUseLastProxy
);
return
this
;
}
public
Site
enableHttpProxyPool
()
{
this
.
httpProxyPool
=
new
ProxyPool
();
return
this
;
...
...
@@ -483,7 +489,7 @@ public class Site {
return
httpProxyPool
;
}
public
HttpHost
getHttpProxyFromPool
()
{
public
Proxy
getHttpProxyFromPool
()
{
return
httpProxyPool
.
getProxy
();
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
a8c90053
...
...
@@ -24,6 +24,7 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
...
@@ -50,9 +51,9 @@ public class HttpClientDownloader extends AbstractDownloader {
private
HttpClientGenerator
httpClientGenerator
=
new
HttpClientGenerator
();
private
CloseableHttpClient
getHttpClient
(
Site
site
)
{
private
CloseableHttpClient
getHttpClient
(
Site
site
,
Proxy
proxy
)
{
if
(
site
==
null
)
{
return
httpClientGenerator
.
getClient
(
null
);
return
httpClientGenerator
.
getClient
(
null
,
proxy
);
}
String
domain
=
site
.
getDomain
();
CloseableHttpClient
httpClient
=
httpClients
.
get
(
domain
);
...
...
@@ -60,7 +61,7 @@ public class HttpClientDownloader extends AbstractDownloader {
synchronized
(
this
)
{
httpClient
=
httpClients
.
get
(
domain
);
if
(
httpClient
==
null
)
{
httpClient
=
httpClientGenerator
.
getClient
(
site
);
httpClient
=
httpClientGenerator
.
getClient
(
site
,
proxy
);
httpClients
.
put
(
domain
,
httpClient
);
}
}
...
...
@@ -88,8 +89,17 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpResponse
httpResponse
=
null
;
int
statusCode
=
0
;
try
{
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
);
httpResponse
=
getHttpClient
(
site
).
execute
(
httpUriRequest
);
HttpHost
proxyHost
=
null
;
Proxy
proxy
=
null
;
//TODO
if
(
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
proxy
=
site
.
getHttpProxyFromPool
();
proxyHost
=
proxy
.
getHttpHost
();
}
else
if
(
site
.
getHttpProxy
()!=
null
){
proxyHost
=
site
.
getHttpProxy
();
}
HttpUriRequest
httpUriRequest
=
getHttpUriRequest
(
request
,
site
,
headers
,
proxyHost
);
// the proxy is set on the request here
httpResponse
=
getHttpClient
(
site
,
proxy
).
execute
(
httpUriRequest
);
// getHttpClient sets up the proxy authentication
statusCode
=
httpResponse
.
getStatusLine
().
getStatusCode
();
request
.
putExtra
(
Request
.
STATUS_CODE
,
statusCode
);
if
(
statusAccept
(
acceptStatCode
,
statusCode
))
{
...
...
@@ -129,7 +139,7 @@ public class HttpClientDownloader extends AbstractDownloader {
return
acceptStatCode
.
contains
(
statusCode
);
}
protected
HttpUriRequest
getHttpUriRequest
(
Request
request
,
Site
site
,
Map
<
String
,
String
>
headers
)
{
protected
HttpUriRequest
getHttpUriRequest
(
Request
request
,
Site
site
,
Map
<
String
,
String
>
headers
,
HttpHost
proxy
)
{
RequestBuilder
requestBuilder
=
selectRequestMethod
(
request
).
setUri
(
request
.
getUrl
());
if
(
headers
!=
null
)
{
for
(
Map
.
Entry
<
String
,
String
>
headerEntry
:
headers
.
entrySet
())
{
...
...
@@ -141,14 +151,9 @@ public class HttpClientDownloader extends AbstractDownloader {
.
setSocketTimeout
(
site
.
getTimeOut
())
.
setConnectTimeout
(
site
.
getTimeOut
())
.
setCookieSpec
(
CookieSpecs
.
BEST_MATCH
);
if
(
site
.
getHttpProxyPool
()
!=
null
&&
site
.
getHttpProxyPool
().
isEnable
())
{
HttpHost
host
=
site
.
getHttpProxyFromPool
();
requestConfigBuilder
.
setProxy
(
host
);
request
.
putExtra
(
Request
.
PROXY
,
host
);
}
else
if
(
site
.
getHttpProxy
()!=
null
){
HttpHost
host
=
site
.
getHttpProxy
();
requestConfigBuilder
.
setProxy
(
host
);
request
.
putExtra
(
Request
.
PROXY
,
host
);
if
(
proxy
!=
null
)
{
requestConfigBuilder
.
setProxy
(
proxy
);
request
.
putExtra
(
Request
.
PROXY
,
proxy
);
}
requestBuilder
.
setConfig
(
requestConfigBuilder
.
build
());
return
requestBuilder
.
build
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
a8c90053
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpException
;
import
org.apache.http.HttpRequest
;
import
org.apache.http.HttpRequestInterceptor
;
import
org.apache.http.auth.AuthScope
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.CredentialsProvider
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.config.SocketConfig
;
...
...
@@ -15,6 +19,7 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.protocol.HttpContext
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
java.io.IOException
;
import
java.util.Map
;
...
...
@@ -41,12 +46,24 @@ public class HttpClientGenerator {
return
this
;
}
public
CloseableHttpClient
getClient
(
Site
site
)
{
return
generateClient
(
site
);
public
CloseableHttpClient
getClient
(
Site
site
,
Proxy
proxy
)
{
return
generateClient
(
site
,
proxy
);
}
private
CloseableHttpClient
generateClient
(
Site
site
)
{
HttpClientBuilder
httpClientBuilder
=
HttpClients
.
custom
().
setConnectionManager
(
connectionManager
);
private
CloseableHttpClient
generateClient
(
Site
site
,
Proxy
proxy
)
{
CredentialsProvider
credsProvider
=
null
;
HttpClientBuilder
httpClientBuilder
=
HttpClients
.
custom
();
if
(
proxy
!=
null
&&
StringUtils
.
isNotBlank
(
proxy
.
getUser
())
&&
StringUtils
.
isNotBlank
(
proxy
.
getPassword
()))
{
credsProvider
=
new
BasicCredentialsProvider
();
credsProvider
.
setCredentials
(
new
AuthScope
(
proxy
.
getHttpHost
().
getAddress
().
getHostAddress
(),
proxy
.
getHttpHost
().
getPort
()),
new
UsernamePasswordCredentials
(
proxy
.
getUser
(),
proxy
.
getPassword
()));
httpClientBuilder
.
setDefaultCredentialsProvider
(
credsProvider
);
}
httpClientBuilder
.
setConnectionManager
(
connectionManager
);
if
(
site
!=
null
&&
site
.
getUserAgent
()
!=
null
)
{
httpClientBuilder
.
setUserAgent
(
site
.
getUserAgent
());
}
else
{
...
...
@@ -61,7 +78,6 @@ public class HttpClientGenerator {
if
(!
request
.
containsHeader
(
"Accept-Encoding"
))
{
request
.
addHeader
(
"Accept-Encoding"
,
"gzip"
);
}
}
});
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
View file @
a8c90053
package
us
.
codecraft
.
webmagic
.
proxy
;
import
org.apache.http.HttpHost
;
import
java.io.Serializable
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.Delayed
;
import
java.util.concurrent.TimeUnit
;
import
org.apache.http.HttpHost
;
/**
* >>>> Proxy lifecycle
...
...
@@ -64,6 +64,9 @@ public class Proxy implements Delayed, Serializable {
public
static
final
int
SUCCESS
=
200
;
private
final
HttpHost
httpHost
;
private
String
user
;
private
String
password
;
private
int
reuseTimeInterval
=
1500
;
// ms
private
Long
canReuseTime
=
0L
;
...
...
@@ -76,13 +79,17 @@ public class Proxy implements Delayed, Serializable {
private
List
<
Integer
>
failedErrorType
=
new
ArrayList
<
Integer
>();
Proxy
(
HttpHost
httpHost
)
{
Proxy
(
HttpHost
httpHost
,
String
user
,
String
password
)
{
this
.
httpHost
=
httpHost
;
this
.
user
=
user
;
this
.
password
=
password
;
this
.
canReuseTime
=
System
.
nanoTime
()
+
TimeUnit
.
NANOSECONDS
.
convert
(
reuseTimeInterval
,
TimeUnit
.
MILLISECONDS
);
}
Proxy
(
HttpHost
httpHost
,
int
reuseInterval
)
{
Proxy
(
HttpHost
httpHost
,
int
reuseInterval
,
String
user
,
String
password
)
{
this
.
httpHost
=
httpHost
;
this
.
user
=
user
;
this
.
password
=
password
;
this
.
canReuseTime
=
System
.
nanoTime
()
+
TimeUnit
.
NANOSECONDS
.
convert
(
reuseInterval
,
TimeUnit
.
MILLISECONDS
);
}
...
...
@@ -170,6 +177,17 @@ public class Proxy implements Delayed, Serializable {
return
re
;
}
public
String
getUser
()
{
return
user
;
}
public
String
getPassword
()
{
return
password
;
}
public
void
borrowNumIncrement
(
int
increment
)
{
this
.
borrowNum
+=
increment
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
View file @
a8c90053
...
...
@@ -156,14 +156,14 @@ public class ProxyPool {
isEnable
=
true
;
for
(
String
[]
s
:
httpProxyList
)
{
try
{
if
(
allProxy
.
containsKey
(
s
[
0
]))
{
if
(
allProxy
.
containsKey
(
s
[
2
]))
{
continue
;
}
HttpHost
item
=
new
HttpHost
(
InetAddress
.
getByName
(
s
[
0
]),
Integer
.
valueOf
(
s
[
1
]));
HttpHost
item
=
new
HttpHost
(
InetAddress
.
getByName
(
s
[
2
]),
Integer
.
valueOf
(
s
[
3
]));
if
(!
validateWhenInit
||
ProxyUtils
.
validateProxy
(
item
))
{
Proxy
p
=
new
Proxy
(
item
,
reuseInterval
);
Proxy
p
=
new
Proxy
(
item
,
reuseInterval
,
s
[
0
],
s
[
1
]
);
proxyQueue
.
add
(
p
);
allProxy
.
put
(
s
[
0
],
p
);
allProxy
.
put
(
s
[
2
],
p
);
}
}
catch
(
NumberFormatException
e
)
{
logger
.
error
(
"HttpHost init error:"
,
e
);
...
...
@@ -174,7 +174,7 @@ public class ProxyPool {
logger
.
info
(
"proxy pool size>>>>"
+
allProxy
.
size
());
}
public
HttpHost
getProxy
()
{
public
Proxy
getProxy
()
{
Proxy
proxy
=
null
;
try
{
Long
time
=
System
.
currentTimeMillis
();
...
...
@@ -192,7 +192,7 @@ public class ProxyPool {
if
(
proxy
==
null
)
{
throw
new
NoSuchElementException
();
}
return
proxy
.
getHttpHost
()
;
return
proxy
;
}
public
void
returnProxy
(
HttpHost
host
,
int
statusCode
)
{
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
View file @
a8c90053
...
...
@@ -90,12 +90,12 @@ public class HttpClientDownloaderTest {
private
String
getCharsetByUrl
(
String
url
)
{
HttpClientDownloader
downloader
=
new
HttpClientDownloader
();
Site
site
=
Site
.
me
();
CloseableHttpClient
httpClient
=
new
HttpClientGenerator
().
getClient
(
site
);
CloseableHttpClient
httpClient
=
new
HttpClientGenerator
().
getClient
(
site
,
null
);
// encoding in http header Content-Type
Request
requestGBK
=
new
Request
(
url
);
CloseableHttpResponse
httpResponse
=
null
;
try
{
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestGBK
,
site
,
null
));
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestGBK
,
site
,
null
,
null
));
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
View file @
a8c90053
...
...
@@ -22,9 +22,9 @@ public class ProxyTest {
public
static
void
before
()
{
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String
[]
source
=
{
"
0.0.0.1:0"
,
"0.0.0.2:0"
,
"0.0.0.3:0"
,
"
0.0.0.4:0"
};
String
[]
source
=
{
"
::0.0.0.1:0"
,
"::0.0.0.2:0"
,
"::0.0.0.3:0"
,
"::
0.0.0.4:0"
};
for
(
String
line
:
source
)
{
httpProxyList
.
add
(
new
String
[]
{
line
.
split
(
":"
)[
0
],
line
.
split
(
":"
)[
1
]
});
httpProxyList
.
add
(
new
String
[]
{
line
.
split
(
":"
)[
0
],
line
.
split
(
":"
)[
1
],
line
.
split
(
":"
)[
2
],
line
.
split
(
":"
)[
3
]
});
}
}
...
...
@@ -37,7 +37,8 @@ public class ProxyTest {
for
(
int
i
=
0
;
i
<
2
;
i
++)
{
List
<
Fetch
>
fetchList
=
new
ArrayList
<
Fetch
>();
while
(
proxyPool
.
getIdleNum
()
!=
0
)
{
HttpHost
httphost
=
proxyPool
.
getProxy
();
Proxy
proxy
=
proxyPool
.
getProxy
();
HttpHost
httphost
=
proxy
.
getHttpHost
();
// httphostList.add(httphost);
System
.
out
.
println
(
httphost
.
getHostName
()
+
":"
+
httphost
.
getPort
());
Fetch
tmp
=
new
Fetch
(
httphost
);
...
...
@@ -69,4 +70,5 @@ public class ProxyTest {
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment