Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
fab44d3d
Commit
fab44d3d
authored
Apr 20, 2018
by
shenjunlin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复一个空指针
parent
2b9ca61e
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
137 additions
and
11 deletions
+137
-11
pom.xml
pom.xml
+1
-1
pom.xml
webmagic-core/pom.xml
+1
-1
pom.xml
webmagic-extension/pom.xml
+1
-1
ProxyWebDriverFactory.java
.../codecraft/webmagic/downloader/ProxyWebDriverFactory.java
+11
-3
SeleniumDownloader.java
.../us/codecraft/webmagic/downloader/SeleniumDownloader.java
+3
-1
WebDriverPool.java
.../java/us/codecraft/webmagic/downloader/WebDriverPool.java
+0
-1
pom.xml
webmagic-samples/pom.xml
+1
-1
MailaProxyIpUtil.java
...st/java/us/codecraft/webmagic/proxy/MailaProxyIpUtil.java
+75
-0
MonitorCouponTest.java
...t/java/us/codecraft/webmagic/proxy/MonitorCouponTest.java
+42
-0
pom.xml
webmagic-saxon/pom.xml
+1
-1
pom.xml
webmagic-scripts/pom.xml
+1
-1
No files found.
pom.xml
View file @
fab44d3d
...
...
@@ -6,7 +6,7 @@
<version>
7
</version>
</parent>
<groupId>
us.codecraft.duiba
</groupId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
<modelVersion>
4.0.0
</modelVersion>
<packaging>
pom
</packaging>
<properties>
...
...
webmagic-core/pom.xml
View file @
fab44d3d
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/pom.xml
View file @
fab44d3d
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/ProxyWebDriverFactory.java
View file @
fab44d3d
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.openqa.selenium.WebDriver
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.phantomjs.PhantomJSDriver
;
import
org.openqa.selenium.remote.CapabilityType
;
import
org.openqa.selenium.remote.DesiredCapabilities
;
...
...
@@ -9,6 +10,7 @@ import org.slf4j.LoggerFactory;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.ProxyProvider
;
import
us.codecraft.webmagic.utils.ProxyUtils
;
/**
* 获取含有代理的webDriver
...
...
@@ -19,6 +21,7 @@ public class ProxyWebDriverFactory {
/**
 * Creates a {@link WebDriver} configured with the capabilities produced by
 * {@code getDesiredCapabilities} for the given proxy provider and task.
 *
 * @param proxyProvider source of the proxy to use; forwarded to {@code getDesiredCapabilities}
 * @param task          task forwarded to {@code getDesiredCapabilities}
 * @return a new {@link PhantomJSDriver} built from those capabilities
 */
public static WebDriver getProxyDriver(ProxyProvider proxyProvider, Task task) {
    // A ChromeDriver-based variant is kept here, disabled:
    // WebDriver webDriver = new ChromeDriver(desiredCapabilities);
    return new PhantomJSDriver(getDesiredCapabilities(proxyProvider, task));
}
...
...
@@ -29,11 +32,16 @@ public class ProxyWebDriverFactory {
if
(
proxyProvider
!=
null
)
{
Proxy
proxy
=
proxyProvider
.
getProxy
(
task
);
String
proxyIpAndPort
=
proxy
.
getHost
()
+
":"
+
proxy
.
getPort
();
logger
.
info
(
"使用代理IP:{}"
,
proxyIpAndPort
);
boolean
canUse
=
ProxyUtils
.
validateProxy
(
proxy
);
logger
.
info
(
"使用代理IP:{},是否可用{}"
,
proxyIpAndPort
,
canUse
);
// if (!canUse) {
// proxy = proxyProvider.getProxy(task);
// }
org
.
openqa
.
selenium
.
Proxy
seleniumProxy
=
new
org
.
openqa
.
selenium
.
Proxy
();
seleniumProxy
.
setHttpProxy
(
proxyIpAndPort
).
setFtpProxy
(
proxyIpAndPort
).
setSslProxy
(
proxyIpAndPort
);
cap
.
setCapability
(
CapabilityType
.
ForSeleniumServer
.
AVOIDING_PROXY
,
true
);
cap
.
setCapability
(
CapabilityType
.
ForSeleniumServer
.
ONLY_PROXYING_SELENIUM_TRAFFIC
,
true
);
//
cap.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
//
cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
System
.
setProperty
(
"http.nonProxyHosts"
,
"localhost"
);
cap
.
setCapability
(
CapabilityType
.
PROXY
,
seleniumProxy
);
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/SeleniumDownloader.java
View file @
fab44d3d
...
...
@@ -185,7 +185,9 @@ public class SeleniumDownloader implements Downloader, Closeable {
@Override
public
void
close
()
throws
IOException
{
webDriverPool
.
closeAll
();
if
(
webDriverPool
!=
null
)
{
webDriverPool
.
closeAll
();
}
}
public
void
setProxyProvider
(
ProxyProvider
proxyProvider
)
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/WebDriverPool.java
View file @
fab44d3d
...
...
@@ -116,7 +116,6 @@ public class WebDriverPool {
}
}
}
}
return
innerQueue
.
take
();
}
...
...
webmagic-samples/pom.xml
View file @
fab44d3d
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-samples/src/test/java/us/codecraft/webmagic/proxy/MailaProxyIpUtil.java
0 → 100644
View file @
fab44d3d
package
us
.
codecraft
.
webmagic
.
proxy
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
* Created by sunchangji on 2017/11/16.
*/
/**
 * Utility that downloads a list of proxy addresses over HTTP and converts
 * them into {@link Proxy} objects.
 *
 * <p>Created by sunchangji on 2017/11/16.
 */
public class MailaProxyIpUtil {

    private static final Logger LOGGER = LoggerFactory.getLogger(MailaProxyIpUtil.class);

    /** Endpoint that returns the proxy list as plain text, one {@code host:port} entry per line. */
    private static final String PROXY_IPS_URL = "http://pvt.daxiangdaili.com/ip/?tid=557577041643746&num=1000&delay=1"
            + "&sortby=time&protocol=https";

    /**
     * Separator characters between entries. {@code StringUtils.split} treats every
     * character of this string as a separator, so CR, LF and CRLF all work.
     */
    private static final String SEPARATOR = "\r\n";

    /** Separator between host and port inside one entry. */
    private static final String SEPARATOR_IP = ":";

    /** All timeouts (socket, connect, connection request) are 10 seconds. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setSocketTimeout(10000)
            .setConnectTimeout(10000)
            .setConnectionRequestTimeout(10000)
            .build();

    private MailaProxyIpUtil() {
        // static utility, not instantiable
    }

    /**
     * Fetches the proxy list and parses it.
     *
     * <p>Entries that are not of the form {@code host:port} with a numeric port
     * (for example a plain-text error message returned by the service) are
     * logged and skipped instead of aborting the whole list.
     *
     * @return the successfully parsed proxies; an empty array when the request
     *         failed, the response was blank, or no entry was valid. Never {@code null}.
     */
    public static Proxy[] getProxyIps() {
        String result = sendHttpGet(PROXY_IPS_URL);
        if (StringUtils.isBlank(result)) {
            return new Proxy[0];
        }

        String[] entries = StringUtils.split(result, SEPARATOR);
        Proxy[] parsed = new Proxy[entries.length];
        int count = 0;
        for (String entry : entries) {
            Proxy proxy = parseProxy(entry);
            if (proxy != null) {
                parsed[count++] = proxy;
            }
        }
        if (count == parsed.length) {
            return parsed;
        }

        // Some entries were skipped: shrink the array so callers never see null slots.
        Proxy[] compact = new Proxy[count];
        System.arraycopy(parsed, 0, compact, 0, count);
        return compact;
    }

    /**
     * Parses a single {@code host:port} entry.
     *
     * @param entry one line of the service response
     * @return the proxy, or {@code null} when the entry is malformed
     */
    private static Proxy parseProxy(String entry) {
        String[] hostPort = entry.trim().split(SEPARATOR_IP);
        if (hostPort.length < 2 || hostPort[0].trim().isEmpty()) {
            LOGGER.warn("Skip malformed proxy entry: {}", entry);
            return null;
        }
        try {
            return new Proxy(hostPort[0].trim(), Integer.parseInt(hostPort[1].trim()));
        } catch (NumberFormatException e) {
            LOGGER.warn("Skip proxy entry with invalid port: {}", entry);
            return null;
        }
    }

    /**
     * Sends a GET request and returns the response body decoded as UTF-8.
     *
     * @param httpUrl the URL to request
     * @return the response body, or an empty string when the request fails
     */
    private static String sendHttpGet(String httpUrl) {
        // Build the GET request with the shared timeout configuration.
        HttpGet httpGet = new HttpGet(httpUrl);
        httpGet.setConfig(REQUEST_CONFIG);

        String responseContent = "";
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            responseContent = EntityUtils.toString(entity, "UTF-8");
        } catch (Exception e) {
            // Best effort: a failed fetch simply yields no proxies.
            LOGGER.warn("HttpClients get proxy ip list error", e);
        }
        return responseContent;
    }
}
webmagic-samples/src/test/java/us/codecraft/webmagic/proxy/MonitorCouponTest.java
0 → 100644
View file @
fab44d3d
package
us
.
codecraft
.
webmagic
.
proxy
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.SeleniumDownloader
;
import
us.codecraft.webmagic.downloader.WebDriverPool
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Manual run: crawls one Taobao coupon page with a Selenium (Chrome)
 * downloader and prints the page HTML to standard output. Proxies returned by
 * {@link MailaProxyIpUtil#getProxyIps()} are used when any are available.
 */
public class MonitorCouponTest {

    /** Page to crawl. Alternative target used previously: http://hz.fang.com/ */
    private static final String COUPON_URL =
            "https://uland.taobao.com/coupon/edetail?e=%2Bt3mcdqXpCcGQASttHIRqUfQPq%2BJFja9CwpxDVVf0sIHyUnB597m48JdmzX2uTIj2NbK5%2FgosGvVeV%2FC36bhBsGSOsUw4E0fRtBXy%2FgzXjkFjaZhgpTjjWuFqp8TFaHM5HfRS%2B%2BJrK5WhTajwoPu9w%3D%3D&traceId=ac1d576815220294802996764d09e9&activityId=08ced324b6f04b87a9a59dd631e857bb";

    public static void main(String[] args) {
        Proxy[] candidates = MailaProxyIpUtil.getProxyIps();
        SeleniumDownloader seleniumDownloader = new SeleniumDownloader(WebDriverPool.DriverType.Chrome);

        // Only install a proxy provider when at least one proxy was fetched.
        boolean hasProxies = candidates != null && candidates.length > 0;
        if (hasProxies) {
            seleniumDownloader.setProxyProvider(SimpleProxyProvider.from(candidates));
        }

        Request couponRequest = new Request();
        couponRequest.setUrl(COUPON_URL);
        couponRequest.setMethod("get");

        Spider.create(new HtmlPrintingProcessor())
                .setDownloader(seleniumDownloader)
                .addRequest(couponRequest)
                .thread(1)
                .run();
    }

    /** Page processor that dumps the HTML of every downloaded page. */
    private static final class HtmlPrintingProcessor implements PageProcessor {

        @Override
        public void process(Page page) {
            System.out.println(page.getHtml());
        }

        /** Site settings: 1 retry, 10000 timeout, UTF-8 charset, 2000 sleep time. */
        @Override
        public Site getSite() {
            return Site.me()
                    .setRetryTimes(1)
                    .setTimeOut(10000)
                    .setCharset("UTF-8")
                    .setSleepTime(2000);
        }
    }
}
webmagic-saxon/pom.xml
View file @
fab44d3d
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-scripts/pom.xml
View file @
fab44d3d
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.8.
2
-SNAPSHOT
</version>
<version>
0.8.
3
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment