Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
436edb27
Commit
436edb27
authored
Dec 19, 2017
by
shenjunlin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加webDriver的实现支持
parent
be892b80
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
132 additions
and
157 deletions
+132
-157
.gitignore
.gitignore
+1
-0
pom.xml
pom.xml
+1
-1
pom.xml
webmagic-selenium/pom.xml
+17
-6
SeleniumDownloader.java
...raft/webmagic/downloader/selenium/SeleniumDownloader.java
+5
-12
WebDriverPool.java
...codecraft/webmagic/downloader/selenium/WebDriverPool.java
+49
-129
SeleniumDownloaderTest.java
.../webmagic/downloader/selenium/SeleniumDownloaderTest.java
+2
-4
WebDriverPoolTest.java
...craft/webmagic/downloader/selenium/WebDriverPoolTest.java
+1
-4
HuabanProcessor.java
...t/java/us/codecraft/webmagic/samples/HuabanProcessor.java
+2
-1
WeiboTopSpider.java
...st/java/us/codecraft/webmagic/samples/WeiboTopSpider.java
+54
-0
No files found.
.gitignore
View file @
436edb27
...
...
@@ -7,3 +7,4 @@ out/
.settings/
bin/
.myeclipse
*.log
pom.xml
View file @
436edb27
...
...
@@ -83,7 +83,7 @@
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
<version>
15.0
</version>
<version>
23.5-jre
</version>
</dependency>
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
...
...
webmagic-selenium/pom.xml
View file @
436edb27
...
...
@@ -8,30 +8,41 @@
<modelVersion>
4.0.0
</modelVersion>
<artifactId>
webmagic-selenium
</artifactId>
<properties>
<webdrivermanager.version>
2.0.1
</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
<groupId>
org.seleniumhq.selenium
</groupId>
<artifactId>
selenium-java
</artifactId>
<version>
2.41.0
</version>
<version>
3.5.3
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-core
</artifactId>
<version>
${project.version}
</version>
</dependency>
<!--<dependency>-->
<!--<groupId>com.github.detro</groupId>-->
<!--<artifactId>phantomjsdriver</artifactId>-->
<!--<version>1.2.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>
com.github.detro
</groupId>
<artifactId>
phantomjsdriv
er
</artifactId>
<version>
1.2.0
</version>
<groupId>
io.github.bonigarcia
</groupId>
<artifactId>
webdrivermanag
er
</artifactId>
<version>
${webdrivermanager.version}
</version>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-extension
</artifactId>
<version>
0.7.3
</version>
</dependency>
</dependencies>
<build>
...
...
webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
View file @
436edb27
...
...
@@ -35,16 +35,10 @@ public class SeleniumDownloader implements Downloader, Closeable {
private
int
poolSize
=
1
;
private
static
final
String
DRIVER_PHANTOMJS
=
"phantomjs"
;
private
WebDriverPool
.
DriverType
driverType
;
/**
* 新建
*
* @param chromeDriverPath chromeDriverPath
*/
public
SeleniumDownloader
(
String
chromeDriverPath
)
{
System
.
getProperties
().
setProperty
(
"webdriver.chrome.driver"
,
chromeDriverPath
);
public
SeleniumDownloader
(
WebDriverPool
.
DriverType
driverType
)
{
this
.
driverType
=
driverType
;
}
/**
...
...
@@ -53,8 +47,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
* @author bob.li.0718@gmail.com
*/
public
SeleniumDownloader
()
{
// System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
this
.
driverType
=
WebDriverPool
.
DriverType
.
PhantomJS
;
}
/**
...
...
@@ -116,7 +109,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
private
void
checkInit
()
{
if
(
webDriverPool
==
null
)
{
synchronized
(
this
)
{
webDriverPool
=
new
WebDriverPool
(
poolSize
);
webDriverPool
=
new
WebDriverPool
(
poolSize
,
driverType
);
}
}
}
...
...
webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
View file @
436edb27
package
us
.
codecraft
.
webmagic
.
downloader
.
selenium
;
import
io.github.bonigarcia.wdm.*
;
import
org.apache.log4j.Logger
;
import
org.openqa.selenium.Capabilities
;
import
org.openqa.selenium.WebDriver
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.firefox.FirefoxDriver
;
import
org.openqa.selenium.ie.InternetExplorerDriver
;
import
org.openqa.selenium.opera.OperaDriver
;
import
org.openqa.selenium.phantomjs.PhantomJSDriver
;
import
org.openqa.selenium.phantomjs.PhantomJSDriverService
;
import
org.openqa.selenium.remote.DesiredCapabilities
;
import
org.openqa.selenium.remote.RemoteWebDriver
;
import
java.io.FileReader
;
import
java.io.IOException
;
import
java.net.MalformedURLException
;
import
java.net.URL
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Properties
;
import
java.util.concurrent.BlockingDeque
;
import
java.util.concurrent.LinkedBlockingDeque
;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:41 <br>
*/
class
WebDriverPool
{
public
class
WebDriverPool
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
final
static
int
DEFAULT_CAPACITY
=
5
;
...
...
@@ -35,125 +34,46 @@ class WebDriverPool {
private
final
static
int
STAT_RUNNING
=
1
;
private
final
static
int
STAT_CLO
D
ED
=
2
;
private
final
static
int
STAT_CLO
S
ED
=
2
;
private
AtomicInteger
stat
=
new
AtomicInteger
(
STAT_RUNNING
);
/*
* new fields for configuring phantomJS
*/
private
WebDriver
mDriver
=
null
;
private
boolean
mAutoQuitDriver
=
true
;
private
static
final
String
DEFAULT_CONFIG_FILE
=
"/data/webmagic/webmagic-selenium/config.ini"
;
private
static
final
String
DRIVER_FIREFOX
=
"firefox"
;
private
static
final
String
DRIVER_CHROME
=
"chrome"
;
private
static
final
String
DRIVER_PHANTOMJS
=
"phantomjs"
;
protected
static
Properties
sConfig
;
protected
static
DesiredCapabilities
sCaps
;
/**
* Configure the GhostDriver, and initialize a WebDriver instance. This part
* of code comes from GhostDriver.
* https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
*
* @author bob.li.0718@gmail.com
* @throws IOException
*/
public
void
configure
()
throws
IOException
{
// Read config file
sConfig
=
new
Properties
();
String
configFile
=
DEFAULT_CONFIG_FILE
;
if
(
System
.
getProperty
(
"selenuim_config"
)!=
null
){
configFile
=
System
.
getProperty
(
"selenuim_config"
);
}
sConfig
.
load
(
new
FileReader
(
configFile
));
// Prepare capabilities
sCaps
=
new
DesiredCapabilities
();
sCaps
.
setJavascriptEnabled
(
true
);
sCaps
.
setCapability
(
"takesScreenshot"
,
false
);
String
driver
=
sConfig
.
getProperty
(
"driver"
,
DRIVER_PHANTOMJS
);
// Fetch PhantomJS-specific configuration parameters
if
(
driver
.
equals
(
DRIVER_PHANTOMJS
))
{
// "phantomjs_exec_path"
if
(
sConfig
.
getProperty
(
"phantomjs_exec_path"
)
!=
null
)
{
sCaps
.
setCapability
(
PhantomJSDriverService
.
PHANTOMJS_EXECUTABLE_PATH_PROPERTY
,
sConfig
.
getProperty
(
"phantomjs_exec_path"
));
}
else
{
throw
new
IOException
(
String
.
format
(
"Property '%s' not set!"
,
PhantomJSDriverService
.
PHANTOMJS_EXECUTABLE_PATH_PROPERTY
));
}
// "phantomjs_driver_path"
if
(
sConfig
.
getProperty
(
"phantomjs_driver_path"
)
!=
null
)
{
System
.
out
.
println
(
"Test will use an external GhostDriver"
);
sCaps
.
setCapability
(
PhantomJSDriverService
.
PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY
,
sConfig
.
getProperty
(
"phantomjs_driver_path"
));
}
else
{
System
.
out
.
println
(
"Test will use PhantomJS internal GhostDriver"
);
}
}
private
DriverType
driverType
;
// Disable "web-security", enable all possible "ssl-protocols" and
// "ignore-ssl-errors" for PhantomJSDriver
// sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new
// String[] {
// "--web-security=false",
// "--ssl-protocol=any",
// "--ignore-ssl-errors=true"
// });
ArrayList
<
String
>
cliArgsCap
=
new
ArrayList
<
String
>();
cliArgsCap
.
add
(
"--web-security=false"
);
cliArgsCap
.
add
(
"--ssl-protocol=any"
);
cliArgsCap
.
add
(
"--ignore-ssl-errors=true"
);
sCaps
.
setCapability
(
PhantomJSDriverService
.
PHANTOMJS_CLI_ARGS
,
cliArgsCap
);
// Control LogLevel for GhostDriver, via CLI arguments
sCaps
.
setCapability
(
PhantomJSDriverService
.
PHANTOMJS_GHOSTDRIVER_CLI_ARGS
,
new
String
[]
{
"--logLevel="
+
(
sConfig
.
getProperty
(
"phantomjs_driver_loglevel"
)
!=
null
?
sConfig
.
getProperty
(
"phantomjs_driver_loglevel"
)
:
"INFO"
)
});
// String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
// Start appropriate Driver
if
(
isUrl
(
driver
))
{
sCaps
.
setBrowserName
(
"phantomjs"
);
mDriver
=
new
RemoteWebDriver
(
new
URL
(
driver
),
sCaps
);
}
else
if
(
driver
.
equals
(
DRIVER_FIREFOX
))
{
mDriver
=
new
FirefoxDriver
(
sCaps
);
}
else
if
(
driver
.
equals
(
DRIVER_CHROME
))
{
mDriver
=
new
ChromeDriver
(
sCaps
);
}
else
if
(
driver
.
equals
(
DRIVER_PHANTOMJS
))
{
mDriver
=
new
PhantomJSDriver
(
sCaps
);
}
/**
 * Kinds of browser driver a {@code WebDriverPool} can be configured with.
 *
 * <p>The declaration order is kept unchanged so that {@code ordinal()} values and
 * {@code values()} ordering stay the same for existing callers.
 */
public enum DriverType {
    Chrome,
    Firefox,
    Opera,
    PhantomJS,
    Microsoft_Edge,
    Internet_Explorer
}
/**
* check whether input is a valid URL
*
* @author bob.li.0718@gmail.com
* @param urlString urlString
* @return true means yes, otherwise no.
*/
private
boolean
isUrl
(
String
urlString
)
{
try
{
new
URL
(
urlString
);
return
true
;
}
catch
(
MalformedURLException
mue
)
{
return
false
;
/**
 * Creates a fresh browser driver for this pool's {@code driverType} and stores it in
 * {@code mDriver}.
 *
 * <p>For each supported type the matching WebDriverManager class is asked to
 * {@code useTaobaoMirror().forceCache().setup()} before the Selenium driver is instantiated.
 *
 * <p>Previously a type without a branch (for example {@code Microsoft_Edge}) fell through the
 * switch silently and left {@code mDriver} unchanged, so the failure only surfaced later as an
 * unexplained {@code NullPointerException} when the pool tried to enqueue the driver. Such
 * types are now rejected here with a descriptive message.
 *
 * @throws IOException declared for callers; this method does not throw it directly
 * @throws NullPointerException if {@code driverType} is {@code null}
 * @throws UnsupportedOperationException if no driver setup exists for {@code driverType}
 */
public void initWebDriver() throws IOException {
    // Switching on a null enum already fails with a bare NPE; keep the type, add a message.
    if (this.driverType == null) {
        throw new NullPointerException("driverType must not be null");
    }
    switch (this.driverType) {
        case Chrome:
            ChromeDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new ChromeDriver();
            break;
        case Opera:
            OperaDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new OperaDriver();
            break;
        case Firefox:
            FirefoxDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new FirefoxDriver();
            break;
        case PhantomJS:
            PhantomJsDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new PhantomJSDriver();
            break;
        case Internet_Explorer:
            InternetExplorerDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new InternetExplorerDriver();
            break;
        default:
            // Fail fast instead of handing a stale or null mDriver back to the pool.
            throw new UnsupportedOperationException(
                    "No WebDriver setup is implemented for driver type: " + this.driverType);
    }
}
...
...
@@ -168,12 +88,19 @@ class WebDriverPool {
*/
private
BlockingDeque
<
WebDriver
>
innerQueue
=
new
LinkedBlockingDeque
<
WebDriver
>();
public
WebDriverPool
(
int
capacity
,
DriverType
driverType
)
{
this
.
capacity
=
capacity
;
this
.
driverType
=
driverType
;
}
public
WebDriverPool
(
int
capacity
)
{
this
.
capacity
=
capacity
;
this
.
driverType
=
DriverType
.
PhantomJS
;
}
public
WebDriverPool
()
{
this
(
DEFAULT_CAPACITY
);
public
WebDriverPool
(
DriverType
driverType
)
{
this
.
capacity
=
DEFAULT_CAPACITY
;
this
.
driverType
=
driverType
;
}
/**
...
...
@@ -190,20 +117,13 @@ class WebDriverPool {
if
(
webDriverList
.
size
()
<
capacity
)
{
synchronized
(
webDriverList
)
{
if
(
webDriverList
.
size
()
<
capacity
)
{
// add new WebDriver instance into pool
try
{
configure
();
initWebDriver
();
innerQueue
.
add
(
mDriver
);
webDriverList
.
add
(
mDriver
);
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
// ChromeDriver e = new ChromeDriver();
// WebDriver e = getWebDriver();
// innerQueue.add(e);
// webDriverList.add(e);
}
}
...
...
@@ -223,7 +143,7 @@ class WebDriverPool {
}
public
void
closeAll
()
{
boolean
b
=
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_CLO
D
ED
);
boolean
b
=
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_CLO
S
ED
);
if
(!
b
)
{
throw
new
IllegalStateException
(
"Already closed!"
);
}
...
...
webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java
View file @
436edb27
...
...
@@ -14,12 +14,10 @@ import us.codecraft.webmagic.Task;
*/
public
class
SeleniumDownloaderTest
{
private
String
chromeDriverPath
=
"/Users/yihua/Downloads/chromedriver"
;
@Ignore
(
"need chrome driver"
)
@Test
public
void
test
()
{
SeleniumDownloader
seleniumDownloader
=
new
SeleniumDownloader
(
chromeDriverPath
);
SeleniumDownloader
seleniumDownloader
=
new
SeleniumDownloader
(
WebDriverPool
.
DriverType
.
PhantomJS
);
long
time1
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
100
;
i
++)
{
Page
page
=
seleniumDownloader
.
download
(
new
Request
(
"http://huaban.com/"
),
new
Task
()
{
...
...
@@ -41,7 +39,7 @@ public class SeleniumDownloaderTest {
@Ignore
@Test
public
void
testBaiduWenku
()
{
SeleniumDownloader
seleniumDownloader
=
new
SeleniumDownloader
(
chromeDriverPath
);
SeleniumDownloader
seleniumDownloader
=
new
SeleniumDownloader
();
seleniumDownloader
.
setSleepTime
(
10000
);
long
time1
=
System
.
currentTimeMillis
();
Page
page
=
seleniumDownloader
.
download
(
new
Request
(
"http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"
),
new
Task
()
{
...
...
webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java
View file @
436edb27
...
...
@@ -11,13 +11,10 @@ import org.openqa.selenium.WebDriver;
*/
public
class
WebDriverPoolTest
{
private
String
chromeDriverPath
=
"/Users/yihua/Downloads/chromedriver"
;
@Ignore
(
"need chrome driver"
)
@Test
public
void
test
()
{
System
.
getProperties
().
setProperty
(
"webdriver.chrome.driver"
,
chromeDriverPath
);
WebDriverPool
webDriverPool
=
new
WebDriverPool
(
5
);
WebDriverPool
webDriverPool
=
new
WebDriverPool
(
5
,
WebDriverPool
.
DriverType
.
Chrome
);
for
(
int
i
=
0
;
i
<
5
;
i
++)
{
try
{
WebDriver
webDriver
=
webDriverPool
.
get
();
...
...
webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java
View file @
436edb27
...
...
@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.selenium.SeleniumDownloader
;
import
us.codecraft.webmagic.downloader.selenium.WebDriverPool
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
...
@@ -39,7 +40,7 @@ public class HuabanProcessor implements PageProcessor {
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
HuabanProcessor
()).
thread
(
5
)
.
addPipeline
(
new
FilePipeline
(
"/data/webmagic/test/"
))
.
setDownloader
(
new
SeleniumDownloader
(
"/Users/yihua/Downloads/chromedriver"
))
.
setDownloader
(
new
SeleniumDownloader
(
WebDriverPool
.
DriverType
.
PhantomJS
))
.
addUrl
(
"http://huaban.com/"
)
.
runAsync
();
}
...
...
webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/WeiboTopSpider.java
0 → 100644
View file @
436edb27
package
us
.
codecraft
.
webmagic
.
samples
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.selenium.SeleniumDownloader
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover
;
import
us.codecraft.webmagic.selector.Selectable
;
import
javax.management.JMException
;
import
java.util.List
;
/**
 * Sample {@link PageProcessor} for the page http://s.weibo.com/top/summary.
 *
 * <p>For every {@code tr} found under {@code .hot_ranklist tbody} it extracts a keyword, a link
 * and a score, and writes each of the three values to the log at INFO level.
 */
public class WeiboTopSpider implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopSpider.class);

    /** Crawl settings: 3 retries, a sleep time of 3000 and a fixed user agent string. */
    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /**
     * Selects the ranking rows of the downloaded page and logs the fields of each one.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<Selectable> rows = page.getHtml().$(".hot_ranklist tbody").$("tr").nodes();
        for (Selectable row : rows) {
            logRow(row);
        }
    }

    /**
     * Extracts keyword, link and score from one table row and logs them in that order.
     *
     * @param row a single {@code tr} selection
     */
    private void logRow(Selectable row) {
        // The keyword text and the link target are read from the same anchor element.
        final String anchor = ".td_02 > div > p > a";
        String keyword = row.$(anchor, "text").get();
        String link = "http://s.weibo.com/" + row.$(anchor, "href").get();
        String heat = row.$(".td_03 > p > span", "text").get();

        logger.info(keyword);
        logger.info(link);
        logger.info(heat);
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded spider on the ranking page using a default
     * {@link SeleniumDownloader}.
     *
     * @param args unused
     * @throws JMException declared by the original signature
     */
    public static void main(String[] args) throws JMException {
        Spider spider = Spider.create(new WeiboTopSpider())
                .addUrl("http://s.weibo.com/top/summary")
                .setDownloader(new SeleniumDownloader())
                .thread(1);
        spider.start();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment