Commit 436edb27 authored by shenjunlin's avatar shenjunlin

添加webDriver的实现支持

parent be892b80
......@@ -7,3 +7,4 @@ out/
.settings/
bin/
.myeclipse
*.log
......@@ -83,7 +83,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<version>23.5-jre</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
......
......@@ -8,30 +8,41 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId>
<properties>
<webdrivermanager.version>2.0.1</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.41.0</version>
<version>3.5.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>com.github.detro</groupId>-->
<!--<artifactId>phantomjsdriver</artifactId>-->
<!--<version>1.2.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
<groupId>io.github.bonigarcia</groupId>
<artifactId>webdrivermanager</artifactId>
<version>${webdrivermanager.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
</dependencies>
<build>
......
......@@ -35,16 +35,10 @@ public class SeleniumDownloader implements Downloader, Closeable {
private int poolSize = 1;
private static final String DRIVER_PHANTOMJS = "phantomjs";
private WebDriverPool.DriverType driverType;
/**
* 新建
*
* @param chromeDriverPath chromeDriverPath
*/
public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver",
chromeDriverPath);
public SeleniumDownloader(WebDriverPool.DriverType driverType) {
this.driverType = driverType;
}
/**
......@@ -53,8 +47,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
* @author bob.li.0718@gmail.com
*/
public SeleniumDownloader() {
// System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
this.driverType = WebDriverPool.DriverType.PhantomJS;
}
/**
......@@ -116,7 +109,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
private void checkInit() {
if (webDriverPool == null) {
synchronized (this) {
webDriverPool = new WebDriverPool(poolSize);
webDriverPool = new WebDriverPool(poolSize, driverType);
}
}
}
......
package us.codecraft.webmagic.downloader.selenium;
import io.github.bonigarcia.wdm.*;
import org.apache.log4j.Logger;
import org.openqa.selenium.Capabilities;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.opera.OperaDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:41 <br>
*/
class WebDriverPool {
public class WebDriverPool {
private Logger logger = Logger.getLogger(getClass());
private final static int DEFAULT_CAPACITY = 5;
......@@ -35,125 +34,46 @@ class WebDriverPool {
private final static int STAT_RUNNING = 1;
private final static int STAT_CLODED = 2;
private final static int STAT_CLOSED = 2;
private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);
/*
* new fields for configuring phantomJS
*/
private WebDriver mDriver = null;
private boolean mAutoQuitDriver = true;
private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
private static final String DRIVER_FIREFOX = "firefox";
private static final String DRIVER_CHROME = "chrome";
private static final String DRIVER_PHANTOMJS = "phantomjs";
protected static Properties sConfig;
protected static DesiredCapabilities sCaps;
/**
* Configure the GhostDriver, and initialize a WebDriver instance. This part
* of code comes from GhostDriver.
* https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
*
* @author bob.li.0718@gmail.com
* @throws IOException
*/
public void configure() throws IOException {
// Read config file
sConfig = new Properties();
String configFile = DEFAULT_CONFIG_FILE;
if (System.getProperty("selenuim_config")!=null){
configFile = System.getProperty("selenuim_config");
}
sConfig.load(new FileReader(configFile));
// Prepare capabilities
sCaps = new DesiredCapabilities();
sCaps.setJavascriptEnabled(true);
sCaps.setCapability("takesScreenshot", false);
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
// Fetch PhantomJS-specific configuration parameters
if (driver.equals(DRIVER_PHANTOMJS)) {
// "phantomjs_exec_path"
if (sConfig.getProperty("phantomjs_exec_path") != null) {
sCaps.setCapability(
PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
sConfig.getProperty("phantomjs_exec_path"));
} else {
throw new IOException(
String.format(
"Property '%s' not set!",
PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
}
// "phantomjs_driver_path"
if (sConfig.getProperty("phantomjs_driver_path") != null) {
System.out.println("Test will use an external GhostDriver");
sCaps.setCapability(
PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
sConfig.getProperty("phantomjs_driver_path"));
} else {
System.out
.println("Test will use PhantomJS internal GhostDriver");
}
}
private DriverType driverType;
// Disable "web-security", enable all possible "ssl-protocols" and
// "ignore-ssl-errors" for PhantomJSDriver
// sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new
// String[] {
// "--web-security=false",
// "--ssl-protocol=any",
// "--ignore-ssl-errors=true"
// });
ArrayList<String> cliArgsCap = new ArrayList<String>();
cliArgsCap.add("--web-security=false");
cliArgsCap.add("--ssl-protocol=any");
cliArgsCap.add("--ignore-ssl-errors=true");
sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
cliArgsCap);
// Control LogLevel for GhostDriver, via CLI arguments
sCaps.setCapability(
PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
new String[] { "--logLevel="
+ (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig
.getProperty("phantomjs_driver_loglevel")
: "INFO") });
// String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
// Start appropriate Driver
if (isUrl(driver)) {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
mDriver = new FirefoxDriver(sCaps);
} else if (driver.equals(DRIVER_CHROME)) {
mDriver = new ChromeDriver(sCaps);
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
/**
 * Browser back-ends a {@code WebDriverPool} can be asked to use.
 * <p>
 * The selected constant is stored in the pool's {@code driverType} field and
 * switched on by {@code initWebDriver()}. Note that {@code initWebDriver()}
 * does not start a browser for {@link #Microsoft_Edge}.
 */
public enum DriverType {
Chrome,
Firefox,
Opera,
PhantomJS,
Microsoft_Edge,
Internet_Explorer
}
/**
* check whether input is a valid URL
*
* @author bob.li.0718@gmail.com
* @param urlString urlString
* @return true means yes, otherwise no.
*/
private boolean isUrl(String urlString) {
try {
new URL(urlString);
return true;
} catch (MalformedURLException mue) {
return false;
/**
 * Prepares the driver binary for the configured {@code driverType} and stores a
 * newly created {@link WebDriver} in {@code mDriver}.
 * <p>
 * For each supported type the matching WebDriverManager is invoked with
 * {@code useTaobaoMirror().forceCache().setup()} before the Selenium driver is
 * instantiated.
 * <p>
 * Previously a type without a {@code case} (for example {@code Microsoft_Edge})
 * fell through the switch silently, leaving {@code mDriver} at its old value
 * (null on the first call, an already-pooled driver afterwards). The caller
 * would then enqueue that stale value. Such types now fail fast with an
 * explicit error instead.
 *
 * @throws IOException kept in the signature for existing callers that catch it
 * @throws UnsupportedOperationException if no driver is wired up for the configured type
 */
public void initWebDriver() throws IOException {
    switch (this.driverType) {
        case Chrome:
            ChromeDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new ChromeDriver();
            break;
        case Opera:
            OperaDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new OperaDriver();
            break;
        case Firefox:
            FirefoxDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new FirefoxDriver();
            break;
        case PhantomJS:
            PhantomJsDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new PhantomJSDriver();
            break;
        case Internet_Explorer:
            InternetExplorerDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
            mDriver = new InternetExplorerDriver();
            break;
        default:
            // Never hand a stale or null mDriver back to the pool for an unhandled type.
            throw new UnsupportedOperationException(
                    "Unsupported driver type: " + this.driverType);
    }
}
......@@ -168,12 +88,19 @@ class WebDriverPool {
*/
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();
/**
 * Creates a pool with an explicit capacity and browser type.
 *
 * @param capacity   value stored as this pool's capacity
 * @param driverType browser type stored for later driver creation
 */
public WebDriverPool(int capacity, DriverType driverType) {
    this.driverType = driverType;
    this.capacity = capacity;
}
/**
 * Creates a pool with the given capacity that uses PhantomJS as its browser type.
 *
 * @param capacity value stored as this pool's capacity
 */
public WebDriverPool(int capacity) {
    this(capacity, DriverType.PhantomJS);
}
public WebDriverPool() {
this(DEFAULT_CAPACITY);
public WebDriverPool(DriverType driverType) {
this.capacity = DEFAULT_CAPACITY;
this.driverType = driverType;
}
/**
......@@ -190,20 +117,13 @@ class WebDriverPool {
if (webDriverList.size() < capacity) {
synchronized (webDriverList) {
if (webDriverList.size() < capacity) {
// add new WebDriver instance into pool
try {
configure();
initWebDriver();
innerQueue.add(mDriver);
webDriverList.add(mDriver);
} catch (IOException e) {
e.printStackTrace();
}
// ChromeDriver e = new ChromeDriver();
// WebDriver e = getWebDriver();
// innerQueue.add(e);
// webDriverList.add(e);
}
}
......@@ -223,7 +143,7 @@ class WebDriverPool {
}
public void closeAll() {
boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED);
boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLOSED);
if (!b) {
throw new IllegalStateException("Already closed!");
}
......
......@@ -14,12 +14,10 @@ import us.codecraft.webmagic.Task;
*/
public class SeleniumDownloaderTest {
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
@Ignore("need chrome driver")
@Test
public void test() {
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(WebDriverPool.DriverType.PhantomJS);
long time1 = System.currentTimeMillis();
for (int i = 0; i < 100; i++) {
Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() {
......@@ -41,7 +39,7 @@ public class SeleniumDownloaderTest {
@Ignore
@Test
public void testBaiduWenku() {
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
SeleniumDownloader seleniumDownloader = new SeleniumDownloader();
seleniumDownloader.setSleepTime(10000);
long time1 = System.currentTimeMillis();
Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
......
......@@ -11,13 +11,10 @@ import org.openqa.selenium.WebDriver;
*/
public class WebDriverPoolTest {
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
@Ignore("need chrome driver")
@Test
public void test() {
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
WebDriverPool webDriverPool = new WebDriverPool(5);
WebDriverPool webDriverPool = new WebDriverPool(5, WebDriverPool.DriverType.Chrome);
for (int i = 0; i < 5; i++) {
try {
WebDriver webDriver = webDriverPool.get();
......
......@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.downloader.selenium.WebDriverPool;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
......@@ -39,7 +40,7 @@ public class HuabanProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.addPipeline(new FilePipeline("/data/webmagic/test/"))
.setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.setDownloader(new SeleniumDownloader(WebDriverPool.DriverType.PhantomJS))
.addUrl("http://huaban.com/")
.runAsync();
}
......
package us.codecraft.webmagic.samples;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
import java.util.List;
/**
 * Page processor for the Weibo hot-search ranking page,
 * http://s.weibo.com/top/summary.
 * <p>
 * For every table row found under {@code .hot_ranklist tbody} it logs the
 * keyword text, the link (prefixed with the site root) and the score text.
 */
public class WeiboTopSpider implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopSpider.class);

    /** CSS selector of the ranking table body. */
    private static final String RANK_TABLE = ".hot_ranklist tbody";
    /** CSS selector of the anchor carrying both the keyword text and the link. */
    private static final String KEYWORD_LINK = ".td_02 > div > p > a";
    /** CSS selector of the element carrying the score text. */
    private static final String SCORE_SPAN = ".td_03 > p > span";

    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /**
     * Logs keyword, link and score for each row of the ranking table.
     *
     * @param page the downloaded page to read the table from
     */
    @Override
    public void process(Page page) {
        List<Selectable> rows = page.getHtml().$(RANK_TABLE).$("tr").nodes();
        for (Selectable row : rows) {
            logRow(row);
        }
    }

    /** Extracts all three values of one table row first, then logs them in order. */
    private void logRow(Selectable row) {
        String keyword = row.$(KEYWORD_LINK, "text").get();
        String link = "http://s.weibo.com/" + row.$(KEYWORD_LINK, "href").get();
        String score = row.$(SCORE_SPAN, "text").get();
        logger.info(keyword);
        logger.info(link);
        logger.info(score);
    }

    /**
     * @return the site settings built when this processor was created
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded spider on the ranking page, using a
     * default-constructed {@link SeleniumDownloader}.
     *
     * @param args unused
     * @throws JMException declared by the original signature
     */
    public static void main(String[] args) throws JMException {
        Spider.create(new WeiboTopSpider())
                .addUrl("http://s.weibo.com/top/summary")
                .setDownloader(new SeleniumDownloader())
                .thread(1)
                .start();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment