Commit b051f3e9 authored by shenjunlin's avatar shenjunlin

fix bug

parent 21bd1bee
package us.codecraft.webmagic.downloader;
import io.github.bonigarcia.wdm.PhantomJsDriverManager;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.CapabilityType;
......@@ -19,10 +17,6 @@ public class ProxyWebDriverFactory {
private static Logger logger = LoggerFactory.getLogger(ProxyWebDriverFactory.class);
// Static initializer: runs once when ProxyWebDriverFactory is loaded, i.e. before
// getProxyDriver() can construct a PhantomJSDriver. It invokes setup() on the
// PhantomJsDriverManager singleton.
// NOTE(review): forceCache()/useMirror() presumably make the manager prefer an already
// downloaded binary and a mirror download site — confirm against the WebDriverManager
// version this project depends on.
static {
PhantomJsDriverManager.getInstance().forceCache().useMirror().setup();
}
public static WebDriver getProxyDriver(ProxyProvider proxyProvider, Task task){
DesiredCapabilities desiredCapabilities = getDesiredCapabilities(proxyProvider, task);
WebDriver webDriver = new PhantomJSDriver(desiredCapabilities);
......@@ -42,10 +36,7 @@ public class ProxyWebDriverFactory {
cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
System.setProperty("http.nonProxyHosts", "localhost");
cap.setCapability(CapabilityType.PROXY, seleniumProxy);
// cap.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, Lists.newArrayList("--proxy="+ proxy.getHost() +":"+ proxy.getPort()));
}
return cap;
}
}
package us.codecraft.webmagic.downloader;
import io.github.bonigarcia.wdm.ChromeDriverManager;
import io.github.bonigarcia.wdm.InternetExplorerDriverManager;
import io.github.bonigarcia.wdm.PhantomJsDriverManager;
import org.apache.commons.lang3.SystemUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
......@@ -21,14 +25,27 @@ import java.util.Map;
/**
* 使用Selenium渲染网页进行爬取
* 每个爬取任务有单独的下载器,同一时间不建议
*/
public class SeleniumDownloader implements Downloader, Closeable {
static {
PhantomJsDriverManager.getInstance().useMirror().forceCache().setup();
if (SystemUtils.IS_OS_WINDOWS) {
ChromeDriverManager.getInstance().useMirror().forceCache().setup();
InternetExplorerDriverManager.getInstance().useMirror().forceCache().setup();
} else if (SystemUtils.IS_OS_LINUX) {
//do nothing
} else if (SystemUtils.IS_OS_MAC) {
ChromeDriverManager.getInstance().useMirror().forceCache().setup();
}
}
private volatile WebDriverPool webDriverPool;
private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0;
private int sleepTime = 20;
private int poolSize = 1;
......@@ -69,19 +86,54 @@ public class SeleniumDownloader implements Downloader, Closeable {
*/
@Override
public Page download(Request request, Task task) {
    // Obtain the driver for this request: a proxy-backed one or one taken from the pool.
    WebDriver webDriver = getWebDriver(request, task);
    if (webDriver == null) {
        // No driver could be obtained, so there is nothing to render.
        return null;
    }
    try {
        return downLoadPage(webDriver, task, request);
    } finally {
        // Release the driver even when rendering throws; otherwise a failed download
        // would leave the driver neither quit nor returned to the pool.
        handleWebDriverAfterDownload(webDriver);
    }
}
/**
 * Obtains the WebDriver to use for one download.
 *
 * When a proxy provider is configured, a driver is requested from
 * {@code ProxyWebDriverFactory}; otherwise one is taken from the driver pool
 * after {@code checkInit()} has run.
 *
 * @param request the request about to be downloaded (not used when choosing the driver)
 * @param task    the task passed on to the proxy driver factory
 * @return a driver, or {@code null} when the pool could not supply one
 */
private WebDriver getWebDriver(Request request, Task task) {
    if (proxyProvider != null) {
        return ProxyWebDriverFactory.getProxyDriver(proxyProvider, task);
    }
    checkInit();
    try {
        return webDriverPool.get();
    } catch (InterruptedException e) {
        logger.warn("interrupted", e);
        // Restore the interrupt flag so callers further up the stack can observe it.
        Thread.currentThread().interrupt();
        return null;
    } catch (Exception e) {
        logger.error(e.getMessage(), e);
        return null;
    }
}
/**
 * Disposes of a driver once a download has finished.
 *
 * @param webDriver the driver that was used for the download
 */
private void handleWebDriverAfterDownload(WebDriver webDriver) {
    if (proxyProvider == null) {
        // No proxy configured: hand the driver back to the driver pool.
        webDriverPool.returnToPool(webDriver);
        return;
    }
    // Proxy in use: the driver is destroyed outright.
    webDriver.quit();
}
/**
* 通过webDriver下载代码
* @return
*/
private Page downLoadPage(WebDriver webDriver, Task task, Request request){
logger.info("downloading page " + request.getUrl());
try {
webDriver.get(request.getUrl());
......@@ -93,6 +145,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
logger.error(e.getMessage(), e);
}
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
......@@ -104,12 +157,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
}
}
/*
* TODO You can add mouse event or other processes
*
* @author: bob.li.0718@gmail.com
*/
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
Page page = new Page();
......@@ -117,8 +164,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
//webDriverPool.closeAll();
return page;
}
......
package us.codecraft.webmagic.downloader;
import io.github.bonigarcia.wdm.*;
import org.apache.commons.lang3.SystemUtils;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.edge.EdgeDriver;
......@@ -57,7 +58,6 @@ public class WebDriverPool {
switch (this.driverType) {
case Chrome:
ChromeDriverManager.getInstance().useMirror().forceCache().setup();
mDriver = new ChromeDriver();
break;
case Opera:
......@@ -69,11 +69,9 @@ public class WebDriverPool {
mDriver = new FirefoxDriver();
break;
case PhantomJS:
PhantomJsDriverManager.getInstance().useMirror().forceCache().setup();
mDriver = new PhantomJSDriver();
break;
case Internet_Explorer:
InternetExplorerDriverManager.getInstance().useMirror().forceCache().setup();
mDriver = new InternetExplorerDriver();
break;
case Microsoft_Edge:
......@@ -110,6 +108,7 @@ public class WebDriverPool {
if (poll != null) {
return poll;
}
logger.info("现在的driver{}", webDriverList.size());
if (webDriverList.size() < capacity) {
synchronized (webDriverList) {
if (webDriverList.size() < capacity) {
......@@ -148,5 +147,4 @@ public class WebDriverPool {
webDriver.quit();
}
}
}
......@@ -12,6 +12,7 @@ import us.codecraft.webmagic.downloader.WebDriverPool;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Data5UProxyProvider;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Selectable;
......@@ -50,10 +51,11 @@ public class WeiboTopSpider implements PageProcessor {
public static void main(String[] args) throws JMException {
SeleniumDownloader downloader = new SeleniumDownloader();
// SeleniumDownloader downloader = new SeleniumDownloader();
// downloader.setProxyProvider(new Data5UProxyProvider());
downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("116.233.89.22",8060)));
ProxyProvider proxyProvider = SimpleProxyProvider.from(new Proxy("116.233.89.22",8060));
// downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("116.233.89.22",8060)));
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
......@@ -64,8 +66,10 @@ public class WeiboTopSpider implements PageProcessor {
@Override
public void run() {
    // Every run builds its own downloader and attaches the shared proxy provider to it.
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader();
    seleniumDownloader.setProxyProvider(proxyProvider);
    // Add the seed URL once and set only the downloader created above.
    Spider weiboSpider = Spider.create(new WeiboTopSpider())
            .addUrl("http://s.weibo.com/top/summary")
            .setDownloader(seleniumDownloader)
            .thread(1);
    weiboSpider.start();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment