Commit 21bd1bee authored by shenjunlin's avatar shenjunlin

添加代理

parent 6720ec3c
......@@ -42,9 +42,13 @@
<version>2.53.1</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
<!-- https://mvnrepository.com/artifact/com.codeborne/phantomjsdriver -->
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.3.0</version>
<!--<groupId>com.github.detro</groupId>-->
<!--<artifactId>phantomjsdriver</artifactId>-->
<!--<version>1.2.0</version>-->
<exclusions>
<exclusion>
<artifactId>selenium-java</artifactId>
......
package us.codecraft.webmagic.downloader;
import io.github.bonigarcia.wdm.PhantomJsDriverManager;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
/**
 * Factory for PhantomJS-backed {@link WebDriver} instances whose traffic is routed through a
 * proxy supplied by a {@link ProxyProvider}.
 */
public class ProxyWebDriverFactory {

    private static final Logger logger = LoggerFactory.getLogger(ProxyWebDriverFactory.class);

    static {
        // Set up the PhantomJS driver binary once, when this class is first loaded.
        PhantomJsDriverManager.getInstance().forceCache().useMirror().setup();
    }

    /**
     * Creates a new PhantomJS driver, configured with a proxy when one is available.
     *
     * <p>Every call creates a fresh driver and this factory keeps no reference to it, so the
     * caller is responsible for quitting the returned driver.
     *
     * @param proxyProvider source of the proxy to use; may be {@code null}, in which case the
     *                      driver is created without any proxy settings
     * @param task          task handed to the provider when asking for a proxy
     * @return a newly created driver
     */
    public static WebDriver getProxyDriver(ProxyProvider proxyProvider, Task task) {
        return new PhantomJSDriver(getDesiredCapabilities(proxyProvider, task));
    }

    /**
     * Builds the driver capabilities, adding proxy settings when the provider yields a proxy.
     *
     * @param proxyProvider source of the proxy; may be {@code null}
     * @param task          task handed to the provider when asking for a proxy
     * @return capabilities with proxy settings, or empty capabilities when no proxy is available
     */
    private static DesiredCapabilities getDesiredCapabilities(ProxyProvider proxyProvider, Task task) {
        DesiredCapabilities cap = new DesiredCapabilities();
        if (proxyProvider == null) {
            return cap;
        }

        Proxy proxy = proxyProvider.getProxy(task);
        if (proxy == null) {
            // Fall back to a direct connection instead of failing with a NullPointerException.
            logger.warn("proxy provider returned no proxy, creating driver without proxy");
            return cap;
        }

        String proxyIpAndPort = proxy.getHost() + ":" + proxy.getPort();
        logger.info("使用代理IP:{}", proxyIpAndPort);

        // The same host:port is used for HTTP, FTP and SSL traffic.
        org.openqa.selenium.Proxy seleniumProxy = new org.openqa.selenium.Proxy();
        seleniumProxy.setHttpProxy(proxyIpAndPort).setFtpProxy(proxyIpAndPort).setSslProxy(proxyIpAndPort);

        cap.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
        cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
        // NOTE: this is a JVM-wide system property, not a per-driver setting.
        System.setProperty("http.nonProxyHosts", "localhost");
        cap.setCapability(CapabilityType.PROXY, seleniumProxy);
        // Alternative that is not used here: pass "--proxy=host:port" through
        // PhantomJSDriverService.PHANTOMJS_CLI_ARGS.
        return cap;
    }
}
......@@ -4,15 +4,12 @@ import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
......@@ -72,21 +69,29 @@ public class SeleniumDownloader implements Downloader, Closeable {
*/
@Override
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver = null;
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
Thread.currentThread().interrupt();
if (proxyProvider != null) {
webDriver = ProxyWebDriverFactory.getProxyDriver(proxyProvider, task);
} else {
checkInit();
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
Thread.currentThread().interrupt();
}
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
logger.info("downloading page " + request.getUrl());
try {
webDriver.get(request.getUrl());
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} catch (Exception e) {
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
} else {
logger.error(e.getMessage(), e);
}
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
......@@ -120,7 +125,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
/**
 * Lazily creates the driver pool on first use.
 *
 * <p>The null check is performed while holding the lock, so concurrent callers cannot each
 * create their own pool and overwrite one another's instance.
 */
private void checkInit() {
    synchronized (this) {
        if (webDriverPool == null) {
            webDriverPool = new WebDriverPool(driverType);
        }
    }
}
......@@ -135,23 +140,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
webDriverPool.closeAll();
}
/**
 * Builds the driver capabilities, adding proxy settings when a proxy provider is configured
 * and yields a proxy for the task.
 *
 * @param task task handed to the proxy provider when asking for a proxy
 * @return capabilities with proxy settings, or empty capabilities when no proxy is available
 */
private DesiredCapabilities getDesiredCapabilities(Task task) {
    DesiredCapabilities cap = new DesiredCapabilities();
    if (proxyProvider == null) {
        return cap;
    }

    Proxy proxy = proxyProvider.getProxy(task);
    if (proxy == null) {
        // Fall back to a direct connection instead of failing with a NullPointerException.
        logger.warn("proxy provider returned no proxy, building capabilities without proxy");
        return cap;
    }

    String proxyIpAndPort = proxy.getHost() + ":" + proxy.getPort();
    logger.info("使用代理IP:{}", proxyIpAndPort);

    // The same host:port is used for HTTP, FTP and SSL traffic.
    org.openqa.selenium.Proxy seleniumProxy = new org.openqa.selenium.Proxy();
    seleniumProxy.setHttpProxy(proxyIpAndPort).setFtpProxy(proxyIpAndPort).setSslProxy(proxyIpAndPort);

    cap.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
    cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
    // NOTE: this is a JVM-wide system property, not a per-driver setting.
    System.setProperty("http.nonProxyHosts", "localhost");
    // Register the Selenium proxy object built above, not the webmagic Proxy it was derived from.
    cap.setCapability(CapabilityType.PROXY, seleniumProxy);
    return cap;
}
public void setProxyProvider(ProxyProvider proxyProvider) {
this.proxyProvider = proxyProvider;
}
......
......@@ -10,6 +10,8 @@ import org.openqa.selenium.opera.OperaDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.ProxyProvider;
import java.io.IOException;
import java.util.ArrayList;
......@@ -25,7 +27,10 @@ import java.util.concurrent.atomic.AtomicInteger;
public class WebDriverPool {
private Logger logger = LoggerFactory.getLogger(getClass());
private final static int DEFAULT_CAPACITY = 5;
/**
* Default pool capacity: two WebDriver instances.
*/
private final static int DEFAULT_CAPACITY = 2;
private final int capacity;
......@@ -89,23 +94,13 @@ public class WebDriverPool {
*/
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<>();
/**
 * Creates a pool with an explicit capacity and driver type.
 *
 * @param capacity   value stored as this pool's capacity
 * @param driverType value stored as this pool's driver type
 */
public WebDriverPool(int capacity, DriverType driverType) {
this.capacity = capacity;
this.driverType = driverType;
}
/**
 * Creates a pool with an explicit capacity, using {@code DriverType.PhantomJS} as the driver type.
 *
 * @param capacity value stored as this pool's capacity
 */
public WebDriverPool(int capacity) {
this.capacity = capacity;
this.driverType = DriverType.PhantomJS;
}
/**
 * Creates a pool for the given driver type, using {@code DEFAULT_CAPACITY} as the capacity.
 *
 * @param driverType value stored as this pool's driver type
 */
public WebDriverPool(DriverType driverType) {
this.capacity = DEFAULT_CAPACITY;
this.driverType = driverType;
}
/**
*
*
* @return
* @throws InterruptedException
*/
......
......@@ -14,7 +14,7 @@ public class WebDriverPoolTest {
@Ignore("need chrome driver")
@Test
public void test() {
WebDriverPool webDriverPool = new WebDriverPool(5, WebDriverPool.DriverType.Chrome);
WebDriverPool webDriverPool = new WebDriverPool(WebDriverPool.DriverType.Chrome);
for (int i = 0; i < 5; i++) {
try {
WebDriver webDriver = webDriverPool.get();
......
package us.codecraft.webmagic.samples;
import org.openqa.selenium.chrome.ChromeDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
......@@ -9,6 +10,7 @@ import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.SeleniumDownloader;
import us.codecraft.webmagic.downloader.WebDriverPool;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Data5UProxyProvider;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Selectable;
......@@ -48,23 +50,26 @@ public class WeiboTopSpider implements PageProcessor {
public static void main(String[] args) throws JMException {
SeleniumDownloader downloader = new SeleniumDownloader(WebDriverPool.DriverType.Chrome);
// downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.2.2.1",1123)));
SeleniumDownloader downloader = new SeleniumDownloader();
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
executorService.scheduleAtFixedRate(new Runnable() {
// downloader.setProxyProvider(new Data5UProxyProvider());
downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("116.233.89.22",8060)));
@Override
public void run() {
Spider weiboSpider = Spider.create(new WeiboTopSpider())
.addUrl("http://s.weibo.com/top/summary").setDownloader(downloader)
.thread(1);
weiboSpider.start();
}
}, 1, 30, TimeUnit.SECONDS);
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
for(int i = 0; i< 10 ; i++) {
executorService.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
Spider weiboSpider = Spider.create(new WeiboTopSpider())
.addUrl("http://s.weibo.com/top/summary").setDownloader(downloader)
.thread(1);
weiboSpider.start();
}
}, 1, 30, TimeUnit.SECONDS);
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment