Commit 5cd56273 authored by shenjunlin's avatar shenjunlin

修改版本

parent fe736cee
......@@ -82,7 +82,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.5-jre</version>
<version>18.0</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
......
......@@ -39,23 +39,24 @@
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.8.1</version>
<version>2.53.1</version>
</dependency>
<dependency>
<groupId>com.codeborne</groupId>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.4.3</version>
<version>1.2.0</version>
<exclusions>
<exclusion>
<artifactId>selenium-remote-driver</artifactId>
<artifactId>selenium-java</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
<exclusion>
<artifactId>selenium-api</artifactId>
<artifactId>selenium-remote-driver</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
......
......@@ -4,12 +4,16 @@ import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
......@@ -33,6 +37,8 @@ public class SeleniumDownloader implements Downloader, Closeable {
private WebDriverPool.DriverType driverType;
private ProxyProvider proxyProvider;
/**
 * Creates a downloader configured for the given kind of web driver.
 *
 * @param driverType the driver type stored on this downloader
 */
public SeleniumDownloader(WebDriverPool.DriverType driverType) {
this.driverType = driverType;
}
......@@ -57,22 +63,30 @@ public class SeleniumDownloader implements Downloader, Closeable {
return this;
}
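/**
 * With a dynamic proxy: the driver is not taken from the pool, and it is closed as soon as each request finishes.
 * Without a dynamic proxy: the driver is taken from the pool, and the pool is emptied after each crawl task ends.
 * @param request request
 * @param task task
 * @return the downloaded page
 */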
@Override
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver;
WebDriver webDriver = null;
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
Thread.currentThread().interrupt();
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
e.printStackTrace();
Thread.currentThread().interrupt();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
......@@ -120,4 +134,25 @@ public class SeleniumDownloader implements Downloader, Closeable {
/**
 * Closes every web driver held by the driver pool.
 *
 * <p>Does nothing when the pool has not been created. NOTE(review): the pool appears to be
 * set up by {@code checkInit()}, which is called from {@code download}; closing a
 * downloader that never downloaded anything would otherwise fail with a
 * {@link NullPointerException} -- confirm against {@code checkInit()}.
 *
 * @throws IOException declared by the {@code Closeable} contract
 */
public void close() throws IOException {
    if (webDriverPool != null) {
        webDriverPool.closeAll();
    }
}
/**
 * Builds the capabilities used to start a web driver for the given task.
 *
 * <p>When no proxy provider is configured, or the provider yields no proxy for the task,
 * the returned capabilities carry no proxy settings. Otherwise the selected proxy's
 * {@code host:port} is applied to HTTP, FTP and SSL traffic.
 *
 * @param task the crawl task, handed to the proxy provider to select a proxy
 * @return the capabilities for the driver, never {@code null}
 */
private DesiredCapabilities getDesiredCapabilities(Task task) {
    DesiredCapabilities cap = new DesiredCapabilities();
    if (proxyProvider == null) {
        return cap;
    }

    Proxy proxy = proxyProvider.getProxy(task);
    if (proxy == null) {
        // The provider had nothing for this task; return capabilities without a proxy
        // instead of failing on proxy.getHost().
        return cap;
    }

    String proxyIpAndPort = proxy.getHost() + ":" + proxy.getPort();
    logger.info("使用代理IP:{}", proxyIpAndPort);

    org.openqa.selenium.Proxy seleniumProxy = new org.openqa.selenium.Proxy();
    seleniumProxy.setHttpProxy(proxyIpAndPort).setFtpProxy(proxyIpAndPort).setSslProxy(proxyIpAndPort);

    cap.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
    cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
    // JVM-wide setting: excludes localhost from the JVM's own HTTP proxying.
    System.setProperty("http.nonProxyHosts", "localhost");
    // The capability must carry the Selenium proxy object configured above; the webmagic
    // Proxy is only the source of host and port.
    cap.setCapability(CapabilityType.PROXY, seleniumProxy);
    return cap;
}
/**
 * Sets the provider that {@code getDesiredCapabilities} asks for a proxy per task.
 *
 * @param proxyProvider the proxy source; when {@code null}, no proxy settings are added
 *                      to the driver capabilities
 */
public void setProxyProvider(ProxyProvider proxyProvider) {
this.proxyProvider = proxyProvider;
}
}
......@@ -123,7 +123,7 @@ public class WebDriverPool {
innerQueue.add(mDriver);
webDriverList.add(mDriver);
} catch (IOException e) {
e.printStackTrace();
logger.error(e.getMessage(), e);
}
}
}
......
......@@ -5,8 +5,12 @@ import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.SeleniumDownloader;
import us.codecraft.webmagic.downloader.WebDriverPool;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
......@@ -44,13 +48,16 @@ public class WeiboTopSpider implements PageProcessor {
public static void main(String[] args) throws JMException {
SeleniumDownloader downloader = new SeleniumDownloader(WebDriverPool.DriverType.Chrome);
// downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.2.2.1",1123)));
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
executorService.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
Spider weiboSpider = Spider.create(new WeiboTopSpider())
.addUrl("http://s.weibo.com/top/summary").setDownloader(new SeleniumDownloader())
.addUrl("http://s.weibo.com/top/summary").setDownloader(downloader)
.thread(1);
weiboSpider.start();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment