Commit 926dd346 authored by shenjunlin

下载完毕之后关闭driver

parent ceee1a8a
......@@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft.duiba</groupId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
......
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
package us.codecraft.webmagic.downloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
import java.io.*;
/**
 * Downloader for pages that need their JavaScript rendered before the content is usable.
 *
 * <p>Every download launches an external process of the form
 * {@code <phantomJsCommand> <crawlJsPath> <url>} and uses the text the process writes to its
 * standard output as the page content. Output containing {@code "HTTP request failed"} (the
 * marker printed by crawl.js) or a process that could not be started/read counts as a failed
 * attempt and is retried up to {@link #getRetryNum()} times.
 *
 * <p>Note: the PhantomJS command and the crawl.js path are kept in static fields that are
 * assigned by the constructors, so the most recently constructed instance decides the values
 * used by all instances.
 *
 * @author dolphineor@gmail.com
 * @version 0.5.3
 */
public class PhantomJSDownloader extends AbstractDownloader {
    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

    /** Marker that crawl.js prints when the page could not be loaded. */
    private static final String FAILURE_MARKER = "HTTP request failed";

    private static String crawlJsPath;
    private static String phantomJsCommand = "phantomjs"; // default

    private int retryNum;
    private int threadNum;

    /**
     * Creates a downloader that runs the default {@code phantomjs} command with the crawl.js
     * located at the root of this class' resource path.
     */
    public PhantomJSDownloader() {
        this.initPhantomjsCrawlPath();
    }

    /**
     * Creates a downloader with a custom PhantomJS command.
     *
     * <p>Examples:
     * <ul>
     * <li>{@code phantomjs.exe} - Windows environments</li>
     * <li>{@code phantomjs --ignore-ssl-errors=yes} - ignore some errors raised when the
     * crawled address is https</li>
     * <li>{@code /usr/local/bin/phantomjs} - absolute path of the command, which avoids an
     * IOException caused by the system environment variables</li>
     * </ul>
     *
     * @param phantomJsCommand phantomJsCommand
     */
    public PhantomJSDownloader(String phantomJsCommand) {
        this.initPhantomjsCrawlPath();
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
    }

    /**
     * Creates a downloader with a custom PhantomJS command and a custom crawl.js path. This is
     * needed when another project depends on this jar, because the phantomjs command started
     * through runtime.exec() cannot use the crawl.js packaged inside the jar.
     * <pre>
     * crawl.js start --
     *
     *   var system = require('system');
     *   var url = system.args[1];
     *
     *   var page = require('webpage').create();
     *   page.settings.loadImages = false;
     *   page.settings.resourceTimeout = 5000;
     *
     *   page.open(url, function (status) {
     *       if (status != 'success') {
     *           console.log("HTTP request failed!");
     *       } else {
     *           console.log(page.content);
     *       }
     *
     *       page.close();
     *       phantom.exit();
     *   });
     *
     * -- crawl.js end
     * </pre>
     * The JavaScript above can be copied into the depending project and used as is.
     *
     * <p>Example:
     * {@code new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");}
     *
     * @param phantomJsCommand phantomJsCommand
     * @param crawlJsPath crawlJsPath
     */
    public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
        PhantomJSDownloader.crawlJsPath = crawlJsPath;
    }

    /**
     * Points {@code crawlJsPath} at {@code crawl.js} under the root of this class' resource
     * path. The trailing space in the literal is harmless: the command line handed to
     * {@code Runtime.exec(String)} is split on whitespace.
     */
    private void initPhantomjsCrawlPath() {
        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
    }

    /**
     * Downloads the page for {@code request}, retrying failed attempts up to
     * {@link #getRetryNum()} additional times.
     *
     * @param request the request whose URL is rendered
     * @param task the task this download belongs to (not used by this downloader)
     * @return a page with raw text, URL and status code 200 on success; on failure a page that
     *         only carries the request
     */
    @Override
    public Page download(Request request, Task task) {
        if (logger.isInfoEnabled()) {
            logger.info("downloading page: " + request.getUrl());
        }
        String content = getPage(request);
        for (int attempt = 1; isFailed(content) && attempt <= getRetryNum(); attempt++) {
            content = getPage(request);
        }
        if (isFailed(content)) {
            // when failed: hand back a page that only knows its request
            logger.warn("download page " + request.getUrl() + " failed");
            Page page = new Page();
            page.setRequest(request);
            return page;
        }
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(200);
        return page;
    }

    @Override
    public void setThread(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * Runs PhantomJS for the request URL and collects everything it prints to standard output.
     *
     * @param request the request whose URL is passed to crawl.js
     * @return the process output with every line terminated by {@code "\n"}, or {@code null}
     *         when the process could not be started or its output could not be read
     */
    protected String getPage(Request request) {
        Process process = null;
        BufferedReader reader = null;
        try {
            String url = request.getUrl();
            process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + url);
            reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
            return output.toString();
        } catch (IOException e) {
            logger.error("failed to run phantomjs for " + request.getUrl(), e);
            return null;
        } finally {
            closeQuietly(reader);
            if (process != null) {
                // All output has been consumed (or reading failed); release the child process
                // and its remaining streams instead of leaking them on every download.
                process.destroy();
            }
        }
    }

    /** Tells whether an attempt produced no output at all or the crawl.js failure marker. */
    private static boolean isFailed(String content) {
        return content == null || content.contains(FAILURE_MARKER);
    }

    /** Closes the resource if present; a failure to close is only logged. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("failed to close phantomjs output stream", e);
        }
    }

    public int getRetryNum() {
        return retryNum;
    }

    public PhantomJSDownloader setRetryNum(int retryNum) {
        this.retryNum = retryNum;
        return this;
    }
}
// PhantomJS entry script: renders the URL given as the first command-line argument and
// prints the resulting document to standard output, or a failure marker when loading fails.
var args = require('system').args;
var targetUrl = args[1];

var webPage = require('webpage').create();
// Images are not needed for the markup, and a single resource may take at most 5 seconds.
webPage.settings.loadImages = false;
webPage.settings.resourceTimeout = 5000;

function report(status) {
    if (status == 'success') {
        console.log(webPage.content);
    } else {
        console.log("HTTP request failed!");
    }

    // Always release the page and terminate PhantomJS so the launching process sees EOF.
    webPage.close();
    phantom.exit();
}

webPage.open(targetUrl, report);
\ No newline at end of file
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.PhantomJSDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Created by dolphineor on 2014-11-21.
 * <p>
 * Sample that uses Taobao as an example and fetches the search results for winter clothing
 * through {@link PhantomJSDownloader}.
 */
public class PhantomJSPageProcessor implements PageProcessor {

    private final Site site = Site.me()
            .setDomain("s.taobao.com")
            .setCharset("GBK")
            .addHeader("Referer", "http://www.taobao.com/")
            .setRetryTimes(3).setSleepTime(1000);

    /**
     * Stores the raw text of the page under the {@code "html"} field when raw text is present.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getRawText() != null) {
            page.putField("html", page.getRawText());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls one Taobao search page with the PhantomJS downloader and prints the collected HTML.
     *
     * @param args not used
     * @throws Exception if the crawl fails unexpectedly
     */
    public static void main(String[] args) throws Exception {
        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
        CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();

        // (cores - 1) * 2 is 0 on a single-core machine, which is not a usable thread count,
        // so never go below one thread.
        int threadCount = Math.max(1, (Runtime.getRuntime().availableProcessors() - 1) << 1);

        Spider.create(new PhantomJSPageProcessor())
                // %B6%AC%D7%B0 is the GBK encoding of the Chinese word for "winter clothing"
                .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc")
                .setDownloader(phantomDownloader)
                .addPipeline(collectorPipeline)
                .thread(threadCount)
                .run();

        List<ResultItems> resultItemsList = collectorPipeline.getCollected();
        if (resultItemsList.isEmpty()) {
            System.out.println("no page was collected");
            return;
        }
        // The "html" field is only set when the page had raw text (see process).
        Object html = resultItemsList.get(0).get("html");
        if (html == null) {
            System.out.println("the collected page has no html content");
            return;
        }
        System.out.println(html.toString());
    }
}
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4-SNAPSHOT</version>
<version>0.7.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......@@ -16,7 +16,7 @@
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.5.3</version>
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>us.codecraft.duiba</groupId>
......@@ -35,13 +35,24 @@
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.4.3</version>
<exclusions>
<exclusion>
<artifactId>selenium-remote-driver</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
<exclusion>
<artifactId>selenium-api</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4-SNAPSHOT</version>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
......
......@@ -98,7 +98,8 @@ public class SeleniumDownloader implements Downloader, Closeable {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
//webDriverPool.returnToPool(webDriver);
webDriverPool.closeAll();
return page;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment