Commit 1f85674a authored by xbynet's avatar xbynet Committed by GitHub

Merge pull request #1 from code4craft/master

test
parents c23627bf 76076e51
...@@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w ...@@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.6.0</version> <version>0.6.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.6.0</version> <version>0.6.1</version>
</dependency> </dependency>
``` ```
......
...@@ -23,12 +23,12 @@ Add dependencies to your pom.xml: ...@@ -23,12 +23,12 @@ Add dependencies to your pom.xml:
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.6.0</version> <version>0.6.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.6.0</version> <version>0.6.1</version>
</dependency> </dependency>
``` ```
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
<version>7</version> <version>7</version>
</parent> </parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection> <connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection> <developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
<url>git@github.com:code4craft/webmagic.git</url> <url>git@github.com:code4craft/webmagic.git</url>
<tag>webmagic-parent-0.6.0</tag> <tag>webmagic-parent-0.6.1</tag>
</scm> </scm>
<licenses> <licenses>
<license> <license>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -292,7 +292,7 @@ public class Spider implements Runnable, Task { ...@@ -292,7 +292,7 @@ public class Spider implements Runnable, Task {
} }
if (startRequests != null) { if (startRequests != null) {
for (Request request : startRequests) { for (Request request : startRequests) {
scheduler.push(request, this); addRequest(request);
} }
startRequests.clear(); startRequests.clear();
} }
......
...@@ -18,10 +18,19 @@ import org.apache.http.impl.client.*; ...@@ -18,10 +18,19 @@ import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext; import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.Proxy;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException; import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map; import java.util.Map;
/** /**
...@@ -29,18 +38,55 @@ import java.util.Map; ...@@ -29,18 +38,55 @@ import java.util.Map;
* @since 0.4.0 * @since 0.4.0
*/ */
public class HttpClientGenerator { public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager; private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() { public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create() Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE) .register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", SSLConnectionSocketFactory.getSocketFactory()) .register("https", buildSSLConnectionSocketFactory())
.build(); .build();
connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100); connectionManager.setDefaultMaxPerRoute(100);
} }
/**
 * Creates the socket factory registered for the "https" scheme.
 *
 * The factory backed by the context from {@link #createIgnoreVerifySSL()} is
 * preferred. If that context cannot be created, the failure is logged and the
 * HttpClient default socket factory is returned instead.
 *
 * @return the socket factory to use for https connections
 */
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
    SSLContext ignoreVerifyContext;
    try {
        // Preferred path: a context that bypasses certificate verification.
        ignoreVerifyContext = createIgnoreVerifySSL();
    } catch (KeyManagementException e) {
        logger.error("ssl connection fail", e);
        return SSLConnectionSocketFactory.getSocketFactory();
    } catch (NoSuchAlgorithmException e) {
        logger.error("ssl connection fail", e);
        return SSLConnectionSocketFactory.getSocketFactory();
    }
    return new SSLConnectionSocketFactory(ignoreVerifyContext);
}
/**
 * Builds an {@link SSLContext} whose only trust manager accepts every
 * certificate chain, so certificate verification is effectively bypassed.
 *
 * The context is requested with the generic "TLS" protocol name rather than
 * "SSLv3": SSLv3 is obsolete and disabled by default in current JDKs, and a
 * context requested as "SSLv3" may be unable to negotiate the TLS versions
 * that servers require today.
 *
 * NOTE(review): trusting every certificate removes protection against
 * man-in-the-middle attacks. This is kept because it is the stated purpose
 * of the method; confirm it is acceptable for every caller.
 *
 * @return an initialized context that performs no certificate checks
 * @throws NoSuchAlgorithmException if no provider supports the requested protocol
 * @throws KeyManagementException   if initializing the context fails
 */
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
    // X509TrustManager used to bypass verification: both checks return
    // without throwing, so no certificate is ever rejected.
    X509TrustManager trustManager = new X509TrustManager() {

        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: every client chain is accepted.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: every server chain is accepted.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The interface contract asks for a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };

    SSLContext sc = SSLContext.getInstance("TLS");
    // Default key managers and default SecureRandom; only the trust manager is replaced.
    sc.init(null, new TrustManager[] { trustManager }, null);
    return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) { public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize); connectionManager.setMaxTotal(poolSize);
return this; return this;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -29,21 +29,60 @@ public class PhantomJSDownloader extends AbstractDownloader { ...@@ -29,21 +29,60 @@ public class PhantomJSDownloader extends AbstractDownloader {
public PhantomJSDownloader() { public PhantomJSDownloader() {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
} }
/** /**
* 添加新的构造函数,支持phantomjs自定义命令 * 添加新的构造函数,支持phantomjs自定义命令
* *
* example: * example:
* phantomjs.exe 支持windows环境 * phantomjs.exe 支持windows环境
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
* *
* @param phantomJsCommand phantomJsCommand * @param phantomJsCommand
*/ */
public PhantomJSDownloader(String phantomJsCommand) { public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
} }
/**
 * Additional constructor that lets the caller supply the path of crawl.js.
 * It exists because, when another project depends on this jar, the phantomjs
 * command started through runtime.exec() cannot use the crawl.js packaged
 * inside the jar.
 * <pre>
 * crawl.js start --
 *
 *   var system = require('system');
 *   var url = system.args[1];
 *
 *   var page = require('webpage').create();
 *   page.settings.loadImages = false;
 *   page.settings.resourceTimeout = 5000;
 *
 *   page.open(url, function (status) {
 *       if (status != 'success') {
 *           console.log("HTTP request failed!");
 *       } else {
 *           console.log(page.content);
 *       }
 *
 *       page.close();
 *       phantom.exit();
 *   });
 *
 * -- crawl.js end
 * </pre>
 * In your own project you can copy the JavaScript above into a file and use it.
 *
 * example:
 *   new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
 *
 * Both values are stored through the class name rather than on the instance,
 * so they are presumably static and shared by all downloaders (confirm against
 * the field declarations). This constructor stores the given script path as-is
 * and does not call initPhantomjsCrawlPath().
 *
 * @param phantomJsCommand the phantomjs command to execute
 * @param crawlJsPath path of the crawl.js script passed to phantomjs
 */
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
private void initPhantomjsCrawlPath() { private void initPhantomjsCrawlPath() {
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
} }
...@@ -86,7 +125,7 @@ public class PhantomJSDownloader extends AbstractDownloader { ...@@ -86,7 +125,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
try { try {
String url = request.getUrl(); String url = request.getUrl();
Runtime runtime = Runtime.getRuntime(); Runtime runtime = Runtime.getRuntime();
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + url); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
InputStream is = process.getInputStream(); InputStream is = process.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is)); BufferedReader br = new BufferedReader(new InputStreamReader(is));
StringBuffer stringBuffer = new StringBuffer(); StringBuffer stringBuffer = new StringBuffer();
......
...@@ -45,7 +45,7 @@ public class SpiderMonitor { ...@@ -45,7 +45,7 @@ public class SpiderMonitor {
* *
* @param spiders spiders * @param spiders spiders
* @return this * @return this
* @throws JMException * @throws JMException JMException
*/ */
public synchronized SpiderMonitor register(Spider... spiders) throws JMException { public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
for (Spider spider : spiders) { for (Spider spider : spiders) {
......
...@@ -102,7 +102,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase { ...@@ -102,7 +102,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
/** /**
* @param key1 key1 * @param key1 key1
* @return * @return map
*/ */
public Map<K2, V> remove(K1 key1) { public Map<K2, V> remove(K1 key1) {
Map<K2, V> remove = map.remove(key1); Map<K2, V> remove = map.remove(key1);
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.6.1-SNAPSHOT</version> <version>0.6.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment