Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
1f85674a
Commit
1f85674a
authored
Jan 22, 2017
by
xbynet
Committed by
GitHub
Jan 22, 2017
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1 from code4craft/master
test
parents
c23627bf
76076e51
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
112 additions
and
27 deletions
+112
-27
README-zh.md
README-zh.md
+2
-2
README.md
README.md
+2
-2
pom.xml
pom.xml
+2
-2
pom.xml
webmagic-core/pom.xml
+1
-1
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+1
-1
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+48
-2
pom.xml
webmagic-extension/pom.xml
+1
-1
PhantomJSDownloader.java
...us/codecraft/webmagic/downloader/PhantomJSDownloader.java
+49
-10
SpiderMonitor.java
...ain/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
+1
-1
DoubleKeyMap.java
...c/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+1
-1
pom.xml
webmagic-samples/pom.xml
+1
-1
pom.xml
webmagic-saxon/pom.xml
+1
-1
pom.xml
webmagic-scripts/pom.xml
+1
-1
pom.xml
webmagic-selenium/pom.xml
+1
-1
No files found.
README-zh.md
View file @
1f85674a
...
...
@@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-core
</artifactId>
<version>
0.6.
0
</version>
<version>
0.6.
1
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-extension
</artifactId>
<version>
0.6.
0
</version>
<version>
0.6.
1
</version>
</dependency>
```
...
...
README.md
View file @
1f85674a
...
...
@@ -23,12 +23,12 @@ Add dependencies to your pom.xml:
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-core
</artifactId>
<version>
0.6.
0
</version>
<version>
0.6.
1
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-extension
</artifactId>
<version>
0.6.
0
</version>
<version>
0.6.
1
</version>
</dependency>
```
...
...
pom.xml
View file @
1f85674a
...
...
@@ -6,7 +6,7 @@
<version>
7
</version>
</parent>
<groupId>
us.codecraft
</groupId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
<modelVersion>
4.0.0
</modelVersion>
<packaging>
pom
</packaging>
<properties>
...
...
@@ -38,7 +38,7 @@
<connection>
scm:git:git@github.com:code4craft/webmagic.git
</connection>
<developerConnection>
scm:git:git@github.com:code4craft/webmagic.git
</developerConnection>
<url>
git@github.com:code4craft/webmagic.git
</url>
<tag>
webmagic-parent-0.6.
0
</tag>
<tag>
webmagic-parent-0.6.
1
</tag>
</scm>
<licenses>
<license>
...
...
webmagic-core/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
1f85674a
...
...
@@ -292,7 +292,7 @@ public class Spider implements Runnable, Task {
}
if
(
startRequests
!=
null
)
{
for
(
Request
request
:
startRequests
)
{
scheduler
.
push
(
request
,
this
);
addRequest
(
request
);
}
startRequests
.
clear
();
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
1f85674a
...
...
@@ -18,10 +18,19 @@ import org.apache.http.impl.client.*;
import
org.apache.http.impl.conn.PoolingHttpClientConnectionManager
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.protocol.HttpContext
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
javax.net.ssl.SSLContext
;
import
javax.net.ssl.TrustManager
;
import
javax.net.ssl.X509TrustManager
;
import
java.io.IOException
;
import
java.security.KeyManagementException
;
import
java.security.NoSuchAlgorithmException
;
import
java.security.cert.CertificateException
;
import
java.security.cert.X509Certificate
;
import
java.util.Map
;
/**
...
...
@@ -30,17 +39,54 @@ import java.util.Map;
*/
public
class
HttpClientGenerator
{
private
transient
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
private
PoolingHttpClientConnectionManager
connectionManager
;
/**
 * Creates a generator backed by a pooled connection manager.
 * <p>
 * Plain "http" connections use {@code PlainConnectionSocketFactory.INSTANCE};
 * "https" connections use the factory returned by
 * {@code buildSSLConnectionSocketFactory()}. The default per-route connection
 * limit is set to 100.
 */
public HttpClientGenerator() {
    // Register each scheme exactly once. The default HTTPS socket factory used to be
    // registered first and was then immediately replaced by the second "https"
    // registration, so it never took effect.
    Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", PlainConnectionSocketFactory.INSTANCE)
            .register("https", buildSSLConnectionSocketFactory())
            .build();
    connectionManager = new PoolingHttpClientConnectionManager(reg);
    connectionManager.setDefaultMaxPerRoute(100);
}
/**
 * Builds the socket factory used for "https" connections.
 * <p>
 * The preferred factory is built from the context returned by
 * {@code createIgnoreVerifySSL()}, i.e. it bypasses certificate verification.
 * If that context cannot be created, the failure is logged and the default
 * {@code SSLConnectionSocketFactory.getSocketFactory()} is used instead.
 *
 * @return the socket factory to register for the "https" scheme
 */
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
    SSLConnectionSocketFactory factory = null;
    try {
        // Preferred path: bypass certificate verification.
        factory = new SSLConnectionSocketFactory(createIgnoreVerifySSL());
    } catch (KeyManagementException e) {
        logger.error("ssl connection fail", e);
    } catch (NoSuchAlgorithmException e) {
        logger.error("ssl connection fail", e);
    }
    if (factory == null) {
        // Context creation failed above; fall back to the library default.
        factory = SSLConnectionSocketFactory.getSocketFactory();
    }
    return factory;
}
/**
 * Creates an {@link SSLContext} whose trust manager accepts every certificate chain.
 * <p>
 * NOTE(security): the returned context performs no certificate validation at all,
 * so connections made with it are open to man-in-the-middle attacks. This is kept
 * because bypassing verification is the stated purpose of this method.
 *
 * @return an initialized SSL context that skips certificate verification
 * @throws NoSuchAlgorithmException if no provider supports the requested protocol
 * @throws KeyManagementException   if the context cannot be initialized
 */
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
    // Trust manager whose check methods are deliberately empty, so no chain is rejected.
    X509TrustManager trustManager = new X509TrustManager() {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: never rejects.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: never rejects.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };

    // Request the generic "TLS" context instead of "SSLv3": SSLv3 is disabled by
    // default in current JDKs, so a context limited to it cannot complete a
    // handshake with modern servers.
    SSLContext sc = SSLContext.getInstance("TLS");
    sc.init(null, new TrustManager[] {trustManager}, null);
    return sc;
}
public
HttpClientGenerator
setPoolSize
(
int
poolSize
)
{
connectionManager
.
setMaxTotal
(
poolSize
);
return
this
;
...
...
webmagic-extension/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
View file @
1f85674a
...
...
@@ -29,6 +29,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
public PhantomJSDownloader() {
    // Only the default crawl.js location is resolved here.
    initPhantomjsCrawlPath();
}
/**
* 添加新的构造函数,支持phantomjs自定义命令
*
...
...
@@ -37,13 +38,51 @@ public class PhantomJSDownloader extends AbstractDownloader {
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
*
* @param phantomJsCommand
phantomJsCommand
* @param
phantomJsCommand
*/
public PhantomJSDownloader(String phantomJsCommand) {
    // Resolve the default crawl.js location first, then record the custom command.
    initPhantomjsCrawlPath();
    // Assigned through the class name, i.e. this is a class-level (static) field.
    PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
/**
 * Constructor that also accepts a custom path to crawl.js. When another project
 * depends on this jar, the phantomjs command run through runtime.exec() cannot
 * use the crawl.js that is packaged inside the jar.
 * <pre>
 * crawl.js start --
 *
 *   var system = require('system');
 *   var url = system.args[1];
 *
 *   var page = require('webpage').create();
 *   page.settings.loadImages = false;
 *   page.settings.resourceTimeout = 5000;
 *
 *   page.open(url, function (status) {
 *       if (status != 'success') {
 *           console.log("HTTP request failed!");
 *       } else {
 *           console.log(page.content);
 *       }
 *
 *       page.close();
 *       phantom.exit();
 *   });
 *
 * -- crawl.js end
 * </pre>
 * In your own project you can copy the script above and use it.
 *
 * example:
 *    new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
 *
 * @param phantomJsCommand phantomJsCommand
 * @param crawlJsPath crawlJsPath
 */
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
    // Both targets are assigned through the class name, i.e. they are class-level
    // (static) fields rather than per-instance state.
    PhantomJSDownloader.crawlJsPath = crawlJsPath;
    PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
/**
 * Sets {@code crawlJsPath} to a {@code crawl.js} entry inside the directory that
 * {@code getResource("/")} resolves to for this class.
 */
private void initPhantomjsCrawlPath() {
    String resourceRoot = new File(this.getClass().getResource("/").getPath()).getPath();
    String separator = System.getProperty("file.separator");
    // The literal ends with a space; that space is part of the stored value.
    PhantomJSDownloader.crawlJsPath = resourceRoot + separator + "crawl.js ";
}
...
...
@@ -86,7 +125,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
try
{
String
url
=
request
.
getUrl
();
Runtime
runtime
=
Runtime
.
getRuntime
();
Process
process
=
runtime
.
exec
(
phantomJsCommand
+
" "
+
crawlJsPath
+
url
);
Process
process
=
runtime
.
exec
(
phantomJsCommand
+
" "
+
crawlJsPath
+
" "
+
url
);
InputStream
is
=
process
.
getInputStream
();
BufferedReader
br
=
new
BufferedReader
(
new
InputStreamReader
(
is
));
StringBuffer
stringBuffer
=
new
StringBuffer
();
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
View file @
1f85674a
...
...
@@ -45,7 +45,7 @@ public class SpiderMonitor {
*
* @param spiders spiders
* @return this
* @throws JMException
* @throws JMException
JMException
*/
public
synchronized
SpiderMonitor
register
(
Spider
...
spiders
)
throws
JMException
{
for
(
Spider
spider
:
spiders
)
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
View file @
1f85674a
...
...
@@ -102,7 +102,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
/**
* @param key1 key1
* @return
* @return
map
*/
public
Map
<
K2
,
V
>
remove
(
K1
key1
)
{
Map
<
K2
,
V
>
remove
=
map
.
remove
(
key1
);
...
...
webmagic-samples/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-saxon/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-scripts/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-selenium/pom.xml
View file @
1f85674a
...
...
@@ -3,7 +3,7 @@
<parent>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<version>
0.6.
1
-SNAPSHOT
</version>
<version>
0.6.
2
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment