Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
926dd346
Commit
926dd346
authored
Jan 03, 2018
by
shenjunlin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
下载完毕之后关闭driver
parent
ceee1a8a
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
26 additions
and
235 deletions
+26
-235
pom.xml
pom.xml
+1
-1
pom.xml
webmagic-core/pom.xml
+1
-1
pom.xml
webmagic-extension/pom.xml
+1
-1
PhantomJSDownloader.java
...us/codecraft/webmagic/downloader/PhantomJSDownloader.java
+0
-150
crawl.js
webmagic-extension/src/main/resources/crawl.js
+0
-17
pom.xml
webmagic-samples/pom.xml
+1
-1
PhantomJSPageProcessor.java
...us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
+0
-54
pom.xml
webmagic-saxon/pom.xml
+1
-1
pom.xml
webmagic-scripts/pom.xml
+1
-1
pom.xml
webmagic-selenium/pom.xml
+18
-7
SeleniumDownloader.java
...raft/webmagic/downloader/selenium/SeleniumDownloader.java
+2
-1
No files found.
pom.xml
View file @
926dd346
...
...
@@ -6,7 +6,7 @@
<version>
7
</version>
</parent>
<groupId>
us.codecraft.duiba
</groupId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
<modelVersion>
4.0.0
</modelVersion>
<packaging>
pom
</packaging>
<properties>
...
...
webmagic-core/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
deleted
100644 → 0
View file @
ceee1a8a
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.selector.PlainText
;
import
java.io.*
;
/**
 * Downloader for pages that need their JavaScript rendered: it runs an
 * external phantomjs process on a crawl script and uses the process's
 * standard output as the page content.
 *
 * @author dolphineor@gmail.com
 * @version 0.5.3
 */
public class PhantomJSDownloader extends AbstractDownloader {

    /** Marker that the crawl script prints when the page could not be loaded. */
    private static final String FAILURE_MARKER = "HTTP request failed";

    private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

    // NOTE(review): both settings are static, so constructing another instance
    // overwrites them for every existing instance. Kept as-is for compatibility.
    private static String crawlJsPath;
    private static String phantomJsCommand = "phantomjs"; // default

    private int retryNum;
    private int threadNum;

    /**
     * Creates a downloader that uses the default {@code phantomjs} command and
     * the {@code crawl.js} located at the classpath root.
     */
    public PhantomJSDownloader() {
        this.initPhantomjsCrawlPath();
    }

    /**
     * Creates a downloader with a custom phantomjs command.
     *
     * example:
     *    phantomjs.exe                        for Windows environments
     *    phantomjs --ignore-ssl-errors=yes    ignore some errors when the crawled address is https
     *    /usr/local/bin/phantomjs             absolute path of the command, avoiding an
     *                                         IOException caused by system environment variables
     *
     * @param phantomJsCommand phantomJsCommand
     */
    public PhantomJSDownloader(String phantomJsCommand) {
        this.initPhantomjsCrawlPath();
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
    }

    /**
     * Creates a downloader with a custom phantomjs command and a custom crawl.js
     * path. This exists because, when another project depends on this jar, the
     * phantomjs command run through runtime.exec() cannot use the crawl.js that
     * is packaged inside the jar.
     * <pre>
     * crawl.js start --
     *
     *   var system = require('system');
     *   var url = system.args[1];
     *
     *   var page = require('webpage').create();
     *   page.settings.loadImages = false;
     *   page.settings.resourceTimeout = 5000;
     *
     *   page.open(url, function (status) {
     *       if (status != 'success') {
     *           console.log("HTTP request failed!");
     *       } else {
     *           console.log(page.content);
     *       }
     *
     *       page.close();
     *       phantom.exit();
     *   });
     *
     * -- crawl.js end
     * </pre>
     * The JavaScript above can be copied into your own project and used as-is.
     *
     * example:
     *    new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
     *
     * @param phantomJsCommand phantomJsCommand
     * @param crawlJsPath      crawlJsPath
     */
    public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
        PhantomJSDownloader.crawlJsPath = crawlJsPath;
    }

    /**
     * Points {@code crawlJsPath} at {@code crawl.js} under the classpath root
     * directory of this class.
     */
    private void initPhantomjsCrawlPath() {
        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
                + System.getProperty("file.separator") + "crawl.js ";
    }

    /**
     * Downloads the page for {@code request}, retrying up to
     * {@link #getRetryNum()} additional times while the attempt fails.
     *
     * @param request the request whose URL is rendered
     * @param task    the owning task (not used by this implementation)
     * @return a page with raw text, URL and status code 200 on success; on
     *         failure a page that only carries the request
     */
    @Override
    public Page download(Request request, Task task) {
        if (logger.isInfoEnabled()) {
            logger.info("downloading page: " + request.getUrl());
        }
        String content = getPage(request);
        for (int attempt = 1; isFailed(content) && attempt <= getRetryNum(); attempt++) {
            content = getPage(request);
        }
        if (isFailed(content)) {
            // when failed
            Page failedPage = new Page();
            failedPage.setRequest(request);
            return failedPage;
        }
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(200);
        return page;
    }

    /**
     * Tells whether an attempt failed: either phantomjs could not be run
     * ({@code null} content) or the crawl script printed the failure marker.
     */
    private static boolean isFailed(String content) {
        return content == null || content.contains(FAILURE_MARKER);
    }

    @Override
    public void setThread(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * Runs phantomjs on the crawl script for the request URL and collects the
     * process's standard output, one {@code "\n"} appended per line read.
     *
     * @param request the request whose URL is passed to the crawl script
     * @return the collected output, or {@code null} when an IOException occurred
     */
    protected String getPage(Request request) {
        Process process = null;
        BufferedReader reader = null;
        try {
            String url = request.getUrl();
            // The single-string exec form is kept on purpose: phantomJsCommand may
            // carry its own options (e.g. "phantomjs --ignore-ssl-errors=yes").
            process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + url);
            reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
            return output.toString();
        } catch (IOException e) {
            logger.error("failed to run phantomjs for " + request.getUrl(), e);
            return null;
        } finally {
            // Release the pipe and the child process on every path.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    logger.warn("failed to close phantomjs output stream", e);
                }
            }
            if (process != null) {
                process.destroy();
            }
        }
    }

    /**
     * @return the number of additional attempts made after a failed download
     */
    public int getRetryNum() {
        return retryNum;
    }

    /**
     * Sets the number of additional attempts made after a failed download.
     *
     * @param retryNum retry count
     * @return this downloader, for chaining
     */
    public PhantomJSDownloader setRetryNum(int retryNum) {
        this.retryNum = retryNum;
        return this;
    }
}
webmagic-extension/src/main/resources/crawl.js
deleted
100644 → 0
View file @
ceee1a8a
// PhantomJS crawl script: loads the URL given as the first script argument and
// prints either the rendered page content or a failure marker to the console.
var cliArgs = require('system').args;
var targetUrl = cliArgs[1];

var webPage = require('webpage').create();
// Skip image loading; resource timeout is 5000 (milliseconds per the PhantomJS
// settings documentation).
webPage.settings.loadImages = false;
webPage.settings.resourceTimeout = 5000;

webPage.open(targetUrl, function (result) {
    var output = (result == 'success') ? webPage.content : "HTTP request failed!";
    console.log(output);

    webPage.close();
    phantom.exit();
});
\ No newline at end of file
webmagic-samples/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
deleted
100644 → 0
View file @
ceee1a8a
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.PhantomJSDownloader
;
import
us.codecraft.webmagic.pipeline.CollectorPipeline
;
import
us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
/**
 * Created by dolphineor on 2014-11-21.
 * <p>
 * Uses Taobao as an example: searches for results related to winter clothing
 * and stores the raw page text under the {@code "html"} field.
 */
public class PhantomJSPageProcessor implements PageProcessor {

    private Site site = Site.me()
            .setDomain("s.taobao.com")
            .setCharset("GBK")
            .addHeader("Referer", "http://www.taobao.com/")
            .setRetryTimes(3)
            .setSleepTime(1000);

    /**
     * Stores the page's raw text as the {@code "html"} field when it is present.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getRawText() != null) {
            page.putField("html", page.getRawText());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the sample crawl and prints the collected HTML of the first result.
     *
     * @param args unused
     * @throws Exception propagated from the crawl
     */
    public static void main(String[] args) throws Exception {
        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);

        CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();

        // (cores - 1) * 2 is 0 on a single-core machine; always use at least one thread.
        int threadCount = Math.max(1, (Runtime.getRuntime().availableProcessors() - 1) << 1);

        Spider.create(new PhantomJSPageProcessor())
                .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") // %B6%AC%D7%B0 is the GBK encoding of "winter clothing"
                .setDownloader(phantomDownloader)
                .addPipeline(collectorPipeline)
                .thread(threadCount)
                .run();

        List<ResultItems> resultItemsList = collectorPipeline.getCollected();
        if (resultItemsList.isEmpty()) {
            System.err.println("No result was collected.");
            return;
        }
        // The field is absent when the page had no raw text (see process()).
        Object html = resultItemsList.get(0).get("html");
        if (html == null) {
            System.err.println("The first result has no \"html\" field.");
            return;
        }
        System.out.println(html.toString());
    }
}
webmagic-saxon/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-scripts/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-selenium/pom.xml
View file @
926dd346
...
...
@@ -3,7 +3,7 @@
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.
4
-SNAPSHOT
</version>
<version>
0.7.
5
-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
...
...
@@ -16,7 +16,7 @@
<dependency>
<groupId>
org.seleniumhq.selenium
</groupId>
<artifactId>
selenium-java
</artifactId>
<version>
3.
5.3
</version>
<version>
3.
8.1
</version>
</dependency>
<dependency>
<groupId>
us.codecraft.duiba
</groupId>
...
...
@@ -35,13 +35,24 @@
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<groupId>
com.codeborne
</groupId>
<artifactId>
phantomjsdriver
</artifactId>
<version>
1.4.3
</version>
<exclusions>
<exclusion>
<artifactId>
selenium-remote-driver
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
<exclusion>
<artifactId>
selenium-api
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-extension
</artifactId>
<version>
0.7.4-SNAPSHOT
</version>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
</dependencies>
...
...
webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
View file @
926dd346
...
...
@@ -98,7 +98,8 @@ public class SeleniumDownloader implements Downloader, Closeable {
page
.
setHtml
(
new
Html
(
content
,
request
.
getUrl
()));
page
.
setUrl
(
new
PlainText
(
request
.
getUrl
()));
page
.
setRequest
(
request
);
webDriverPool
.
returnToPool
(
webDriver
);
//webDriverPool.returnToPool(webDriver);
webDriverPool
.
closeAll
();
return
page
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment