Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
e1e25cb5
Commit
e1e25cb5
authored
Jun 20, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update javadoc
parent
b1f023ea
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
56 additions
and
13 deletions
+56
-13
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+1
-0
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+1
-1
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+47
-11
Task.java
webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
+5
-0
DianpingProcessor.java
...java/us/codecraft/webmagic/samples/DianpingProcessor.java
+1
-1
DiandianProcessorTest.java
...s/codecraft/webmagic/processor/DiandianProcessorTest.java
+1
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
e1e25cb5
...
...
@@ -3,6 +3,7 @@ package us.codecraft.webmagic;
/**
* Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
* <br/>
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
* <pre>
* Example:
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
e1e25cb5
...
...
@@ -85,7 +85,7 @@ public class Site {
/**
* 获取已设置的domain
*
* @return
* @return
已设置的domain
*/
public
String
getDomain
()
{
return
domain
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
e1e25cb5
...
...
@@ -15,9 +15,19 @@ import java.util.List;
/**
* <pre>
* webmagic爬虫的入口类。
* 示例:
*webmagic爬虫的入口类。
*
*示例:
*定义一个最简单的爬虫:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
*
*使用FilePipeline保存结果到文件:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
*使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* </pre>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
...
...
@@ -41,36 +51,60 @@ public class Spider implements Runnable, Task {
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
public
Spider
(
PageProcessor
pageProcessor
){
/**
 * Creates a Spider driven by an already-defined extraction rule.
 *
 * @param pageProcessor the already-defined extraction rule; its Site and that
 *                      Site's start URLs are taken over by the new Spider
 */
public Spider(PageProcessor pageProcessor) {
    this.pageProcessor = pageProcessor;
    site = pageProcessor.getSite();
    // Start URLs initially come from the processor's Site.
    startUrls = pageProcessor.getSite().getStartUrls();
}
/**
 * Static factory: creates a Spider from an already-defined extraction rule.
 *
 * @param pageProcessor the already-defined extraction rule
 * @return the newly created Spider
 */
public static Spider create(PageProcessor pageProcessor) {
    Spider spider = new Spider(pageProcessor);
    return spider;
}
/**
 * Resets the start URLs, overriding the start URLs of the Site itself.
 *
 * @param startUrls the URLs to start from; the list is stored as given, not copied
 * @return this
 */
public Spider startUrls(List<String> startUrls) {
    this.startUrls = startUrls;
    return this;
}
/**
 * Replaces the start URLs with a fresh list containing only the given URL.
 *
 * @param startUrl the single URL to start from
 * @return this
 */
public Spider startUrl(String startUrl) {
    List<String> single = new ArrayList<String>();
    single.add(startUrl);
    startUrls = single;
    return this;
}
/**
 * Sets a unique ID for this spider, used to identify the task. By default the
 * domain is used as the uuid; when several tasks share a single domain, give
 * each of them a different ID.
 *
 * @param uuid the unique ID
 * @return this
 */
public Spider setUUID(String uuid) {
    this.uuid = uuid;
    return this;
}
/**
 * Sets the scheduler. The scheduler stores the URLs waiting to be crawled and
 * can take care of de-duplication, synchronization, persistence and similar
 * work. By default an in-memory blocking queue is used for scheduling.
 *
 * @param scheduler the scheduler
 * @return this
 */
public Spider scheduler(Scheduler scheduler) {
    this.scheduler = scheduler;
    return this;
}
/**
* 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。
* @param pipeline 处理管道
* @return this
*/
public
Spider
pipeline
(
Pipeline
pipeline
)
{
this
.
pipelines
.
add
(
pipeline
);
return
this
;
...
...
@@ -79,9 +113,11 @@ public class Spider implements Runnable, Task {
@Override
public
void
run
()
{
if
(
startUrls
!=
null
)
{
for
(
String
startUrl
:
startUrls
)
{
scheduler
.
push
(
new
Request
(
startUrl
),
this
);
}
}
Request
request
=
scheduler
.
poll
(
this
);
if
(
pipelines
.
isEmpty
())
{
pipelines
.
add
(
new
ConsolePipeline
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
View file @
e1e25cb5
package
us
.
codecraft
.
webmagic
;
/**
 * Abstract interface for a crawl task.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-18
 * Time: 2:57 PM
 */
public interface Task {

    /**
     * Returns a string that uniquely identifies this task, so that different
     * tasks can be told apart.
     *
     * @return uuid
     */
    String getUUID();
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
View file @
e1e25cb5
...
...
@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
/**
 * Sample entry point: builds a DianpingProcessor and runs a Spider created from it.
 *
 * @param args command-line arguments (not used in this method)
 */
public
static
void
main
(
String
[]
args
)
{
DianpingProcessor
dianpingProcessor
=
new
DianpingProcessor
();
Spider
.
create
(
dianpingProcessor
).
startUrl
(
"http://www.dianping.com/shanghai/food"
).
run
();
// NOTE(review): this view is a commit diff (+1/-1); this second run and the one above
// are presumably the old and new versions of the same statement - confirm which is current.
Spider
.
create
(
dianpingProcessor
).
run
();
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java
View file @
e1e25cb5
...
...
@@ -30,6 +30,7 @@ public class DiandianProcessorTest {
//ConsolePipeline输出结果到控制台
//FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
Spider
.
create
(
diaoyuwengProcessor
).
pipeline
(
new
ConsolePipeline
()).
pipeline
(
pipeline
).
scheduler
(
new
FileCacheQueueScheduler
(
"/data/temp/webmagic/cache/"
)).
run
();
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment