沈俊林 / webmagic / Commits

Commit 5f1f4cbc, authored Aug 17, 2013 by yihua.huang
update comments
parent 6cc1d62a

Showing 15 changed files with 152 additions and 162 deletions (+152 −162)
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java (+25 −41)
...gic-core/src/main/java/us/codecraft/webmagic/Request.java (+19 −36)
...core/src/main/java/us/codecraft/webmagic/ResultItems.java (+15 −8)
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java (+45 −41)
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java (+28 −22)
...s/codecraft/webmagic/downloader/HttpClientDownloader.java (+1 −1)
...gic-core/src/main/java/us/codecraft/webmagic/package.html (+0 −3)
...c/main/java/us/codecraft/webmagic/utils/Experimental.java (+1 −1)
...n/src/main/java/us/codecraft/webmagic/MultiPageModel.java (+1 −1)
...main/java/us/codecraft/webmagic/downloader/FileCache.java (+1 −1)
...ion/src/main/java/us/codecraft/webmagic/model/HasKey.java (+1 −1)
.../us/codecraft/webmagic/model/annotation/ComboExtract.java (+5 −2)
...ava/us/codecraft/webmagic/model/annotation/ExtractBy.java (+8 −2)
...ava/us/codecraft/webmagic/pipeline/MultiPagePipeline.java (+1 −1)
...codecraft/webmagic/scheduler/FileCacheQueueScheduler.java (+1 −1)
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

@@ -8,30 +8,19 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
- * <pre class="zh">
- * Page stores the result of the last fetch, and the links to be crawled next can be defined on it.
- *
- * Main methods:
- * {@link #getUrl()} get the url of the page
- * {@link #getHtml()} get the html content of the page
- * {@link #putField(String, Object)} save an extracted result
- * {@link #getResultItems()} get the extracted results, read in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add links to be crawled
- *
- * </pre>
- * <pre class="en">
- * Store extracted result and urls to be crawled.
- *
- * Main method:
- * {@link #getUrl()} get url of current page
- * {@link #getHtml()} get content of current page
- * {@link #putField(String, Object)} save extracted result
- * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
- *
- * </pre>
+ * Object storing extracted result and urls to be crawled.<br>
+ * Main method: <br>
+ * {@link #getUrl()} get url of current page <br>
+ * {@link #getHtml()} get content of current page <br>
+ * {@link #putField(String, Object)} save extracted result <br>
+ * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
  *
  * @author code4crafter@gmail.com <br>
  * @since 0.1.0
  * @see us.codecraft.webmagic.downloader.Downloader
  * @see us.codecraft.webmagic.processor.PageProcessor
  */
 public class Page {

@@ -55,19 +44,19 @@ public class Page {
     }
 
     /**
      * store extract results
      *
-     * @param key the key of the result
-     * @param field the value of the result
+     * @param key
+     * @param field
      */
     public void putField(String key, Object field) {
         resultItems.put(key, field);
     }
 
     /**
-     * get the html content of the page
+     * get html content of page
      *
-     * @return html the html content of the page
+     * @return html
      */
     public Selectable getHtml() {
         return html;

@@ -82,9 +71,9 @@ public class Page {
     }
 
     /**
-     * add links to be crawled
+     * add urls to crawl
      *
-     * @param requests the links to be crawled
+     * @param requests
      */
     public void addTargetRequests(List<String> requests) {
         synchronized (targetRequests) {

@@ -99,9 +88,9 @@ public class Page {
     }
 
     /**
-     * add a link to be crawled
+     * add url to crawl
      *
-     * @param requestString the link to be crawled
+     * @param requestString
      */
     public void addTargetRequest(String requestString) {
         if (StringUtils.isBlank(requestString) || requestString.equals("#")) {

@@ -114,9 +103,9 @@ public class Page {
     }
 
     /**
-     * add a page to be crawled, used when extra context must be passed along
+     * add requests to crawl
      *
-     * @param request the page to be crawled
+     * @param request
      */
     public void addTargetRequest(Request request) {
         synchronized (targetRequests) {

@@ -125,27 +114,22 @@ public class Page {
     }
 
     /**
-     * get the url of the page
+     * get url of current page
      *
-     * @return url the url of the current page, usable for extraction
+     * @return url of current page
      */
     public Selectable getUrl() {
         return url;
     }
 
-    /**
-     * set the url
-     *
-     * @param url
-     */
     public void setUrl(Selectable url) {
         this.url = url;
     }
 
     /**
-     * get the crawl request
+     * get request of current page
      *
-     * @return request the crawl request
+     * @return request
      */
     public Request getRequest() {
         return request;
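The Page methods documented above are the surface a PageProcessor works against. A minimal sketch of such a processor (the class name, XPath expressions and settings are illustrative, not part of this commit; the xpath/links/regex calls assume the Selectable API of this version):

    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.processor.PageProcessor;

    public class BlogPageProcessor implements PageProcessor {

        private Site site = Site.me().setDomain("my.oschina.net").setSleepTime(1000);

        @Override
        public void process(Page page) {
            // putField: results land in page.getResultItems() and reach the pipelines
            page.putField("title", page.getHtml().xpath("//title/text()").toString());
            // addTargetRequests: queue follow-up urls for the scheduler
            page.addTargetRequests(page.getHtml().links().regex(".*blog.*").all());
        }

        @Override
        public Site getSite() {
            return site;
        }
    }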
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java

 package us.codecraft.webmagic;
 
+import us.codecraft.webmagic.utils.Experimental;
+
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
 
 /**
- * <div class="zh">
- * A Request object wraps a url to be crawled.<br/>
- * In a PageProcessor, the Request can be obtained via {@link us.codecraft.webmagic.Page#getRequest()}.<br/>
- * <br/>
- * Request carries an extra field into which necessary context can be written; this is useful in some situations.<br/>
- * <pre>
- * Example:
- * When crawling <a href="${link}">${linktext}</a>, we want to extract the link and keep the linktext as well.
- * In the previous page:
- * public void process(Page page){
- *     Request request = new Request(link,linktext);
- *     page.addTargetRequest(request)
- * }
- * In the next page:
- * public void process(Page page){
- *     String linktext = (String)page.getRequest().getExtra()[0];
- * }
- * </pre>
- * </div>
+ * Object contains url to crawl.<br>
+ * It contains some additional information.<br>
  *
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 11:37 AM
  * @since 0.1.0
  */
 public class Request implements Serializable {

@@ -36,20 +20,22 @@ public class Request implements Serializable {
     private String url;
 
     /**
-     * extra parameters, to hold whatever context is needed
+     * Store additional information in extras.
      */
     private Map<String, Object> extras;
 
+    /**
+     * Priority of the request.<br>
+     * The bigger will be processed earlier. <br>
+     * Need a scheduler supporting priority.<br>
+     * But no scheduler in webmagic supporting priority now (:
+     */
+    @Experimental
     private double priority;
 
     public Request() {
     }
 
-    /**
-     * build a request object
-     *
-     * @param url required, the url to be crawled
-     */
     public Request(String url) {
         this.url = url;
     }

@@ -59,12 +45,14 @@ public class Request implements Serializable {
     }
 
     /**
-     * set the priority, used to sort the URL queue<br>
-     * requires a Scheduler extension<br>
-     * there is no Scheduler implementation supporting priority yet =。= <br>
-     * @param priority the priority, bigger means earlier
+     * Set the priority of request for sorting.<br>
+     * Need a scheduler supporting priority.<br>
+     * But no scheduler in webmagic supporting priority now (:
+     *
+     * @param priority
+     * @return this
      */
+    @Experimental
     public Request setPriority(double priority) {
         this.priority = priority;
         return this;

@@ -85,11 +73,6 @@ public class Request implements Serializable {
         return this;
     }
 
-    /**
-     * get the url to be crawled
-     *
-     * @return url the url to be crawled
-     */
     public String getUrl() {
         return url;
     }
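The removed Chinese example read context back with a varargs-style getExtra(); with extras now a Map, the equivalent pattern puts and reads named entries. A hedged sketch (putExtra/getExtra are assumed accessors for the extras map; only setPriority is confirmed by this diff):

    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;

    public class ExtrasDemo {

        // On the listing page: keep the anchor text together with the link.
        public void onListPage(Page page) {
            Request request = new Request("http://my.oschina.net/some-post");
            request.setPriority(1);                    // @Experimental: no scheduler honors it yet
            request.putExtra("linkText", "some post"); // assumed accessor for the extras map
            page.addTargetRequest(request);
        }

        // On the detail page: read the context back from the request.
        public void onDetailPage(Page page) {
            String linkText = (String) page.getRequest().getExtra("linkText"); // assumed accessor
            page.putField("linkText", linkText);
        }
    }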
webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java

@@ -4,10 +4,13 @@ import java.util.HashMap;
 import java.util.Map;
 
 /**
- * Holds the extracted results, produced by a PageProcessor and handed to {@link us.codecraft.webmagic.pipeline.Pipeline} for persistence.<br>
+ * Object contains extract results.<br>
+ * It is contained in Page and will be processed in pipeline.
  *
  * @author code4crafter@gmail.com <br>
  * Date: 13-7-25 <br>
  * Time: 12:20 PM <br>
  * @since 0.1.0
+ * @see Page
+ * @see us.codecraft.webmagic.pipeline.Pipeline
  */
 public class ResultItems {

@@ -25,7 +28,7 @@ public class ResultItems {
         return (T) fields.get(key);
     }
 
-    public Map<String, Object> getAll() {
+    public Map<String, Object> getAll() {
         return fields;
     }

@@ -44,8 +47,10 @@ public class ResultItems {
     }
 
     /**
-     * whether to skip this page, used by pipelines to decide whether to process it
-     * @return true if the page is to be skipped
+     * Whether to skip the result.<br>
+     * Result which is skipped will not be processed by Pipeline.
+     *
+     * @return whether to skip the result
      */
     public boolean isSkip() {
         return skip;

@@ -53,8 +58,10 @@ public class ResultItems {
     /**
-     * set whether to skip this page, used by pipelines to decide whether to process it
-     * @param skip
+     * Set whether to skip the result.<br>
+     * Result which is skipped will not be processed by Pipeline.<br>
+     *
+     * @param skip whether to skip the result
+     * @return this
      */
     public ResultItems setSkip(boolean skip) {
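A custom Pipeline consumes these ResultItems; the process(ResultItems, Task) signature is confirmed by the Spider diff below, which also shows that skipped results never reach a pipeline. A minimal console sketch (the class itself is illustrative):

    import java.util.Map;

    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.pipeline.Pipeline;

    public class ConsolePipeline implements Pipeline {

        @Override
        public void process(ResultItems resultItems, Task task) {
            // Spider only hands over results whose skip flag is false
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                System.out.println(entry.getKey() + ":\t" + entry.getValue());
            }
        }
    }

Conversely, a PageProcessor can call page.getResultItems().setSkip(true) to drop a page before it reaches any pipeline.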
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
 import java.util.*;
 
 /**
- * Site defines the settings of one site to be crawled.<br>
- * The getters of this class are generally called only by the crawler framework itself.<br>
+ * Object contains setting for crawler.<br>
  *
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 12:13 PM
  * @since 0.1.0
  * @see us.codecraft.webmagic.processor.PageProcessor
  */
 public class Site {

@@ -22,6 +21,9 @@ public class Site {
     private String charset;
 
+    /**
+     * startUrls is the urls the crawler to start with.
+     */
     private List<String> startUrls = new ArrayList<String>();
 
     private int sleepTime = 3000;

@@ -37,19 +39,19 @@ public class Site {
     }
 
     /**
-     * create a Site object, equivalent to new Site()
+     * new a Site
      *
-     * @return the newly created object
+     * @return new site
      */
     public static Site me() {
         return new Site();
     }
 
     /**
-     * add a cookie for this site, useful for sites that require login; the cookie domain is the one from {@link #getDomain()}
+     * Add a cookie with domain {@link #getDomain()}
      *
-     * @param name the name of the cookie
-     * @param value the value of the cookie
+     * @param name
+     * @param value
      * @return this
      */
     public Site addCookie(String name, String value) {

@@ -58,7 +60,7 @@ public class Site {
     }
 
     /**
-     * set the user-agent for this site; many sites restrict by user-agent, and leaving it unset may yield unexpected results
+     * set user agent
      *
      * @param userAgent userAgent
      * @return this

@@ -69,27 +71,27 @@ public class Site {
     }
 
     /**
-     * get all cookies that have been set
+     * get cookies
      *
-     * @return all cookies that have been set
+     * @return get cookies
      */
     public Map<String, String> getCookies() {
         return cookies;
     }
 
     /**
-     * get the user-agent that has been set
+     * get user agent
      *
-     * @return the user-agent that has been set
+     * @return user agent
      */
     public String getUserAgent() {
         return userAgent;
     }
 
     /**
-     * get the domain that has been set
+     * get domain
      *
-     * @return the domain that has been set
+     * @return get domain
      */
     public String getDomain() {
         if (domain == null) {

@@ -101,10 +103,9 @@ public class Site {
     }
 
     /**
-     * set the domain of the site; required.<br>
-     * Crawling several domains at once is not supported; create a new Spider per domain.
+     * set the domain of site.
      *
-     * @param domain the domain the crawler will crawl
+     * @param domain
      * @return this
      */
     public Site setDomain(String domain) {

@@ -113,10 +114,10 @@ public class Site {
     }
 
     /**
-     * set the page charset; if unset, it is detected from the html meta info.<br>
-     * Usually there is no need to set the encoding; set it if the downloaded content comes out garbled.<br>
+     * Set charset of page manually.<br>
+     * When charset is not set or set to null, it can be auto detected by Http header.
      *
-     * @param charset the encoding, mainly "utf-8" or "gbk"
+     * @param charset
      * @return this
      */
     public Site setCharset(String charset) {

@@ -125,20 +126,21 @@ public class Site {
     }
 
     /**
-     * get the charset that has been set
+     * get charset set manually
      *
-     * @return the domain that has been set
+     * @return charset
      */
     public String getCharset() {
         return charset;
     }
 
     /**
-     * set the acceptable http status codes; the page content is read only when the status code is in this set.<br>
-     * {200} by default; normally there is no need to set it.<br>
-     * Some sites return wrong status codes, in which case this option can be set.<br>
+     * Set acceptStatCode.<br>
+     * When status code of http response is in acceptStatCodes, it will be processed.<br>
+     * {200} by default.<br>
+     * It is not necessary to be set.<br>
      *
-     * @param acceptStatCode the acceptable status codes
+     * @param acceptStatCode
     * @return this
      */
     public Site setAcceptStatCode(Set<Integer> acceptStatCode) {

@@ -147,27 +149,27 @@ public class Site {
     }
 
     /**
-     * get the acceptable status codes
+     * get acceptStatCode
      *
-     * @return the acceptable status codes
+     * @return acceptStatCode
      */
     public Set<Integer> getAcceptStatCode() {
         return acceptStatCode;
     }
 
     /**
-     * get the list of start urls
+     * get start urls
      *
-     * @return the list of start urls
+     * @return start urls
      */
     public List<String> getStartUrls() {
         return startUrls;
     }
 
     /**
-     * add a start url; this method can be called repeatedly to add several start urls
+     * Add a url to start url.<br>
      *
-     * @param startUrl a start url
+     * @param startUrl
      * @return this
      */
     public Site addStartUrl(String startUrl) {

@@ -176,9 +178,10 @@ public class Site {
     }
 
     /**
-     * set the interval between two fetches, to avoid putting too much pressure on the target site (or getting blocked by its firewall...)
+     * Set the interval between the processing of two pages.<br>
+     * Time unit is milliseconds.<br>
      *
-     * @param sleepTime in milliseconds
+     * @param sleepTime
      * @return this
      */
     public Site setSleepTime(int sleepTime) {

@@ -187,25 +190,26 @@ public class Site {
     }
 
     /**
-     * get the interval between two fetches
+     * Get the interval between the processing of two pages.<br>
+     * Time unit is milliseconds.<br>
      *
-     * @return the interval between two fetches, in milliseconds
+     * @return the interval between the processing of two pages
      */
     public int getSleepTime() {
         return sleepTime;
     }
 
     /**
-     * get the retry count for re-downloading, 0 by default
+     * Get retry times when download fail, 0 by default.<br>
      *
-     * @return the retry count
+     * @return retry times when download fail
      */
     public int getRetryTimes() {
         return retryTimes;
     }
 
     /**
-     * set the retry count for re-downloading, 0 by default
+     * Set retry times when download fail, 0 by default.<br>
      *
      * @return this
      */
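All of the setters above return this, so a Site is normally built as a single chain. An illustrative configuration using only methods confirmed by this diff (the values themselves are made up):

    import us.codecraft.webmagic.Site;

    public class SiteDemo {
        public static void main(String[] args) {
            Site site = Site.me()
                    .setDomain("my.oschina.net")           // required, one domain per Spider
                    .addStartUrl("http://my.oschina.net/") // can be called repeatedly
                    .setCharset("utf-8")                   // omit to auto-detect
                    .setUserAgent("Mozilla/5.0 (webmagic)")
                    .addCookie("uid", "123")
                    .setSleepTime(1000);                   // milliseconds between two pages
            System.out.println(site.getDomain());
        }
    }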
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java

@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
- * <pre>
- * Entrance class of a webmagic crawler.
- *
- * Examples:
- * Define the simplest crawler:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
- *
- * Store results to files with FilePipeline:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *      .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
- *
- * Use FileCacheQueueScheduler to cache urls, so that after a shutdown the crawler resumes from the page where it stopped:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *      .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * </pre>
+ * Entrance of a crawler.<br>
+ * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
+ * Every module is a field of Spider. <br>
+ * The modules are defined in interface. <br>
+ * You can customize a spider with various implementations of them. <br>
+ * Examples: <br>
+ * <br>
+ * A simple crawler: <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
+ * <br>
+ * Store results to files by FilePipeline: <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
+ *      .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
+ * <br>
+ * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
+ *      .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
  *
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 6:53 AM
  * @see Downloader
  * @see Scheduler
  * @see PageProcessor
  * @see Pipeline
  * @since 0.1.0
  */
 public class Spider implements Runnable, Task {

@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
     /**
      * test the crawler with some specific urls
      *
      * @param urls the urls to crawl
      */
-    public void test(String... urls){
+    public void test(String... urls) {
         checkComponent();
-        if (urls.length > 0){
+        if (urls.length > 0) {
             for (String url : urls) {
                 processRequest(new Request(url));
             }

@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
         }
         pageProcessor.process(page);
         addRequest(page);
-        if (!page.getResultItems().isSkip()){
+        if (!page.getResultItems().isSkip()) {
             for (Pipeline pipeline : pipelines) {
                 pipeline.process(page.getResultItems(), this);
             }

@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
         return this;
     }
 
-    public Spider clearPipeline(){
-        pipelines = new ArrayList<Pipeline>();
+    public Spider clearPipeline() {
+        pipelines = new ArrayList<Pipeline>();
         return this;
     }
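The Javadoc examples wire SimplePageProcessor into a Spider; the same fluent calls work with a custom processor such as the BlogPageProcessor sketched in the Page.java section above (the file paths are the illustrative ones from the Javadoc):

    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.FilePipeline;
    import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;

    public class SpiderDemo {
        public static void main(String[] args) {
            Spider.create(new BlogPageProcessor())                      // PageProcessor
                    .pipeline(new FilePipeline("/data/temp/webmagic/")) // Pipeline
                    .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                    .run();                                             // blocking run
        }
    }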
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
      * a convenience method to download a page directly
      *
      * @param url
-     * @return
+     * @return html
      */
     public Html download(String url) {
         Page page = download(new Request(url), null);
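This convenience method allows a one-off fetch without assembling a full Spider. A hedged sketch (the Html import path and its xpath call are assumed from the selector API of this version):

    import us.codecraft.webmagic.downloader.HttpClientDownloader;
    import us.codecraft.webmagic.selector.Html; // assumed package for Html

    public class DownloadDemo {
        public static void main(String[] args) {
            Html html = new HttpClientDownloader().download("http://my.oschina.net/");
            System.out.println(html.xpath("//title/text()")); // assumed Selectable call
        }
    }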
webmagic-core/src/main/java/us/codecraft/webmagic/package.html

@@ -2,9 +2,6 @@
 <body>
 <div class="en">
     Main class "Spider" and models.
 </div>
-<div class="zh">
-    Contains the webmagic entrance class Spider and some entity classes for passing data around.
-</div>
 </body>
 </html>
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java → webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java

-package us.codecraft.webmagic.model.annotation;
+package us.codecraft.webmagic.utils;
 
 /**
  * @author code4crafter@gmail.com <br>
webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java

 package us.codecraft.webmagic;
 
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 
 import java.util.Collection;
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java

@@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.*;
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
 import us.codecraft.webmagic.processor.SimplePageProcessor;
webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java

 package us.codecraft.webmagic.model;
 
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 
 /**
  * Interface to be implemented by page models.<br>
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java

@@ -21,7 +21,7 @@ public @interface ComboExtract {
      */
     ExtractBy[] value();
 
-    enum Op {
+    public static enum Op {
         /**
          * All extractors will be arranged as a pipeline. <br>
          * The next extractor uses the result of the previous as source.

@@ -49,7 +49,10 @@ public @interface ComboExtract {
      */
     boolean notNull() default false;
 
-    public enum Source {
+    /**
+     * types of source for extracting.
+     */
+    public static enum Source {
         /**
          * extract from the content extracted by class extractor
          */
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java

@@ -21,7 +21,10 @@ public @interface ExtractBy {
      */
     String value();
 
-    public enum Type {XPath, Regex, Css}
+    /**
+     * types of extractor expressions
+     */
+    public static enum Type {XPath, Regex, Css}
 
     /**
      * Extractor type, support XPath, CSS Selector and regex.

@@ -38,7 +41,10 @@ public @interface ExtractBy {
      */
     boolean notNull() default false;
 
-    public enum Source {
+    /**
+     * types of source for extracting.
+     */
+    public static enum Source {
         /**
          * extract from the content extracted by class extractor
          */
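These annotations drive webmagic's annotation-based page models: each field is filled by the extractor named in its annotation. A hedged sketch of such a model (the class and expressions are invented; a type element with an XPath default is assumed from the Type enum and the "Extractor type" Javadoc above):

    import us.codecraft.webmagic.model.annotation.ExtractBy;

    public class BlogPost {

        @ExtractBy("//h1[@class='title']/text()") // type defaults to XPath (assumed)
        private String title;

        @ExtractBy(value = "div.content", type = ExtractBy.Type.Css) // assumed type element
        private String content;
    }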
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java

@@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
 import us.codecraft.webmagic.MultiPageModel;
 import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.model.annotation.Experimental;
+import us.codecraft.webmagic.utils.Experimental;
 import us.codecraft.webmagic.utils.DoubleKeyMap;
 
 import java.util.*;
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java

@@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
- * Store urls and cursor in files so that a Spider can resume the status when shutdown。<br>
+ * Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
  *
  * @author code4crafter@gmail.com <br>
  * @since 0.2.0