Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
7bed01c9
Commit
7bed01c9
authored
Jun 19, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update Spider api
parent
69ff524d
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
161 additions
and
64 deletions
+161
-64
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+100
-14
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+24
-21
FileCacheQueueScheduler.java
...codecraft/webmagic/schedular/FileCacheQueueScheduler.java
+2
-2
QueueScheduler.java
.../java/us/codecraft/webmagic/schedular/QueueScheduler.java
+1
-1
Scheduler.java
.../main/java/us/codecraft/webmagic/schedular/Scheduler.java
+1
-1
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+9
-5
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+4
-0
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+1
-1
DianpingProcessor.java
...java/us/codecraft/webmagic/samples/DianpingProcessor.java
+1
-1
SpiderTest.java
...mples/src/test/java/us/codecraft/webmagic/SpiderTest.java
+5
-5
DiandianProcessorTest.java
...s/codecraft/webmagic/processor/DiandianProcessorTest.java
+3
-3
DiaoyuwengProcessorTest.java
...codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
+5
-5
SinablogProcessorTest.java
...s/codecraft/webmagic/processor/SinablogProcessorTest.java
+5
-5
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
7bed01c9
...
...
@@ -3,10 +3,12 @@ package us.codecraft.webmagic;
import
java.util.*
;
/**
* Site定义一个待抓取的站点的各种信息。
* Site定义一个待抓取的站点的各种信息。<br>
* 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午12:13
*
Date: 13-4-21
*
Time: 下午12:13
*/
public
class
Site
{
...
...
@@ -30,73 +32,157 @@ public class Site {
DEFAULT_STATUS_CODE_SET
.
add
(
200
);
}
/**
* 创建一个Site对象,等价于new Site()
*
* @return 新建的对象
*/
public
static
Site
me
()
{
return
new
Site
();
}
public
Site
setCookie
(
String
name
,
String
value
)
{
/**
* 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
*
* @param name cookie的名称
* @param value cookie的值
* @return this
*/
public
Site
addCookie
(
String
name
,
String
value
)
{
cookies
.
put
(
name
,
value
);
return
this
;
}
/**
* 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
*
* @param userAgent userAgent
* @return this
*/
public
Site
setUserAgent
(
String
userAgent
)
{
this
.
userAgent
=
userAgent
;
return
this
;
}
/**
* 获取已经设置的所有cookie
*
* @return 已经设置的所有cookie
*/
public
Map
<
String
,
String
>
getCookies
()
{
return
cookies
;
}
/**
* 获取已设置的user-agent
*
* @return 已设置的user-agent
*/
public
String
getUserAgent
()
{
return
userAgent
;
}
/**
* 获取已设置的domain
*
* @return 已设置的domain
*/
public
String
getDomain
()
{
return
domain
;
}
/**
* 设置这个站点所在域名,必须项。<br>
* 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
*
* @param domain 爬虫会抓取的域名
* @return this
*/
public
Site
setDomain
(
String
domain
)
{
this
.
domain
=
domain
;
return
this
;
}
public
String
getEncoding
()
{
return
encoding
;
}
/**
* 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
* 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
*
* @param encoding 编码格式,主要是"utf-8"、"gbk"两种
* @return this
*/
public
Site
setEncoding
(
String
encoding
)
{
this
.
encoding
=
encoding
;
return
this
;
}
public
Set
<
Integer
>
getAcceptStatCode
()
{
return
acceptStatCode
;
/**
* 获取已设置的编码
*
* @return 已设置的编码
*/
public
String
getEncoding
()
{
return
encoding
;
}
/**
* 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。<br>
* 默认为200,正常情况下,无须设置此项。<br>
* 某些站点会错误的返回状态码,此时可以对这个选项进行设置。<br>
*
* @param acceptStatCode 可接受的状态码
* @return this
*/
public
Site
setAcceptStatCode
(
Set
<
Integer
>
acceptStatCode
)
{
this
.
acceptStatCode
=
acceptStatCode
;
return
this
;
}
/**
* 获取可接受的状态码
*
* @return 可接受的状态码
*/
public
Set
<
Integer
>
getAcceptStatCode
()
{
return
acceptStatCode
;
}
/**
* 获取初始页面的地址列表
* @return 初始页面的地址列表
*/
public
List
<
String
>
getStartUrls
()
{
return
startUrls
;
}
/**
* 增加初始页面的地址,可反复调用此方法增加多个初始地址。
* @param startUrl 初始页面的地址
* @return this
*/
public
Site
addStartUrl
(
String
startUrl
)
{
this
.
startUrls
.
add
(
startUrl
);
return
this
;
}
public
int
getSleepTime
()
{
return
sleepTime
;
}
/**
* 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
*
* @param sleepTime 单位毫秒
* @return this
*/
public
Site
setSleepTime
(
int
sleepTime
)
{
this
.
sleepTime
=
sleepTime
;
return
this
;
}
/**
* 获取两次抓取之间的间隔
* @return 两次抓取之间的间隔,单位毫秒
*/
public
int
getSleepTime
()
{
return
sleepTime
;
}
@Override
public
boolean
equals
(
Object
o
)
{
if
(
this
==
o
)
return
true
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
7bed01c9
...
...
@@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.schedular.QueueSchedul
a
r
;
import
us.codecraft.webmagic.schedular.Schedul
a
r
;
import
us.codecraft.webmagic.schedular.QueueSchedul
e
r
;
import
us.codecraft.webmagic.schedular.Schedul
e
r
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
* <pre>
* webmagic爬虫的入口类。
* 示例:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
* </pre>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午6:53
...
...
@@ -32,18 +37,17 @@ public class Spider implements Runnable, Task {
private
String
uuid
;
private
Schedul
ar
schedular
=
new
QueueSchedula
r
();
private
Schedul
er
scheduler
=
new
QueueSchedule
r
();
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
public
static
Spider
me
()
{
return
new
Spider
();
}
public
Spider
processor
(
PageProcessor
pageProcessor
)
{
public
Spider
(
PageProcessor
pageProcessor
){
this
.
pageProcessor
=
pageProcessor
;
this
.
site
=
pageProcessor
.
getSite
();
return
this
;
}
public
static
Spider
create
(
PageProcessor
pageProcessor
)
{
return
new
Spider
(
pageProcessor
);
}
public
Spider
startUrls
(
List
<
String
>
startUrls
)
{
...
...
@@ -57,8 +61,13 @@ public class Spider implements Runnable, Task {
return
this
;
}
public
Spider
schedular
(
Schedular
schedular
)
{
this
.
schedular
=
schedular
;
public
Spider
setUUID
(
String
uuid
)
{
this
.
uuid
=
uuid
;
return
this
;
}
public
Spider
schedular
(
Scheduler
scheduler
)
{
this
.
scheduler
=
scheduler
;
return
this
;
}
...
...
@@ -71,9 +80,9 @@ public class Spider implements Runnable, Task {
@Override
public
void
run
()
{
for
(
String
startUrl
:
startUrls
)
{
schedul
a
r
.
push
(
new
Request
(
startUrl
),
this
);
schedul
e
r
.
push
(
new
Request
(
startUrl
),
this
);
}
Request
request
=
schedul
a
r
.
poll
(
this
);
Request
request
=
schedul
e
r
.
poll
(
this
);
if
(
pipelines
.
isEmpty
())
{
pipelines
.
add
(
new
ConsolePipeline
());
}
...
...
@@ -89,16 +98,10 @@ public class Spider implements Runnable, Task {
pipeline
.
process
(
page
,
this
);
}
sleep
(
site
.
getSleepTime
());
request
=
schedul
a
r
.
poll
(
this
);
request
=
schedul
e
r
.
poll
(
this
);
}
}
public
Spider
setUUID
(
String
uuid
)
{
this
.
uuid
=
uuid
;
return
this
;
}
private
void
sleep
(
int
time
)
{
try
{
Thread
.
sleep
(
time
);
...
...
@@ -110,7 +113,7 @@ public class Spider implements Runnable, Task {
private
void
addRequest
(
Page
page
)
{
if
(
CollectionUtils
.
isNotEmpty
(
page
.
getTargetRequests
()))
{
for
(
Request
request
:
page
.
getTargetRequests
())
{
schedul
a
r
.
push
(
request
,
this
);
schedul
e
r
.
push
(
request
,
this
);
}
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedul
a
r.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedul
e
r.java
View file @
7bed01c9
...
...
@@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Date: 13-4-21
* Time: 下午1:13
*/
public
class
FileCacheQueueSchedul
ar
implements
Schedula
r
{
public
class
FileCacheQueueSchedul
er
implements
Schedule
r
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
...
...
@@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular {
private
Set
<
String
>
urls
;
public
FileCacheQueueSchedul
a
r
(
String
filePath
)
{
public
FileCacheQueueSchedul
e
r
(
String
filePath
)
{
this
.
filePath
=
filePath
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedul
a
r.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedul
e
r.java
View file @
7bed01c9
...
...
@@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* Date: 13-4-21
* Time: 下午1:13
*/
public
class
QueueSchedul
ar
implements
Schedula
r
{
public
class
QueueSchedul
er
implements
Schedule
r
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedul
a
r.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedul
e
r.java
View file @
7bed01c9
...
...
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task;
* Date: 13-4-21
* Time: 下午1:12
*/
public
interface
Schedul
a
r
{
public
interface
Schedul
e
r
{
public
void
push
(
Request
request
,
Task
task
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
7bed01c9
...
...
@@ -5,8 +5,8 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午7:54
*
Date: 13-4-21
*
Time: 上午7:54
*/
public
class
Html
extends
PlainText
{
...
...
@@ -18,12 +18,16 @@ public class Html extends PlainText {
super
(
text
);
}
public
static
Html
create
(
String
text
)
{
return
new
Html
(
text
);
}
@Override
protected
Selectable
select
(
Selector
selector
,
List
<
String
>
strings
)
{
List
<
String
>
results
=
new
ArrayList
<
String
>();
for
(
String
string
:
strings
)
{
String
result
=
selector
.
select
(
string
);
if
(
result
!=
null
)
{
if
(
result
!=
null
)
{
results
.
add
(
result
);
}
}
...
...
@@ -43,13 +47,13 @@ public class Html extends PlainText {
@Override
public
Selectable
smartContent
()
{
SmartContentSelector
smartContentSelector
=
SelectorFactory
.
getInstatnce
().
newSmartContentSelector
();
return
select
(
smartContentSelector
,
strings
);
return
select
(
smartContentSelector
,
strings
);
}
@Override
public
Selectable
links
()
{
XpathSelector
xpathSelector
=
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
return
selectList
(
xpathSelector
,
strings
);
}
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
7bed01c9
...
...
@@ -24,6 +24,10 @@ public class PlainText implements Selectable {
this
.
strings
=
results
;
}
public
static
PlainText
create
(
String
text
)
{
return
new
PlainText
(
text
);
}
@Override
public
Selectable
xpath
(
String
xpath
)
{
throw
new
UnsupportedOperationException
();
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
View file @
7bed01c9
...
...
@@ -15,7 +15,7 @@ public class HttpClientDownloaderTest {
@Test
public
void
testCookie
()
{
Site
site
=
Site
.
me
().
setDomain
(
"www.diandian.com"
).
set
Cookie
(
"t"
,
"yct7q7e6v319wpg4cpxqduu5m77lcgix"
);
Site
site
=
Site
.
me
().
setDomain
(
"www.diandian.com"
).
add
Cookie
(
"t"
,
"yct7q7e6v319wpg4cpxqduu5m77lcgix"
);
HttpClientDownloader
httpClientDownloader
=
new
HttpClientDownloader
();
Page
download
=
httpClientDownloader
.
download
(
new
Request
(
"http://www.diandian.com"
),
site
);
Assert
.
assertTrue
(
download
.
getHtml
().
toString
().
contains
(
"flashsword30"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
View file @
7bed01c9
...
...
@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public
static
void
main
(
String
[]
args
)
{
DianpingProcessor
dianpingProcessor
=
new
DianpingProcessor
();
Spider
.
me
().
processor
(
dianpingProcessor
).
startUrl
(
"http://www.dianping.com/shanghai/food"
).
run
();
Spider
.
create
(
dianpingProcessor
).
startUrl
(
"http://www.dianping.com/shanghai/food"
).
run
();
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
View file @
7bed01c9
...
...
@@ -5,7 +5,7 @@ import org.junit.Test;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.SimplePageProcessor
;
import
us.codecraft.webmagic.samples.HuxiuProcessor
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
a
r
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
e
r
;
/**
* @author code4crafter@gmail.com <br>
...
...
@@ -18,7 +18,7 @@ public class SpiderTest {
@Ignore
@Test
public
void
testSpider
()
throws
InterruptedException
{
Spider
me
=
Spider
.
me
().
pipeline
(
new
FilePipeline
()).
processor
(
new
HuxiuProcessor
());
Spider
me
=
Spider
.
create
(
new
HuxiuProcessor
()).
pipeline
(
new
FilePipeline
());
me
.
run
();
}
...
...
@@ -26,13 +26,13 @@ public class SpiderTest {
@Test
public
void
testGlobalSpider
(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedul
a
r(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedul
e
r(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor
pageProcessor2
=
new
SimplePageProcessor
(
"http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"
,
"http://www.diaoyuweng.com/thread-*-1-1.html"
);
System
.
out
.
println
(
pageProcessor2
.
getSite
().
getEncoding
());
pageProcessor2
.
getSite
().
setSleepTime
(
500
);
Spider
.
me
().
pipeline
(
new
FilePipeline
()).
schedular
(
new
FileCacheQueueSchedula
r
(
"/data/temp/webmagic/cache/"
)).
processor
(
pageProcessor2
).
run
();
Spider
.
create
(
pageProcessor2
).
pipeline
(
new
FilePipeline
()).
schedular
(
new
FileCacheQueueSchedule
r
(
"/data/temp/webmagic/cache/"
)).
run
();
}
...
...
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java
View file @
7bed01c9
...
...
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.FreemarkerPipeline
;
import
us.codecraft.webmagic.samples.DiandianBlogProcessor
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
a
r
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
e
r
;
import
java.io.IOException
;
...
...
@@ -30,7 +30,7 @@ public class DiandianProcessorTest {
//ConsolePipeline输出结果到控制台
//FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
Spider
.
me
().
pipeline
(
new
ConsolePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedula
r
(
"/data/temp/webmagic/cache/"
)).
processor
(
diaoyuwengProcessor
).
run
();
Spider
.
create
(
diaoyuwengProcessor
).
pipeline
(
new
ConsolePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedule
r
(
"/data/temp/webmagic/cache/"
)).
run
();
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
View file @
7bed01c9
...
...
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.pipeline.FreemarkerPipeline
;
import
us.codecraft.webmagic.samples.DiaoyuwengProcessor
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
a
r
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
e
r
;
import
java.io.IOException
;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-6-9
* Time: 上午8:02
*
Date: 13-6-9
*
Time: 上午8:02
*/
public
class
DiaoyuwengProcessorTest
{
...
...
@@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest {
public
void
test
()
throws
IOException
{
DiaoyuwengProcessor
diaoyuwengProcessor
=
new
DiaoyuwengProcessor
();
FreemarkerPipeline
pipeline
=
new
FreemarkerPipeline
(
"wordpress.ftl"
);
Spider
.
me
().
pipeline
(
new
FilePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedula
r
(
"/data/temp/webmagic/cache/"
)).
processor
(
diaoyuwengProcessor
).
run
();
Spider
.
create
(
diaoyuwengProcessor
).
pipeline
(
new
FilePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedule
r
(
"/data/temp/webmagic/cache/"
)).
run
();
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
View file @
7bed01c9
...
...
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.pipeline.FreemarkerPipeline
;
import
us.codecraft.webmagic.samples.SinaBlogProcesser
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
a
r
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedul
e
r
;
import
java.io.IOException
;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-6-9
* Time: 上午8:02
*
Date: 13-6-9
*
Time: 上午8:02
*/
public
class
SinablogProcessorTest
{
...
...
@@ -30,7 +30,7 @@ public class SinablogProcessorTest {
//ConsolePipeline输出结果到控制台
//FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
Spider
.
me
().
pipeline
(
new
FilePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedula
r
(
"/data/temp/webmagic/cache/"
)).
processor
(
sinaBlogProcesser
).
run
();
Spider
.
create
(
sinaBlogProcesser
).
pipeline
(
new
FilePipeline
()).
pipeline
(
pipeline
).
schedular
(
new
FileCacheQueueSchedule
r
(
"/data/temp/webmagic/cache/"
)).
run
();
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment