Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
c2142f87
Commit
c2142f87
authored
Jul 26, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add iteye sample
parent
2a19d803
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
89 additions
and
14 deletions
+89
-14
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+24
-13
FilePipeline.java
...ain/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+1
-1
IteyeBlogProcessor.java
...ava/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+38
-0
log4j.xml
webmagic-samples/src/main/resources/log4j.xml
+26
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
c2142f87
...
@@ -18,23 +18,24 @@ import java.util.concurrent.atomic.AtomicInteger;
...
@@ -18,23 +18,24 @@ import java.util.concurrent.atomic.AtomicInteger;
/**
/**
* <pre>
* <pre>
*webmagic爬虫的入口类。
*
webmagic爬虫的入口类。
*
*
*示例:
*
示例:
*定义一个最简单的爬虫:
*
定义一个最简单的爬虫:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
*
*
*使用FilePipeline保存结果到文件:
*
使用FilePipeline保存结果到文件:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
*
*使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
*
使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* </pre>
* </pre>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
*
Date: 13-4-21
* Time: 上午6:53
*
Time: 上午6:53
*/
*/
public
class
Spider
implements
Runnable
,
Task
{
public
class
Spider
implements
Runnable
,
Task
{
...
@@ -66,6 +67,7 @@ public class Spider implements Runnable, Task {
...
@@ -66,6 +67,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 使用已定义的抽取规则新建一个Spider。
* 使用已定义的抽取规则新建一个Spider。
*
* @param pageProcessor 已定义的抽取规则
* @param pageProcessor 已定义的抽取规则
*/
*/
public
Spider
(
PageProcessor
pageProcessor
)
{
public
Spider
(
PageProcessor
pageProcessor
)
{
...
@@ -76,6 +78,7 @@ public class Spider implements Runnable, Task {
...
@@ -76,6 +78,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 使用已定义的抽取规则新建一个Spider。
* 使用已定义的抽取规则新建一个Spider。
*
* @param pageProcessor 已定义的抽取规则
* @param pageProcessor 已定义的抽取规则
* @return 新建的Spider
* @return 新建的Spider
*/
*/
...
@@ -85,6 +88,7 @@ public class Spider implements Runnable, Task {
...
@@ -85,6 +88,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 重新设置startUrls,会覆盖Site本身的startUrls。
* 重新设置startUrls,会覆盖Site本身的startUrls。
*
* @param startUrls
* @param startUrls
* @return this
* @return this
*/
*/
...
@@ -96,6 +100,7 @@ public class Spider implements Runnable, Task {
...
@@ -96,6 +100,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。
* 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。
*
* @param uuid 唯一ID
* @param uuid 唯一ID
* @return this
* @return this
*/
*/
...
@@ -106,6 +111,7 @@ public class Spider implements Runnable, Task {
...
@@ -106,6 +111,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。
* 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。
*
* @param scheduler 调度器
* @param scheduler 调度器
* @return this
* @return this
*/
*/
...
@@ -117,6 +123,7 @@ public class Spider implements Runnable, Task {
...
@@ -117,6 +123,7 @@ public class Spider implements Runnable, Task {
/**
/**
* 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。
* 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。
*
* @param pipeline 处理管道
* @param pipeline 处理管道
* @return this
* @return this
*/
*/
...
@@ -148,7 +155,7 @@ public class Spider implements Runnable, Task {
...
@@ -148,7 +155,7 @@ public class Spider implements Runnable, Task {
pipelines
.
add
(
new
ConsolePipeline
());
pipelines
.
add
(
new
ConsolePipeline
());
}
}
//single thread
//single thread
if
(
executorService
==
null
)
{
if
(
executorService
==
null
)
{
while
(
request
!=
null
)
{
while
(
request
!=
null
)
{
processRequest
(
request
);
processRequest
(
request
);
request
=
scheduler
.
poll
(
this
);
request
=
scheduler
.
poll
(
this
);
...
@@ -217,13 +224,13 @@ public class Spider implements Runnable, Task {
...
@@ -217,13 +224,13 @@ public class Spider implements Runnable, Task {
}
}
}
}
private
void
checkIfNotRunning
(){
private
void
checkIfNotRunning
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_INIT
))
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_INIT
))
{
throw
new
IllegalStateException
(
"Spider is already running!"
);
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
}
}
}
public
void
runAsync
(){
public
void
runAsync
()
{
Thread
thread
=
new
Thread
(
this
);
Thread
thread
=
new
Thread
(
this
);
thread
.
setDaemon
(
false
);
thread
.
setDaemon
(
false
);
thread
.
start
();
thread
.
start
();
...
@@ -231,15 +238,19 @@ public class Spider implements Runnable, Task {
...
@@ -231,15 +238,19 @@ public class Spider implements Runnable, Task {
/**
/**
* 建立多个线程下载
* 建立多个线程下载
*
* @param threadNum 线程数
* @param threadNum 线程数
* @return this
* @return this
*/
*/
public
Spider
thread
(
int
threadNum
)
{
public
Spider
thread
(
int
threadNum
)
{
checkIfNotRunning
();
checkIfNotRunning
();
if
(
threadNum
<=
1
)
{
if
(
threadNum
<=
0
)
{
throw
new
IllegalArgumentException
(
"threadNum should be more than one!"
);
throw
new
IllegalArgumentException
(
"threadNum should be more than one!"
);
}
}
synchronized
(
this
){
if
(
threadNum
==
1
)
{
return
this
;
}
synchronized
(
this
)
{
this
.
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
this
.
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
}
}
return
this
;
return
this
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
View file @
c2142f87
...
@@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline {
...
@@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline {
file
.
mkdirs
();
file
.
mkdirs
();
}
}
try
{
try
{
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
resultItems
.
getRequest
().
getUrl
())));
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
resultItems
.
getRequest
().
getUrl
())
+
".html"
));
printWriter
.
println
(
"url:\t"
+
resultItems
.
getRequest
().
getUrl
());
printWriter
.
println
(
"url:\t"
+
resultItems
.
getRequest
().
getUrl
());
for
(
Map
.
Entry
<
String
,
Object
>
entry
:
resultItems
.
getAll
().
entrySet
())
{
for
(
Map
.
Entry
<
String
,
Object
>
entry
:
resultItems
.
getAll
().
entrySet
())
{
printWriter
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
());
printWriter
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
());
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
0 → 100644
View file @
c2142f87
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Sample {@link PageProcessor} that crawls the blog hosted at yanghaoli.iteye.com,
 * storing the title and the extracted main content of every visited page.
 *
 * @author yihua.huang@dianping.com <br>
 * Date: 13-7-26 <br>
 * Time: 7:31 AM <br>
 */
public class IteyeBlogProcessor implements PageProcessor {

    /** Pattern a discovered link must match in order to be queued for crawling. */
    private static final String BLOG_LINK_REGEX = ".*yanghaoli\\.iteye\\.com/blog/\\d+";

    /** Domain the crawl is configured for. */
    private static final String DOMAIN = "yanghaoli.iteye.com";

    /** URL the crawl starts from. */
    private static final String START_URL = "http://yanghaoli.iteye.com/";

    /** User-Agent header value sent with the requests. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /** Site configuration; created on the first call to {@link #getSite()}. */
    private Site site;

    /**
     * Queues every link on the page that matches the blog URL pattern, then records
     * the page's {@code title} and {@code content} fields.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // Follow-up requests are registered before any field is extracted.
        page.addTargetRequests(page.getHtml().links().regex(BLOG_LINK_REGEX).all());

        page.putField("title", page.getHtml().xpath("//title").toString());
        page.putField("content", page.getHtml().smartContent().toString());
    }

    /**
     * Returns the site configuration, building it on first use and reusing the same
     * instance afterwards.
     *
     * @return the site configuration for this processor
     */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = buildSite();
        return site;
    }

    /** Assembles the crawl settings: domain, start URL, user agent, sleep time and retry count. */
    private Site buildSite() {
        return Site.me()
                .setDomain(DOMAIN)
                .addStartUrl(START_URL)
                .setUserAgent(USER_AGENT)
                .setSleepTime(100)
                .setRetryTimes(3);
    }

    /**
     * Runs the sample: a spider with 5 threads whose results go to a {@link FilePipeline}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new IteyeBlogProcessor())
                .thread(5)
                .pipeline(new FilePipeline());
        spider.run();
    }
}
webmagic-samples/src/main/resources/log4j.xml
0 → 100644
View file @
c2142f87
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration for the samples module: everything is logged to the console. -->
<log4j:configuration
xmlns:log4j=
"http://jakarta.apache.org/log4j/"
>
<!-- Console appender named "stdout"; the pattern prints timestamp, level, logger name, (source file:line), then the message. -->
<appender
name=
"stdout"
class=
"org.apache.log4j.ConsoleAppender"
>
<layout
class=
"org.apache.log4j.PatternLayout"
>
<param
name=
"ConversionPattern"
value=
"%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n"
/>
</layout>
</appender>
<!-- org.springframework loggers: warn level and above only; additivity="false" keeps events from also reaching the root logger's appender. -->
<logger
name=
"org.springframework"
additivity=
"false"
>
<level
value=
"warn"
/>
<appender-ref
ref=
"stdout"
/>
</logger>
<!-- net.sf.ehcache loggers: same treatment as above (warn level, no propagation to root). -->
<logger
name=
"net.sf.ehcache"
additivity=
"false"
>
<level
value=
"warn"
/>
<appender-ref
ref=
"stdout"
/>
</logger>
<!-- Root logger: all other loggers log at info level to the "stdout" appender. -->
<root>
<level
value=
"info"
/>
<appender-ref
ref=
"stdout"
/>
</root>
</log4j:configuration>
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment