Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
ef4cf49f
Commit
ef4cf49f
authored
Sep 06, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add stop method to spider #24
parent
ac4cd391
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
71 additions
and
26 deletions
+71
-26
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+43
-26
SpiderTest.java
...-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
+28
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
ef4cf49f
...
...
@@ -79,22 +79,22 @@ public class Spider implements Runnable, Task {
* create a spider with pageProcessor.
*
* @param pageProcessor
* @return new spider
* @see PageProcessor
*/
public
Spider
(
PageProcessor
pageProcessor
)
{
this
.
pageProcessor
=
pageProcessor
;
this
.
site
=
pageProcessor
.
getSite
();
this
.
startUrls
=
pageProcessor
.
getSite
().
getStartUrls
();
public
static
Spider
create
(
PageProcessor
pageProcessor
)
{
return
new
Spider
(
pageProcessor
);
}
/**
* create a spider with pageProcessor.
*
* @param pageProcessor
* @return new spider
* @see PageProcessor
*/
public
static
Spider
create
(
PageProcessor
pageProcessor
)
{
return
new
Spider
(
pageProcessor
);
public
Spider
(
PageProcessor
pageProcessor
)
{
this
.
pageProcessor
=
pageProcessor
;
this
.
site
=
pageProcessor
.
getSite
();
this
.
startUrls
=
pageProcessor
.
getSite
().
getStartUrls
();
}
/**
...
...
@@ -105,7 +105,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public
Spider
startUrls
(
List
<
String
>
startUrls
)
{
checkIf
Not
Running
();
checkIfRunning
();
this
.
startUrls
=
startUrls
;
return
this
;
}
...
...
@@ -139,11 +139,11 @@ public class Spider implements Runnable, Task {
*
* @param scheduler
* @return this
* @since 0.2.1
* @see Scheduler
* @since 0.2.1
*/
public
Spider
setScheduler
(
Scheduler
scheduler
)
{
checkIf
Not
Running
();
checkIfRunning
();
this
.
scheduler
=
scheduler
;
return
this
;
}
...
...
@@ -153,8 +153,8 @@ public class Spider implements Runnable, Task {
*
* @param pipeline
* @return this
* @deprecated
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
public
Spider
pipeline
(
Pipeline
pipeline
)
{
return
addPipeline
(
pipeline
);
...
...
@@ -165,11 +165,11 @@ public class Spider implements Runnable, Task {
*
* @param pipeline
* @return this
* @since 0.2.1
* @see Pipeline
* @since 0.2.1
*/
public
Spider
addPipeline
(
Pipeline
pipeline
)
{
checkIf
Not
Running
();
checkIfRunning
();
this
.
pipelines
.
add
(
pipeline
);
return
this
;
}
...
...
@@ -189,8 +189,8 @@ public class Spider implements Runnable, Task {
*
* @param downloader
* @return this
* @deprecated
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
public
Spider
downloader
(
Downloader
downloader
)
{
return
setDownloader
(
downloader
);
...
...
@@ -198,12 +198,13 @@ public class Spider implements Runnable, Task {
/**
* set the downloader of spider
*
@see Downloader
*
* @param downloader
* @return this
* @see Downloader
*/
public
Spider
setDownloader
(
Downloader
downloader
)
{
checkIf
Not
Running
();
checkIfRunning
();
this
.
downloader
=
downloader
;
return
this
;
}
...
...
@@ -220,7 +221,8 @@ public class Spider implements Runnable, Task {
@Override
public
void
run
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_RUNNING
))
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_RUNNING
)
&&
!
stat
.
compareAndSet
(
STAT_STOPPED
,
STAT_RUNNING
))
{
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
checkComponent
();
...
...
@@ -228,18 +230,19 @@ public class Spider implements Runnable, Task {
for
(
String
startUrl
:
startUrls
)
{
scheduler
.
push
(
new
Request
(
startUrl
),
this
);
}
startUrls
.
clear
();
}
Request
request
=
scheduler
.
poll
(
this
);
//sing
el
thread
//sing
le
thread
if
(
executorService
==
null
)
{
while
(
request
!=
null
)
{
while
(
request
!=
null
&&
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_RUNNING
)
)
{
processRequest
(
request
);
request
=
scheduler
.
poll
(
this
);
}
}
else
{
//multi thread
final
AtomicInteger
threadAlive
=
new
AtomicInteger
(
0
);
while
(
true
)
{
while
(
true
&&
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_RUNNING
)
)
{
if
(
request
==
null
)
{
//when no request found but some thread is alive, sleep a while.
try
{
...
...
@@ -311,7 +314,7 @@ public class Spider implements Runnable, Task {
return
;
}
//for cycle retry
if
(
page
.
getHtml
()
==
null
)
{
if
(
page
.
getHtml
()
==
null
)
{
addRequest
(
page
);
sleep
(
site
.
getSleepTime
());
return
;
...
...
@@ -342,8 +345,8 @@ public class Spider implements Runnable, Task {
}
}
protected
void
checkIf
Not
Running
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_INIT
))
{
protected
void
checkIfRunning
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_INIT
)
&&
!
stat
.
compareAndSet
(
STAT_STOPPED
,
STAT_STOPPED
)
)
{
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
}
...
...
@@ -354,6 +357,19 @@ public class Spider implements Runnable, Task {
thread
.
start
();
}
/**
 * Starts the spider by delegating to {@link #runAsync()}.
 */
public void start() {
    runAsync();
}
/**
 * Requests the spider to stop.
 * <p>
 * Uses a compare-and-set on {@code stat} to switch it from {@code STAT_RUNNING}
 * to {@code STAT_STOPPED}; if the spider is not currently in
 * {@code STAT_RUNNING} the state is left untouched. The result of the
 * compare-and-set is intentionally not inspected.
 */
public void stop() {
    stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
}
/**
 * Requests the spider to stop and then calls {@code destroy()}.
 * <p>
 * The state switch from {@code STAT_RUNNING} to {@code STAT_STOPPED} is
 * attempted first; {@code destroy()} is invoked afterwards regardless of
 * whether that switch actually happened.
 */
public void stopAndDestroy() {
    stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
    destroy();
}
/**
* start with more than one thread
*
...
...
@@ -361,7 +377,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public
Spider
thread
(
int
threadNum
)
{
checkIf
Not
Running
();
checkIfRunning
();
this
.
threadNum
=
threadNum
;
if
(
threadNum
<=
0
)
{
throw
new
IllegalArgumentException
(
"threadNum should be more than one!"
);
...
...
@@ -377,9 +393,10 @@ public class Spider implements Runnable, Task {
/**
* switch off xsoup
*
* @return
*/
public
static
void
xsoupOff
(){
public
static
void
xsoupOff
()
{
EnvironmentUtil
.
setUseXsoup
(
false
);
}
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
0 → 100644
View file @
ef4cf49f
package
us
.
codecraft
.
webmagic
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.processor.SimplePageProcessor
;
/**
 * Manual start/stop smoke test for {@link Spider}.
 *
 * @author code4crafter@gmail.com
 */
public class SpiderTest {

    /**
     * Starts a spider for http://www.oschina.net/, sleeps for 10000 ms, stops the
     * spider and then sleeps for another 10000 ms. The test makes no assertions and
     * is ignored by default because of its long running time.
     *
     * @throws InterruptedException if one of the sleeps is interrupted
     */
    @Ignore("long time")
    @Test
    public void testStartAndStop() throws InterruptedException {
        // Pipeline that only prints a marker for every result it receives.
        Pipeline printingPipeline = new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                System.out.println(1);
            }
        };
        SimplePageProcessor processor =
                new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*");
        Spider spider = Spider.create(processor).addPipeline(printingPipeline);

        spider.start();
        Thread.sleep(10000);

        spider.stop();
        Thread.sleep(10000);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment