Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
fba33087
Commit
fba33087
authored
Sep 22, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix a thread pool exception
parent
3c79d031
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
407 additions
and
404 deletions
+407
-404
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+396
-383
ThreadUtils.java
...rc/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
+8
-19
SpiderTest.java
...-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
+3
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
fba33087
...
...
@@ -21,21 +21,26 @@ import java.util.concurrent.atomic.AtomicInteger;
/**
* Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
* Pipeline.<br>
* Every module is a field of Spider. <br>
* The modules are defined in interface. <br>
* You can customize a spider with various implementations of them. <br>
* Examples: <br>
* <br>
* A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")).run();<br>
* <br>
* Store results to files by FilePipeline: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")) <br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* <br>
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* Use FileCacheQueueScheduler to store URLs and the cursor in files, so that a
* Spider can resume its state after a shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
*
* @author code4crafter@gmail.com <br>
...
...
@@ -221,8 +226,7 @@ public class Spider implements Runnable, Task {
@Override
public
void
run
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_RUNNING
)
&&
!
stat
.
compareAndSet
(
STAT_STOPPED
,
STAT_RUNNING
))
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_RUNNING
)
&&
!
stat
.
compareAndSet
(
STAT_STOPPED
,
STAT_RUNNING
))
{
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
checkComponent
();
...
...
@@ -233,7 +237,8 @@ public class Spider implements Runnable, Task {
startUrls
.
clear
();
}
Request
request
=
scheduler
.
poll
(
this
);
//single thread
logger
.
info
(
"Spider "
+
getUUID
()
+
" started!"
);
// single thread
if
(
threadNum
<=
1
)
{
while
(
request
!=
null
&&
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_RUNNING
))
{
processRequest
(
request
);
...
...
@@ -243,11 +248,12 @@ public class Spider implements Runnable, Task {
synchronized
(
this
)
{
this
.
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
}
// multi thread
// multi thread
final
AtomicInteger
threadAlive
=
new
AtomicInteger
(
0
);
while
(
true
&&
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_RUNNING
))
{
if
(
request
==
null
)
{
//when no request found but some thread is alive, sleep a while.
// when no request found but some thread is alive, sleep a
// while.
try
{
Thread
.
sleep
(
100
);
}
catch
(
InterruptedException
e
)
{
...
...
@@ -274,7 +280,7 @@ public class Spider implements Runnable, Task {
executorService
.
shutdown
();
}
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_STOPPED
);
// release some resources
// release some resources
destroy
();
}
...
...
@@ -299,7 +305,8 @@ public class Spider implements Runnable, Task {
/**
* Process specific urls without url discovering.
*
* @param urls urls to process
* @param urls
* urls to process
*/
public
void
test
(
String
...
urls
)
{
checkComponent
();
...
...
@@ -316,7 +323,7 @@ public class Spider implements Runnable, Task {
sleep
(
site
.
getSleepTime
());
return
;
}
// for cycle retry
// for cycle retry
if
(
page
.
getHtml
()
==
null
)
{
addRequest
(
page
);
sleep
(
site
.
getSleepTime
());
...
...
@@ -365,9 +372,15 @@ public class Spider implements Runnable, Task {
}
public
void
stop
()
{
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_STOPPED
);
if
(
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_STOPPED
))
{
if
(
executorService
!=
null
)
{
executorService
.
shutdown
();
}
logger
.
info
(
"Spider "
+
getUUID
()
+
" stop success!"
);
}
else
{
logger
.
info
(
"Spider "
+
getUUID
()
+
" stop fail!"
);
}
}
public
void
stopAndDestroy
()
{
stop
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
View file @
fba33087
package
us
.
codecraft
.
webmagic
.
utils
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.
LinkedBlocking
Queue
;
import
java.util.concurrent.
Synchronous
Queue
;
import
java.util.concurrent.ThreadPoolExecutor
;
import
java.util.concurrent.TimeUnit
;
...
...
@@ -12,21 +12,10 @@ import java.util.concurrent.TimeUnit;
public
class
ThreadUtils
{
public
static
ExecutorService
newFixedThreadPool
(
int
threadSize
)
{
return
new
ThreadPoolExecutor
(
threadSize
,
threadSize
,
0L
,
TimeUnit
.
MILLISECONDS
,
new
LinkedBlockingQueue
<
Runnable
>(
1
)
{
private
static
final
long
serialVersionUID
=
-
9028058603126367678L
;
@Override
public
boolean
offer
(
Runnable
e
)
{
try
{
put
(
e
);
return
true
;
}
catch
(
InterruptedException
ie
)
{
Thread
.
currentThread
().
interrupt
();
}
return
false
;
if
(
threadSize
<=
1
)
{
throw
new
IllegalArgumentException
(
"ThreadSize must be greater than 1!"
);
}
});
return
new
ThreadPoolExecutor
(
threadSize
-
1
,
threadSize
-
1
,
0L
,
TimeUnit
.
MILLISECONDS
,
new
SynchronousQueue
<
Runnable
>(),
new
ThreadPoolExecutor
.
CallerRunsPolicy
());
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
View file @
fba33087
...
...
@@ -18,11 +18,12 @@ public class SpiderTest {
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
System
.
out
.
println
(
1
);
}
});
})
.
thread
(
2
)
;
spider
.
start
();
Thread
.
sleep
(
10000
);
spider
.
stop
();
// spider.run();
Thread
.
sleep
(
10000
);
spider
.
start
();
Thread
.
sleep
(
10000
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment