Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
cad2594a
Commit
cad2594a
authored
Jun 23, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add multithread support
parent
5a6a68a3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
120 additions
and
10 deletions
+120
-10
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+87
-10
ThreadUtils.java
...rc/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
+33
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
cad2594a
...
...
@@ -9,9 +9,12 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.schedular.QueueScheduler
;
import
us.codecraft.webmagic.schedular.Scheduler
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
* <pre>
...
...
@@ -51,6 +54,16 @@ public class Spider implements Runnable, Task {
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
ExecutorService
executorService
;
private
AtomicInteger
stat
=
new
AtomicInteger
(
STAT_INIT
);
private
final
static
int
STAT_INIT
=
0
;
private
final
static
int
STAT_RUNNING
=
1
;
private
final
static
int
STAT_STOPPED
=
2
;
/**
* 使用已定义的抽取规则新建一个Spider。
* @param pageProcessor 已定义的抽取规则
...
...
@@ -76,6 +89,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public
Spider
startUrls
(
List
<
String
>
startUrls
)
{
checkIfNotRunning
();
this
.
startUrls
=
startUrls
;
return
this
;
}
...
...
@@ -96,6 +110,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public
Spider
scheduler
(
Scheduler
scheduler
)
{
checkIfNotRunning
();
this
.
scheduler
=
scheduler
;
return
this
;
}
...
...
@@ -106,6 +121,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public
Spider
pipeline
(
Pipeline
pipeline
)
{
checkIfNotRunning
();
this
.
pipelines
.
add
(
pipeline
);
return
this
;
}
...
...
@@ -113,6 +129,9 @@ public class Spider implements Runnable, Task {
@Override
public
void
run
()
{
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_RUNNING
))
{
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
if
(
startUrls
!=
null
)
{
for
(
String
startUrl
:
startUrls
)
{
scheduler
.
push
(
new
Request
(
startUrl
),
this
);
...
...
@@ -122,20 +141,56 @@ public class Spider implements Runnable, Task {
if
(
pipelines
.
isEmpty
())
{
pipelines
.
add
(
new
ConsolePipeline
());
}
while
(
request
!=
null
)
{
Page
page
=
downloader
.
download
(
request
,
site
);
if
(
page
=
=
null
)
{
sleep
(
site
.
getSleepTime
()
);
continue
;
//singel thread
if
(
executorService
==
null
){
while
(
request
!
=
null
)
{
processRequest
(
request
);
request
=
scheduler
.
poll
(
this
)
;
}
pageProcessor
.
process
(
page
);
addRequest
(
page
);
for
(
Pipeline
pipeline
:
pipelines
)
{
pipeline
.
process
(
page
,
this
);
}
else
{
final
AtomicInteger
threadAlive
=
new
AtomicInteger
(
0
);
while
(
true
)
{
if
(
request
==
null
)
{
try
{
Thread
.
sleep
(
100
);
}
catch
(
InterruptedException
e
)
{
}
}
else
{
final
Request
requestFinal
=
request
;
threadAlive
.
incrementAndGet
();
executorService
.
execute
(
new
Runnable
()
{
@Override
public
void
run
()
{
processRequest
(
requestFinal
);
threadAlive
.
decrementAndGet
();
}
});
}
request
=
scheduler
.
poll
(
this
);
if
(
threadAlive
.
get
()
==
0
)
{
request
=
scheduler
.
poll
(
this
);
if
(
request
==
null
)
{
break
;
}
}
}
executorService
.
shutdown
();
}
stat
.
compareAndSet
(
STAT_RUNNING
,
STAT_STOPPED
);
}
private
void
processRequest
(
Request
request
)
{
Page
page
=
downloader
.
download
(
request
,
site
);
if
(
page
==
null
)
{
sleep
(
site
.
getSleepTime
());
request
=
scheduler
.
poll
(
this
);
return
;
}
pageProcessor
.
process
(
page
);
addRequest
(
page
);
for
(
Pipeline
pipeline
:
pipelines
)
{
pipeline
.
process
(
page
,
this
);
}
sleep
(
site
.
getSleepTime
());
}
private
void
sleep
(
int
time
)
{
...
...
@@ -154,6 +209,28 @@ public class Spider implements Runnable, Task {
}
}
private
void
checkIfNotRunning
(){
if
(!
stat
.
compareAndSet
(
STAT_INIT
,
STAT_INIT
)){
throw
new
IllegalStateException
(
"Spider is already running!"
);
}
}
/**
* 建立多个线程下载
* @param threadNum 线程数
* @return
*/
public
Spider
thread
(
int
threadNum
)
{
checkIfNotRunning
();
if
(
threadNum
<=
1
)
{
throw
new
IllegalArgumentException
(
"threadNum should be more than one!"
);
}
synchronized
(
this
){
this
.
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
}
return
this
;
}
@Override
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java
0 → 100644
View file @
cad2594a
package
us
.
codecraft
.
webmagic
.
utils
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.LinkedBlockingQueue
;
import
java.util.concurrent.ThreadPoolExecutor
;
import
java.util.concurrent.TimeUnit
;
/**
* @author code4crafer@gmail.com
* Date: 13-6-23
* Time: 下午7:11
*/
public
class
ThreadUtils
{
public
static
ExecutorService
newFixedThreadPool
(
int
threadSize
)
{
return
new
ThreadPoolExecutor
(
threadSize
,
threadSize
,
0L
,
TimeUnit
.
MILLISECONDS
,
new
LinkedBlockingQueue
<
Runnable
>(
1
)
{
private
static
final
long
serialVersionUID
=
-
9028058603126367678L
;
@Override
public
boolean
offer
(
Runnable
e
)
{
try
{
put
(
e
);
return
true
;
}
catch
(
InterruptedException
ie
)
{
Thread
.
currentThread
().
interrupt
();
}
return
false
;
}
});
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment