Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
11041229
Commit
11041229
authored
Apr 27, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
more abstraction in scheduler
parent
b0fb1c3e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
113 additions
and
16 deletions
+113
-16
DuplicatedRemoveScheduler.java
...decraft/webmagic/scheduler/DuplicatedRemoveScheduler.java
+45
-0
LocalDuplicatedRemoveScheduler.java
...ft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java
+34
-0
PriorityScheduler.java
...va/us/codecraft/webmagic/scheduler/PriorityScheduler.java
+1
-1
QueueScheduler.java
.../java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+1
-1
FileCacheQueueScheduler.java
...codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+1
-1
RedisScheduler.java
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+31
-13
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/
LocalDuplicatedRemoved
Scheduler.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/
DuplicatedRemove
Scheduler.java
View file @
11041229
package
us
.
codecraft
.
webmagic
.
scheduler
;
import
com.google.common.collect.Sets
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
java.util.Set
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
*
Base Scheduler with duplicated urls removed locally.
*
Remove duplicate urls and only push urls which are not duplicate.<br></br>
*
* @author code4craf
t
er@gmail.com
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public
abstract
class
LocalDuplicatedRemovedScheduler
implements
Monitorable
Scheduler
{
public
abstract
class
DuplicatedRemoveScheduler
implements
Scheduler
{
protected
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
private
Set
<
String
>
urls
=
Sets
.
newSetFromMap
(
new
ConcurrentHashMap
<
String
,
Boolean
>());
@Override
public
void
push
(
Request
request
,
Task
task
)
{
logger
.
trace
(
"get a candidate url {}"
,
request
.
getUrl
());
if
(
isDuplicate
(
request
)
||
shouldReserved
(
request
))
{
if
(
isDuplicate
(
request
,
task
)
||
shouldReserved
(
request
))
{
logger
.
debug
(
"push to queue {}"
,
request
.
getUrl
());
pushWhenNoDuplicate
(
request
,
task
);
}
}
protected
boolean
isDuplicate
(
Request
request
)
{
return
urls
.
add
(
request
.
getUrl
());
}
/**
* Reset duplicate check.
*/
public
abstract
void
resetDuplicateCheck
(
Task
task
);
/**
* @param request
* @return
*/
protected
abstract
boolean
isDuplicate
(
Request
request
,
Task
task
);
protected
boolean
shouldReserved
(
Request
request
)
{
return
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
)
!=
null
;
}
@Override
public
int
getTotalRequestsCount
(
Task
task
)
{
return
urls
.
size
();
}
protected
void
pushWhenNoDuplicate
(
Request
request
,
Task
task
)
{
protected
abstract
void
pushWhenNoDuplicate
(
Request
request
,
Task
task
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java
0 → 100644
View file @
11041229
package
us
.
codecraft
.
webmagic
.
scheduler
;
import
com.google.common.collect.Sets
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
java.util.Set
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Base scheduler that tracks already-seen urls in an in-memory, concurrent hash set.
 *
 * <p>The set is backed by a {@link ConcurrentHashMap}; every method here ignores the
 * {@link Task} argument and operates on that single shared set.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler
        implements MonitorableScheduler {

    /** Urls recorded so far by {@link #isDuplicate(Request, Task)}. */
    private final Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    /**
     * Forgets every recorded url.
     *
     * @param task not used by this implementation
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        urls.clear();
    }

    /**
     * Records the url of the request in the local set.
     *
     * <p>NOTE: the value returned is the result of {@link Set#add(Object)}, i.e. it is
     * {@code true} when the url was <em>not</em> present before this call and
     * {@code false} when it had already been recorded.
     *
     * @param request request whose url is recorded
     * @param task    not used by this implementation
     * @return {@code true} if the url was newly added to the set, {@code false} otherwise
     */
    @Override
    protected boolean isDuplicate(Request request, Task task) {
        final String url = request.getUrl();
        return urls.add(url);
    }

    /**
     * Reports how many distinct urls have been recorded.
     *
     * @param task not used by this implementation
     * @return current size of the url set
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        final int recorded = urls.size();
        return recorded;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
View file @
11041229
...
...
@@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
* @since 0.2.1
*/
@ThreadSafe
public
class
PriorityScheduler
extends
LocalDuplicatedRemove
d
Scheduler
{
public
class
PriorityScheduler
extends
LocalDuplicatedRemoveScheduler
{
public
static
final
int
INITIAL_CAPACITY
=
5
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
View file @
11041229
...
...
@@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* @since 0.1.0
*/
@ThreadSafe
public
class
QueueScheduler
extends
LocalDuplicatedRemove
d
Scheduler
{
public
class
QueueScheduler
extends
LocalDuplicatedRemoveScheduler
{
private
BlockingQueue
<
Request
>
queue
=
new
LinkedBlockingQueue
<
Request
>();
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
View file @
11041229
...
...
@@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
public
class
FileCacheQueueScheduler
extends
LocalDuplicatedRemove
d
Scheduler
{
public
class
FileCacheQueueScheduler
extends
LocalDuplicatedRemoveScheduler
{
private
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
View file @
11041229
...
...
@@ -14,7 +14,7 @@ import us.codecraft.webmagic.Task;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
public
class
RedisScheduler
implements
MonitorableScheduler
{
public
class
RedisScheduler
extends
DuplicatedRemoveScheduler
implements
MonitorableScheduler
{
private
JedisPool
pool
;
...
...
@@ -33,21 +33,39 @@ public class RedisScheduler implements MonitorableScheduler {
}
@Override
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
public
void
resetDuplicateCheck
(
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
try
{
// if cycleRetriedTimes is set, allow duplicated.
Object
cycleRetriedTimes
=
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
);
// use set to remove duplicate url
if
(
cycleRetriedTimes
!=
null
||
!
jedis
.
sismember
(
getSetKey
(
task
),
request
.
getUrl
()))
{
// use list to store queue
jedis
.
rpush
(
getQueueKey
(
task
),
request
.
getUrl
());
jedis
.
del
(
getSetKey
(
task
));
}
finally
{
pool
.
returnResource
(
jedis
);
}
}
@Override
protected
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
try
{
boolean
isDuplicate
=
!
jedis
.
sismember
(
getSetKey
(
task
),
request
.
getUrl
());
if
(!
isDuplicate
)
{
jedis
.
sadd
(
getSetKey
(
task
),
request
.
getUrl
());
if
(
request
.
getExtras
()
!=
null
)
{
String
field
=
DigestUtils
.
shaHex
(
request
.
getUrl
());
String
value
=
JSON
.
toJSONString
(
request
);
jedis
.
hset
((
ITEM_PREFIX
+
task
.
getUUID
()),
field
,
value
);
}
}
return
isDuplicate
;
}
finally
{
pool
.
returnResource
(
jedis
);
}
}
@Override
protected
void
pushWhenNoDuplicate
(
Request
request
,
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
try
{
jedis
.
rpush
(
getQueueKey
(
task
),
request
.
getUrl
());
if
(
request
.
getExtras
()
!=
null
)
{
String
field
=
DigestUtils
.
shaHex
(
request
.
getUrl
());
String
value
=
JSON
.
toJSONString
(
request
);
jedis
.
hset
((
ITEM_PREFIX
+
task
.
getUUID
()),
field
,
value
);
}
}
finally
{
pool
.
returnResource
(
jedis
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment