Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
d1140b9e
Commit
d1140b9e
authored
May 02, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add bloom filter for scheduler #118
parent
64293cba
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
169 additions
and
32 deletions
+169
-32
DuplicateRemovedScheduler.java
...decraft/webmagic/scheduler/DuplicateRemovedScheduler.java
+13
-13
PriorityScheduler.java
...va/us/codecraft/webmagic/scheduler/PriorityScheduler.java
+6
-1
QueueScheduler.java
.../java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+6
-1
BloomFilterDuplicateRemover.java
...agic/scheduler/component/BloomFilterDuplicateRemover.java
+61
-0
DuplicateRemover.java
...ecraft/webmagic/scheduler/component/DuplicateRemover.java
+35
-0
HashSetDuplicateRemover.java
...webmagic/scheduler/component/HashSetDuplicateRemover.java
+7
-10
package.html
...va/us/codecraft/webmagic/scheduler/component/package.html
+5
-0
BloomFilterDuplicateRemoverTest.java
...t/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
+27
-0
FileCacheQueueScheduler.java
...codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+6
-5
RedisScheduler.java
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+3
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Duplicate
dRemove
Scheduler.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Duplicate
Removed
Scheduler.java
View file @
d1140b9e
...
...
@@ -4,6 +4,7 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.scheduler.component.DuplicateRemover
;
/**
* Removes duplicate URLs and only pushes URLs that are not duplicates.<br>
...
...
@@ -11,30 +12,29 @@ import us.codecraft.webmagic.Task;
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public
abstract
class
Duplicate
dRemove
Scheduler
implements
Scheduler
{
public
abstract
class
Duplicate
Removed
Scheduler
implements
Scheduler
{
protected
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
private
DuplicateRemover
duplicatedRemover
;
public
DuplicateRemover
getDuplicateRemover
()
{
return
duplicatedRemover
;
}
public
void
setDuplicateRemover
(
DuplicateRemover
duplicatedRemover
)
{
this
.
duplicatedRemover
=
duplicatedRemover
;
}
@Override
public
void
push
(
Request
request
,
Task
task
)
{
logger
.
trace
(
"get a candidate url {}"
,
request
.
getUrl
());
if
(!
isDuplicate
(
request
,
task
)
||
shouldReserved
(
request
))
{
if
(!
duplicatedRemover
.
isDuplicate
(
request
,
task
)
||
shouldReserved
(
request
))
{
logger
.
debug
(
"push to queue {}"
,
request
.
getUrl
());
pushWhenNoDuplicate
(
request
,
task
);
}
}
/**
* Reset duplicate check.
*/
public
abstract
void
resetDuplicateCheck
(
Task
task
);
/**
* @param request
* @return
*/
protected
abstract
boolean
isDuplicate
(
Request
request
,
Task
task
);
protected
boolean
shouldReserved
(
Request
request
)
{
return
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
)
!=
null
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
View file @
d1140b9e
...
...
@@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
* @since 0.2.1
*/
@ThreadSafe
public
class
PriorityScheduler
extends
LocalDuplicatedRemov
eScheduler
{
public
class
PriorityScheduler
extends
DuplicateRemovedScheduler
implements
Monitorabl
eScheduler
{
public
static
final
int
INITIAL_CAPACITY
=
5
;
...
...
@@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
public
int
getLeftRequestsCount
(
Task
task
)
{
return
noPriorityQueue
.
size
();
}
/**
 * Reports the total request count for the given task by asking the duplicate
 * remover returned from {@code getDuplicateRemover()}.
 *
 * @param task the task whose total is requested; passed through unchanged
 * @return the value returned by the duplicate remover's {@code getTotalRequestsCount(task)}
 */
@Override
public int getTotalRequestsCount(Task task) {
    return this.getDuplicateRemover().getTotalRequestsCount(task);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
View file @
d1140b9e
...
...
@@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* @since 0.1.0
*/
@ThreadSafe
public
class
QueueScheduler
extends
LocalDuplicatedRemov
eScheduler
{
public
class
QueueScheduler
extends
DuplicateRemovedScheduler
implements
Monitorabl
eScheduler
{
private
BlockingQueue
<
Request
>
queue
=
new
LinkedBlockingQueue
<
Request
>();
...
...
@@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
public
int
getLeftRequestsCount
(
Task
task
)
{
return
queue
.
size
();
}
/**
 * Reports the total request count for the given task by asking the duplicate
 * remover returned from {@code getDuplicateRemover()}.
 *
 * @param task the task whose total is requested; passed through unchanged
 * @return the value returned by the duplicate remover's {@code getTotalRequestsCount(task)}
 */
@Override
public int getTotalRequestsCount(Task task) {
    return this.getDuplicateRemover().getTotalRequestsCount(task);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
0 → 100644
View file @
d1140b9e
package
us
.
codecraft
.
webmagic
.
scheduler
.
component
;
import
com.google.common.hash.BloomFilter
;
import
com.google.common.hash.Funnels
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
java.nio.charset.Charset
;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
 * BloomFilterDuplicateRemover for huge number of urls.
 * <p>
 * Seen URLs are recorded in a Guava {@link BloomFilter} that is sized up front from
 * {@code expectedInsertions} and {@code fpp}. Because a Bloom filter is probabilistic, a URL
 * that was never seen may occasionally be reported as a duplicate.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.1
 */
public class BloomFilterDuplicateRemover implements DuplicateRemover {

    /** Number of insertions the filter is created for. */
    private final int expectedInsertions;

    /** False-positive probability the filter is created with. */
    private final double fpp;

    /** Counts URLs accepted as new; replaced whenever the filter is rebuilt. */
    private AtomicInteger counter;

    /** Current filter. Not final, so that {@link #resetDuplicateCheck(Task)} can swap in a fresh one. */
    private BloomFilter<CharSequence> bloomFilter;

    /**
     * Creates a remover with a false-positive probability of 0.03.
     *
     * @param expectedInsertions number of insertions the filter is created for
     */
    public BloomFilterDuplicateRemover(int expectedInsertions) {
        this(expectedInsertions, 0.03);
    }

    /**
     * Creates a remover with an explicit false-positive probability.
     *
     * @param expectedInsertions number of insertions the filter is created for
     * @param fpp                false-positive probability handed to the filter
     */
    public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
        this.expectedInsertions = expectedInsertions;
        this.fpp = fpp;
        this.bloomFilter = rebuildBloomFilter();
    }

    /**
     * Builds an empty filter from the configured parameters and resets the counter to zero.
     * The caller is responsible for storing the returned filter.
     *
     * @return a new, empty Bloom filter
     */
    protected BloomFilter<CharSequence> rebuildBloomFilter() {
        counter = new AtomicInteger(0);
        return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
    }

    /**
     * Checks whether the request's URL has (probably) been seen before and, if not, records it.
     *
     * @param request request whose URL is checked
     * @param task    not used by this implementation
     * @return {@code true} if the filter already matched the URL, {@code false} if it was newly recorded
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        // put() actually inserts the URL (apply() is only a Predicate alias for mightContain()).
        // It returns true when the filter's bits changed, i.e. the URL was not matched before.
        boolean added = bloomFilter.put(request.getUrl());
        if (added) {
            counter.incrementAndGet();
        }
        return !added;
    }

    /**
     * Forgets every recorded URL by replacing the filter with an empty one; the counter is
     * reset as part of the rebuild.
     *
     * @param task not used by this implementation
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        // Keep the rebuilt filter; discarding it would reset the counter but leave old URLs in place.
        bloomFilter = rebuildBloomFilter();
    }

    /**
     * Returns how many URLs were accepted as new since the filter was last built.
     *
     * @param task not used by this implementation
     * @return current counter value
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return counter.get();
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
0 → 100644
View file @
d1140b9e
package
us
.
codecraft
.
webmagic
.
scheduler
.
component
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
/**
 * Remove duplicate requests.
 * <p>
 * Strategy used to decide whether a request has already been handled, so that the
 * duplicate check can be swapped independently of the component that calls it.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.1
 */
public interface DuplicateRemover {

    /**
     * Check whether the request is duplicate.
     *
     * @param request the request to check
     * @param task    the task the request belongs to
     * @return {@code true} if the request is considered a duplicate, {@code false} otherwise
     */
    boolean isDuplicate(Request request, Task task);

    /**
     * Reset duplicate check.
     *
     * @param task the task whose duplicate check is reset
     */
    void resetDuplicateCheck(Task task);

    /**
     * Get TotalRequestsCount for monitor.
     *
     * @param task the task whose total is requested
     * @return the total request count reported for monitoring
     */
    int getTotalRequestsCount(Task task);
}
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/
LocalDuplicatedRemoveSchedul
er.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/
component/HashSetDuplicateRemov
er.java
View file @
d1140b9e
package
us
.
codecraft
.
webmagic
.
scheduler
;
package
us
.
codecraft
.
webmagic
.
scheduler
.
component
;
import
com.google.common.collect.Sets
;
import
us.codecraft.webmagic.Request
;
...
...
@@ -8,23 +8,20 @@ import java.util.Set;
import
java.util.concurrent.ConcurrentHashMap
;
/**
* Base Scheduler with duplicated urls removed by hash set.<br></br>
*
* @author code4crafter@gmail.com
* @since 0.5.0
* @author code4crafer@gmail.com
*/
public
abstract
class
LocalDuplicatedRemoveScheduler
extends
DuplicatedRemoveScheduler
implements
MonitorableSchedul
er
{
public
class
HashSetDuplicateRemover
implements
DuplicateRemov
er
{
private
Set
<
String
>
urls
=
Sets
.
newSetFromMap
(
new
ConcurrentHashMap
<
String
,
Boolean
>());
@Override
public
void
resetDuplicateCheck
(
Task
task
)
{
urls
.
clear
(
);
public
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
return
!
urls
.
add
(
request
.
getUrl
()
);
}
@Override
p
rotected
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
return
!
urls
.
add
(
request
.
getUrl
()
);
p
ublic
void
resetDuplicateCheck
(
Task
task
)
{
urls
.
clear
(
);
}
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
0 → 100644
View file @
d1140b9e
<html>
<body>
Components used by schedulers, such as duplicate removers.
</body>
</html>
webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
0 → 100644
View file @
d1140b9e
package
us
.
codecraft
.
webmagic
.
scheduler
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
/**
 * Checks that {@link BloomFilterDuplicateRemover} reports a URL as new the first time it is
 * seen and as a duplicate the second time.
 *
 * @author code4crafer@gmail.com
 */
public class BloomFilterDuplicateRemoverTest {

    @Test
    public void testRemove() throws Exception {
        BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);

        // First sighting of "a" is not a duplicate.
        boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
        assertThat(isDuplicate).isFalse();

        // Second sighting must be a duplicate. A bare assertThat(...) without a terminal
        // assertion verifies nothing, so the expectation is stated explicitly.
        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
        assertThat(isDuplicate).isTrue();

        // Same pair of checks for a second, distinct URL.
        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
        assertThat(isDuplicate).isFalse();

        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
        assertThat(isDuplicate).isTrue();
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
View file @
d1140b9e
...
...
@@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang3.math.NumberUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
...
...
@@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
public
class
FileCacheQueueScheduler
extends
LocalDuplicatedRemoveScheduler
{
private
Logger
logger
=
LoggerFactory
.
getLogger
(
getClass
());
public
class
FileCacheQueueScheduler
extends
DuplicateRemovedScheduler
implements
MonitorableScheduler
{
private
String
filePath
=
System
.
getProperty
(
"java.io.tmpdir"
);
...
...
@@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
public
int
getLeftRequestsCount
(
Task
task
)
{
return
queue
.
size
();
}
/**
 * Reports the total request count for the given task by asking the duplicate
 * remover returned from {@code getDuplicateRemover()}.
 *
 * @param task the task whose total is requested; passed through unchanged
 * @return the value returned by the duplicate remover's {@code getTotalRequestsCount(task)}
 */
@Override
public int getTotalRequestsCount(Task task) {
    return this.getDuplicateRemover().getTotalRequestsCount(task);
}
}
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
View file @
d1140b9e
...
...
@@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool;
import
redis.clients.jedis.JedisPoolConfig
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.scheduler.component.DuplicateRemover
;
/**
* Use Redis as url scheduler for distributed crawlers.<br>
...
...
@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
public
class
RedisScheduler
extends
Duplicate
dRemoveScheduler
implements
MonitorableSchedul
er
{
public
class
RedisScheduler
extends
Duplicate
RemovedScheduler
implements
MonitorableScheduler
,
DuplicateRemov
er
{
private
JedisPool
pool
;
...
...
@@ -43,7 +44,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
}
@Override
p
rotected
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
p
ublic
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
try
{
boolean
isDuplicate
=
jedis
.
sismember
(
getSetKey
(
task
),
request
.
getUrl
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment