Commit 4f22f121 authored by yihua.huang

some bug fix #118

parent 186b9051
...@@ -5,6 +5,7 @@ import org.slf4j.LoggerFactory; ...@@ -5,6 +5,7 @@ import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
/** /**
* Remove duplicate urls and only push urls which are not duplicate.<br></br> * Remove duplicate urls and only push urls which are not duplicate.<br></br>
...@@ -16,7 +17,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { ...@@ -16,7 +17,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
protected Logger logger = LoggerFactory.getLogger(getClass()); protected Logger logger = LoggerFactory.getLogger(getClass());
private DuplicateRemover duplicatedRemover; private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
public DuplicateRemover getDuplicateRemover() { public DuplicateRemover getDuplicateRemover() {
return duplicatedRemover; return duplicatedRemover;
......
...@@ -43,7 +43,7 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover { ...@@ -43,7 +43,7 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
public boolean isDuplicate(Request request, Task task) { public boolean isDuplicate(Request request, Task task) {
boolean isDuplicate = bloomFilter.mightContain(request.getUrl()); boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
if (!isDuplicate) { if (!isDuplicate) {
bloomFilter.apply(request.getUrl()); bloomFilter.put(request.getUrl());
counter.incrementAndGet(); counter.incrementAndGet();
} }
return isDuplicate; return isDuplicate;
......
...@@ -17,11 +17,11 @@ public class BloomFilterDuplicateRemoverTest { ...@@ -17,11 +17,11 @@ public class BloomFilterDuplicateRemoverTest {
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
assertThat(isDuplicate).isFalse(); assertThat(isDuplicate).isFalse();
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
assertThat(isDuplicate); assertThat(isDuplicate).isTrue();
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
assertThat(isDuplicate).isFalse(); assertThat(isDuplicate).isFalse();
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
assertThat(isDuplicate); assertThat(isDuplicate).isTrue();
} }
} }
...@@ -3,9 +3,12 @@ package us.codecraft.webmagic.samples; ...@@ -3,9 +3,12 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import javax.management.JMException;
import java.util.List; import java.util.List;
/** /**
...@@ -30,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -30,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) throws JMException {
Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
SpiderMonitor.instance().register(spider);
spider.run();
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment