Commit 42a30074 authored by yihua.huang

update urls.contains to DuplicateRemover in FileCacheQueueScheduler #157

parent 689e89a9
...@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils; ...@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.io.*; import java.io.*;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
...@@ -68,6 +69,26 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement ...@@ -68,6 +69,26 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
logger.info("init cache scheduler success"); logger.info("init cache scheduler success");
} }
/**
 * Installs a {@link DuplicateRemover} that is backed by this scheduler's
 * {@code urls} set, so duplicate detection, reset and counting all operate
 * on that single collection.
 */
private void initDuplicateRemover() {
    DuplicateRemover urlSetRemover = new DuplicateRemover() {
        @Override
        public boolean isDuplicate(Request request, Task task) {
            // Set#add reports false when the URL was already present,
            // which is exactly the "duplicate" case.
            boolean newlyAdded = urls.add(request.getUrl());
            return !newlyAdded;
        }

        @Override
        public void resetDuplicateCheck(Task task) {
            urls.clear();
        }

        @Override
        public int getTotalRequestsCount(Task task) {
            return urls.size();
        }
    };
    setDuplicateRemover(urlSetRemover);
}
private void initFlushThread() { private void initFlushThread() {
Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() { Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() {
@Override @Override
...@@ -92,6 +113,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement ...@@ -92,6 +113,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
urls = new LinkedHashSet<String>(); urls = new LinkedHashSet<String>();
readCursorFile(); readCursorFile();
readUrlFile(); readUrlFile();
initDuplicateRemover();
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
//init //init
logger.info("init cache file " + getFileName(fileUrlAllName)); logger.info("init cache file " + getFileName(fileUrlAllName));
...@@ -145,8 +167,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement ...@@ -145,8 +167,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
if (!inited.get()) { if (!inited.get()) {
init(task); init(task);
} }
if(urls.contains(request.getUrl())) //已存在此URL 表示已抓取过 跳过
return;
queue.add(request); queue.add(request);
fileUrlWriter.println(request.getUrl()); fileUrlWriter.println(request.getUrl());
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment