Commit e8d4a9be authored by yihua.huang

fix remove duplicate error #117

parent 22652c45
...@@ -18,7 +18,7 @@ public abstract class DuplicatedRemoveScheduler implements Scheduler { ...@@ -18,7 +18,7 @@ public abstract class DuplicatedRemoveScheduler implements Scheduler {
@Override @Override
public void push(Request request, Task task) { public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl()); logger.trace("get a candidate url {}", request.getUrl());
if (isDuplicate(request, task) || shouldReserved(request)) { if (!isDuplicate(request, task) || shouldReserved(request)) {
logger.debug("push to queue {}", request.getUrl()); logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task); pushWhenNoDuplicate(request, task);
} }
......
...@@ -24,7 +24,7 @@ public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveSch ...@@ -24,7 +24,7 @@ public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveSch
@Override @Override
protected boolean isDuplicate(Request request, Task task) { protected boolean isDuplicate(Request request, Task task) {
return urls.add(request.getUrl()); return !urls.add(request.getUrl());
} }
@Override @Override
......
...@@ -46,7 +46,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor ...@@ -46,7 +46,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
protected boolean isDuplicate(Request request, Task task) { protected boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
try { try {
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl()); boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
if (!isDuplicate) { if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl()); jedis.sadd(getSetKey(task), request.getUrl());
} }
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
/**
 * Sample {@link PageProcessor} that prints the data-grid rows of an Amazon forum page.
 *
 * <p>For every matched row after the first one, the raw row markup, the text of the
 * title link, the text of the {@code num} cell and the text of the third cell are
 * written to standard output, in that order.
 *
 * @author code4crafer@gmail.com
 */
public class AmanzonPageProcessor implements PageProcessor{

    /** Selects every row of the data grid inside the community-center column. */
    private static final String ROW_XPATH =
            "//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr";

    /**
     * Prints the title, answer count and creation time of each data-grid row on the page.
     *
     * @param page the downloaded page whose HTML is scanned for data-grid rows
     */
    @Override
    public void process(Page page) {
        List<String> rows = page.getHtml().xpath(ROW_XPATH).all();
        // Row 0 holds the column names, so without a second row there is nothing to print.
        if (rows == null || rows.size() <= 1) {
            return;
        }
        for (String row : rows.subList(1, rows.size())) {
            System.out.println(row);
            // Parse the row on its own, wrapped in a <table> element.
            Html rowHtml = Html.create("<table>" + row + "</table>");
            String comment = rowHtml.xpath("//td[@class='title']//a/text()").toString();
            System.out.println(comment);
            String answerNum = rowHtml.xpath("//td[@class='num']/text()").toString();
            System.out.println(answerNum);
            String createTime = rowHtml.xpath("//td[3]/text()").toString();
            System.out.println(createTime);
        }
    }

    /**
     * Returns the site configuration used for crawling.
     *
     * @return the value of {@code Site.me()}, with no further settings applied
     */
    @Override
    public Site getSite() {
        return Site.me();
    }

    /**
     * Runs this processor against a single Amazon forum URL via {@code Spider.test}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
    }
}
...@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page; ...@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List; import java.util.List;
...@@ -30,6 +31,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -30,6 +31,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).run(); Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment