Commit feb604da authored by yihua.huang's avatar yihua.huang

Merge branch 'stable' of github.com:code4craft/webmagic

parents d5f34a4c dbebcbe4
...@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w ...@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
``` ```
...@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) ...@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
* [lidongyang](http://my.oschina.net/lidongyang) * [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu) * [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118) * [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
### 邮件组: ### 邮件组:
......
...@@ -25,12 +25,12 @@ Add dependencies to your pom.xml: ...@@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
``` ```
...@@ -145,6 +145,7 @@ Thanks to these people for committing source code, reporting bugs or suggesting for ...@@ -145,6 +145,7 @@ Thanks to these people for committing source code, reporting bugs or suggesting for
* [lidongyang](http://my.oschina.net/lidongyang) * [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu) * [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118) * [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
### Thanks: ### Thanks:
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
<version>7</version> <version>7</version>
</parent> </parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
<module>webmagic-selenium</module> <module>webmagic-selenium</module>
<module>webmagic-saxon</module> <module>webmagic-saxon</module>
<module>webmagic-samples</module> <module>webmagic-samples</module>
<module>webmagic-avalon</module> <!--<module>webmagic-avalon</module>-->
</modules> </modules>
<dependencyManagement> <dependencyManagement>
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
</parent> </parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>forger</artifactId> <artifactId>forger</artifactId>
<version>0.1.0</version> <version>0.1.1-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>jar</packaging> <packaging>jar</packaging>
<properties> <properties>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.1-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
...@@ -39,12 +39,6 @@ ...@@ -39,12 +39,6 @@
<version>1.1.1</version> <version>1.1.1</version>
</dependency> </dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>forger</artifactId>
<version>0.1.1-SNAPSHOT</version>
</dependency>
<dependency> <dependency>
<groupId>org.freemarker</groupId> <groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId> <artifactId>freemarker</artifactId>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-avalon</artifactId> <artifactId>webmagic-avalon</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.1-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-avalon</artifactId> <artifactId>webmagic-avalon</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.1-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>forger</artifactId> <artifactId>forger</artifactId>
<version>0.1.0</version> <version>0.1.1-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
...@@ -150,18 +150,4 @@ ...@@ -150,18 +150,4 @@
</plugins> </plugins>
</build> </build>
<repositories>
<repository>
<id>sonatype-nexus-snapshots</id>
<name>Sonatype Nexus Snapshots</name>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
</project> </project>
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-avalon</artifactId> <artifactId>webmagic-avalon</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.1-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -4,6 +4,8 @@ import org.slf4j.Logger; ...@@ -4,6 +4,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
/** /**
* Remove duplicate urls and only push urls which are not duplicate.<br></br> * Remove duplicate urls and only push urls which are not duplicate.<br></br>
...@@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task; ...@@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task;
* @author code4crafer@gmail.com * @author code4crafer@gmail.com
* @since 0.5.0 * @since 0.5.0
*/ */
public abstract class DuplicatedRemoveScheduler implements Scheduler { public abstract class DuplicateRemovedScheduler implements Scheduler {
protected Logger logger = LoggerFactory.getLogger(getClass()); protected Logger logger = LoggerFactory.getLogger(getClass());
private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
public DuplicateRemover getDuplicateRemover() {
return duplicatedRemover;
}
public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
this.duplicatedRemover = duplicatedRemover;
return this;
}
@Override @Override
public void push(Request request, Task task) { public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl()); logger.trace("get a candidate url {}", request.getUrl());
if (isDuplicate(request, task) || shouldReserved(request)) { if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
logger.debug("push to queue {}", request.getUrl()); logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task); pushWhenNoDuplicate(request, task);
} }
} }
/**
* Reset duplicate check.
*/
public abstract void resetDuplicateCheck(Task task);
/**
* @param request
* @return
*/
protected abstract boolean isDuplicate(Request request, Task task);
protected boolean shouldReserved(Request request) { protected boolean shouldReserved(Request request) {
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
} }
......
...@@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue; ...@@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
* @since 0.2.1 * @since 0.2.1
*/ */
@ThreadSafe @ThreadSafe
public class PriorityScheduler extends LocalDuplicatedRemoveScheduler { public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
public static final int INITIAL_CAPACITY = 5; public static final int INITIAL_CAPACITY = 5;
...@@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler { ...@@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) { public int getLeftRequestsCount(Task task) {
return noPriorityQueue.size(); return noPriorityQueue.size();
} }
@Override
public int getTotalRequestsCount(Task task) {
return getDuplicateRemover().getTotalRequestsCount(task);
}
} }
...@@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue; ...@@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* @since 0.1.0 * @since 0.1.0
*/ */
@ThreadSafe @ThreadSafe
public class QueueScheduler extends LocalDuplicatedRemoveScheduler { public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>(); private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
...@@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler { ...@@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) { public int getLeftRequestsCount(Task task) {
return queue.size(); return queue.size();
} }
@Override
public int getTotalRequestsCount(Task task) {
return getDuplicateRemover().getTotalRequestsCount(task);
}
} }
package us.codecraft.webmagic.scheduler.component;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.nio.charset.Charset;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * {@link DuplicateRemover} backed by a Guava {@link BloomFilter}, intended for a huge number of urls.
 * <p>
 * A request is judged by the string returned from {@link #getUrl(Request)}. Because the
 * answer comes from {@link BloomFilter#mightContain(Object)}, a url that was never seen can
 * occasionally be reported as duplicate; {@code fpp} is the desired probability of that.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.1
 */
public class BloomFilterDuplicateRemover implements DuplicateRemover {

    private int expectedInsertions;

    private double fpp;

    // Replaced (not cleared) by rebuildBloomFilter(), hence volatile so that other
    // threads observe the fresh instance after a reset.
    private volatile AtomicInteger counter;

    // Not final: resetDuplicateCheck() must be able to swap in an empty filter.
    private volatile BloomFilter<CharSequence> bloomFilter;

    /**
     * Creates a remover with a false positive probability of 0.01.
     *
     * @param expectedInsertions the number of expected insertions to the constructed filter
     */
    public BloomFilterDuplicateRemover(int expectedInsertions) {
        this(expectedInsertions, 0.01);
    }

    /**
     * @param expectedInsertions the number of expected insertions to the constructed filter
     * @param fpp the desired false positive probability (must be positive and less than 1.0)
     */
    public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
        this.expectedInsertions = expectedInsertions;
        this.fpp = fpp;
        this.bloomFilter = rebuildBloomFilter();
    }

    /**
     * Resets the request counter and builds a new, empty filter from the configured
     * {@code expectedInsertions} and {@code fpp}.
     * <p>
     * The new filter is only returned; installing it is the caller's job.
     *
     * @return a fresh, empty bloom filter
     */
    protected BloomFilter<CharSequence> rebuildBloomFilter() {
        counter = new AtomicInteger(0);
        return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
    }

    /**
     * Checks whether the request's url may have been seen before; if not, records it
     * and counts it.
     *
     * @param request the request to check
     * @param task not used by this implementation
     * @return {@code true} if the filter reports the url as possibly contained
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        String url = getUrl(request);
        // Work on one snapshot so the check and the put hit the same filter even if
        // resetDuplicateCheck() swaps the field in between.
        BloomFilter<CharSequence> filter = bloomFilter;
        boolean isDuplicate = filter.mightContain(url);
        if (!isDuplicate) {
            filter.put(url);
            counter.incrementAndGet();
        }
        return isDuplicate;
    }

    /**
     * Extracts the string used as the duplicate key; subclasses may override it.
     *
     * @param request the request being checked
     * @return the request url
     */
    protected String getUrl(Request request) {
        return request.getUrl();
    }

    /**
     * Forgets every url recorded so far and resets the request counter.
     *
     * @param task not used by this implementation
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        // The rebuilt filter has to be installed; discarding it would reset only the counter.
        this.bloomFilter = rebuildBloomFilter();
    }

    /**
     * @param task not used by this implementation
     * @return the number of requests judged non-duplicate since the filter was last rebuilt
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return counter.get();
    }
}
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Contract for components that decide whether a request has been seen before.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.1
 */
public interface DuplicateRemover {

    /**
     * Tells whether the given request is a duplicate.
     *
     * @param request the request to check
     * @param task the task the request belongs to
     * @return {@code true} when the request is a duplicate, {@code false} otherwise
     */
    boolean isDuplicate(Request request, Task task);

    /**
     * Resets the duplicate check.
     *
     * @param task the task whose duplicate check is reset
     */
    void resetDuplicateCheck(Task task);

    /**
     * Reports the total requests count, used for monitoring.
     *
     * @param task the task being monitored
     * @return the total requests count
     */
    int getTotalRequestsCount(Task task);
}
package us.codecraft.webmagic.scheduler; package us.codecraft.webmagic.scheduler.component;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
...@@ -8,23 +8,24 @@ import java.util.Set; ...@@ -8,23 +8,24 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
/** /**
* Base Scheduler with duplicated urls removed by hash set.<br></br> * @author code4crafer@gmail.com
*
* @author code4crafter@gmail.com
* @since 0.5.0
*/ */
public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { public class HashSetDuplicateRemover implements DuplicateRemover {
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>()); private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
@Override @Override
public void resetDuplicateCheck(Task task) { public boolean isDuplicate(Request request, Task task) {
urls.clear(); return !urls.add(getUrl(request));
}
protected String getUrl(Request request) {
return request.getUrl();
} }
@Override @Override
protected boolean isDuplicate(Request request, Task task) { public void resetDuplicateCheck(Task task) {
return urls.add(request.getUrl()); urls.clear();
} }
@Override @Override
......
...@@ -22,10 +22,10 @@ public class FilePersistentBase { ...@@ -22,10 +22,10 @@ public class FilePersistentBase {
} }
public void setPath(String path) { public void setPath(String path) {
this.path = path;
if (!path.endsWith(PATH_SEPERATOR)) { if (!path.endsWith(PATH_SEPERATOR)) {
path += PATH_SEPERATOR; path += PATH_SEPERATOR;
} }
this.path = path;
} }
public File getFile(String fullName) { public File getFile(String fullName) {
......
package us.codecraft.webmagic.scheduler;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Tests for {@link BloomFilterDuplicateRemover}. The two large runs are ignored by
 * default because they take a long time.
 *
 * @author code4crafter@gmail.com
 */
public class BloomFilterDuplicateRemoverTest {

    /**
     * The first sighting of a url is not a duplicate, the second one is.
     */
    @Test
    public void testRemove() throws Exception {
        BloomFilterDuplicateRemover remover = new BloomFilterDuplicateRemover(10);

        assertThat(remover.isDuplicate(new Request("a"), null)).isFalse();
        assertThat(remover.isDuplicate(new Request("a"), null)).isTrue();

        assertThat(remover.isDuplicate(new Request("b"), null)).isFalse();
        assertThat(remover.isDuplicate(new Request("b"), null)).isTrue();
    }

    /**
     * Prints elapsed time and the drop in free heap for the bloom filter and then
     * for the hash set, each fed the same number of distinct urls.
     */
    @Ignore("long time")
    @Test
    public void testMemory() throws Exception {
        final int total = 5000000;

        DuplicateRemover remover = new BloomFilterDuplicateRemover(total, 0.005);
        long freeBefore = Runtime.getRuntime().freeMemory();
        long startedAt = System.currentTimeMillis();
        for (int n = 0; n < total; n++) {
            remover.isDuplicate(new Request(String.valueOf(n)), null);
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("Time used by bloomfilter:" + elapsed);
        long consumed = freeBefore - Runtime.getRuntime().freeMemory();
        System.out.println("Memory used by bloomfilter:" + consumed);

        remover = new HashSetDuplicateRemover();
        System.gc();
        freeBefore = Runtime.getRuntime().freeMemory();
        startedAt = System.currentTimeMillis();
        for (int n = 0; n < total; n++) {
            remover.isDuplicate(new Request(String.valueOf(n)), null);
        }
        elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("Time used by hashset:" + elapsed);
        consumed = freeBefore - Runtime.getRuntime().freeMemory();
        System.out.println("Memory used by hashset:" + consumed);
    }

    /**
     * Feeds each distinct url twice and prints how often the first check was correct
     * (not duplicate), wrong (duplicate), and how often the second check missed.
     */
    @Ignore("long time")
    @Test
    public void testMissHit() throws Exception {
        final int total = 5000000;
        DuplicateRemover remover = new BloomFilterDuplicateRemover(total, 0.01);
        int right = 0;
        int wrong = 0;
        int missCheck = 0;
        for (int n = 0; n < total; n++) {
            String url = String.valueOf(n);
            // First sighting: reporting it as duplicate is a false positive.
            if (!remover.isDuplicate(new Request(url), null)) {
                right++;
            } else {
                wrong++;
            }
            // Second sighting: it must be reported as duplicate now.
            boolean seenAgain = remover.isDuplicate(new Request(url), null);
            if (!seenAgain) {
                missCheck++;
            }
        }
        System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
    }
}
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler; ...@@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
...@@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger; ...@@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler { public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private Logger logger = LoggerFactory.getLogger(getClass());
private String filePath = System.getProperty("java.io.tmpdir"); private String filePath = System.getProperty("java.io.tmpdir");
...@@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler { ...@@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) { public int getLeftRequestsCount(Task task) {
return queue.size(); return queue.size();
} }
@Override
public int getTotalRequestsCount(Task task) {
return getDuplicateRemover().getTotalRequestsCount(task);
}
} }
...@@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool; ...@@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig; import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/** /**
* Use Redis as url scheduler for distributed crawlers.<br> * Use Redis as url scheduler for distributed crawlers.<br>
...@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task; ...@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
private JedisPool pool; private JedisPool pool;
...@@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor ...@@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
private static final String ITEM_PREFIX = "item_"; private static final String ITEM_PREFIX = "item_";
public RedisScheduler(String host) { public RedisScheduler(String host) {
pool = new JedisPool(new JedisPoolConfig(), host); this(new JedisPool(new JedisPoolConfig(), host));
} }
public RedisScheduler(JedisPool pool) { public RedisScheduler(JedisPool pool) {
this.pool = pool; this.pool = pool;
setDuplicateRemover(this);
} }
@Override @Override
...@@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor ...@@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
} }
@Override @Override
protected boolean isDuplicate(Request request, Task task) { public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
try { try {
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl()); boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
if (!isDuplicate) { if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl()); jedis.sadd(getSetKey(task), request.getUrl());
} }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
/**
 * Sample processor that prints the rows of a forum table: each raw row, then its
 * title text, its "num" cell and its third cell.
 *
 * @author code4crafter@gmail.com
 */
public class AmanzonPageProcessor implements PageProcessor{

    /**
     * Selects the data grid rows from the page and prints the extracted cells of
     * every row after the first one.
     *
     * @param page the downloaded page
     */
    public void process(Page page) {
        List<String> rows = page.getHtml()
                .xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr")
                .all();
        if (rows == null || rows.size() <= 1) {
            return;
        }
        // Row 0 holds the column names, so iteration starts at row 1.
        for (String row : rows.subList(1, rows.size())) {
            System.out.println(row);
            // Wrap the bare <tr> in a table so it can be queried as a document of its own.
            Html rowHtml = Html.create("<table>" + row + "</table>");
            System.out.println(rowHtml.xpath("//td[@class='title']//a/text()").toString());
            System.out.println(rowHtml.xpath("//td[@class='num']/text()").toString());
            System.out.println(rowHtml.xpath("//td[3]/text()").toString());
            /* Earlier Jsoup-based attempt, left disabled:
            Document doc = Jsoup.parse(row);
            Html hmt = Html.create(row) ;
            String str = hmt.links().toString();
            String content = doc.getElementsByTag("a").text();
            String ss = doc.text();*/
        }
    }

    /**
     * @return a new default site configuration
     */
    @Override
    public Site getSite() {
        return Site.me();
    }

    public static void main(String[] args) {
        AmanzonPageProcessor processor = new AmanzonPageProcessor();
        Spider.create(processor).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
    }
}
package us.codecraft.webmagic.samples;
/**
 * Plain bean holding the name, author and readme of a repository.
 *
 * @author code4crafter@gmail.com
 */
public class GithubRepo {

    /** Repository name. */
    private String name;

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** Repository author. */
    private String author;

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /** Readme text. */
    private String readme;

    public String getReadme() {
        return this.readme;
    }

    public void setReadme(String readme) {
        this.readme = readme;
    }
}
\ No newline at end of file
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample processor that follows GitHub user and repository links and stores a
 * {@link GithubRepo} for every page on which a repository name is found.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.5.1
 */
public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);

    /**
     * Queues further links, then extracts author, name and readme from the page.
     * The page is skipped when no repository name is found.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Queue links of the form owner/repo first, then links of the form owner.
        List<String> repoLinks = page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all();
        page.addTargetRequests(repoLinks);
        List<String> ownerLinks = page.getHtml().links().regex("(https://github\\.com/\\w+)").all();
        page.addTargetRequests(ownerLinks);

        String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
        String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
        String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();

        GithubRepo githubRepo = new GithubRepo();
        githubRepo.setAuthor(author);
        githubRepo.setName(name);
        githubRepo.setReadme(readme);

        if (githubRepo.getName() != null) {
            page.putField("repo", githubRepo);
        } else {
            // No repository name on this page, so skip it.
            page.setSkip(true);
        }
    }

    /**
     * @return the site configuration held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider spider = Spider.create(new GithubRepoPageProcessor());
        spider.addUrl("https://github.com/code4craft").thread(5).run();
    }
}
...@@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples; ...@@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import javax.management.JMException;
import java.util.List; import java.util.List;
/** /**
...@@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) throws JMException {
Spider.create(new OschinaBlogPageProcesser()).run(); Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
SpiderMonitor.instance().register(spider);
spider.run();
} }
} }
package us.codecraft.webmagic.samples.pipeline;
/**
 * Empty class with no members.
 * <p>
 * NOTE(review): despite its name and package, the declaration implements no interface
 * and extends nothing; presumably a stub for a future pipeline, to be confirmed.
 *
 * @author code4crafter@gmail.com
 */
public class ReplacePipeline {
}
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.5.0</version> <version>0.5.2-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
......
...@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w ...@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.5.0</version> <version>0.5.1</version>
</dependency> </dependency>
``` ```
...@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) ...@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
* [lidongyang](http://my.oschina.net/lidongyang) * [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu) * [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118) * [sebastian1118](https://github.com/sebastian1118)
* [codev777](https://github.com/codev777)
### 邮件组: ### 邮件组:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment