Commit b0af45f4 authored by yihua.huang's avatar yihua.huang

complete redis support

parent f3a29d93
package us.codecraft.webmagic; package us.codecraft.webmagic;
import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -26,7 +27,9 @@ import java.util.Map; ...@@ -26,7 +27,9 @@ import java.util.Map;
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午11:37 * Time: 上午11:37
*/ */
public class Request { public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
private String url; private String url;
...@@ -40,7 +43,7 @@ public class Request { ...@@ -40,7 +43,7 @@ public class Request {
/** /**
* 构建一个request对象 * 构建一个request对象
* *
* @param url 必须参数,待抓取的url * @param url 必须参数,待抓取的url
*/ */
public Request(String url) { public Request(String url) {
this.url = url; this.url = url;
...@@ -56,17 +59,17 @@ public class Request { ...@@ -56,17 +59,17 @@ public class Request {
} }
public Object getExtra(String key) { public Object getExtra(String key) {
if (extras==null){ if (extras == null) {
return null; return null;
} }
return extras.get(key); return extras.get(key);
} }
public Request putExtra(String key,Object value) { public Request putExtra(String key, Object value) {
if (extras==null){ if (extras == null) {
extras = new HashMap<String, Object>(); extras = new HashMap<String, Object>();
} }
extras.put(key,value); extras.put(key, value);
return this; return this;
} }
...@@ -91,6 +94,10 @@ public class Request { ...@@ -91,6 +94,10 @@ public class Request {
return true; return true;
} }
public Map<String, Object> getExtras() {
return extras;
}
@Override @Override
public int hashCode() { public int hashCode() {
return url.hashCode(); return url.hashCode();
......
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
<artifactId>freemarker</artifactId> <artifactId>freemarker</artifactId>
<version>2.3.15</version> <version>2.3.15</version>
</dependency> </dependency>
<dependency>
<groupId>org.resthub</groupId>
<artifactId>hessian</artifactId>
<version>4.0.8</version>
</dependency>
<dependency> <dependency>
<groupId>redis.clients</groupId> <groupId>redis.clients</groupId>
<artifactId>jedis</artifactId> <artifactId>jedis</artifactId>
......
package us.codecraft.webmagic.scheduler;
import com.caucho.hessian.io.Hessian2Input;
import com.caucho.hessian.io.Hessian2Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-7-14 <br>
* Time: 下午9:20 <br>
*/
public enum HessianSerializer {
INSTANCE;
public <T> byte[] serialize(T v) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
Hessian2Output hessian2Output = new Hessian2Output(baos);
hessian2Output.writeObject(v);
hessian2Output.close();
return baos.toByteArray();
}
@SuppressWarnings("unchecked")
public <T> T deSerialize(byte[] bytes) throws IOException {
ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
Hessian2Input hessian2Input = new Hessian2Input(bais);
T t = (T) hessian2Input.readObject();
hessian2Input.close();
return t;
}
}
package us.codecraft.webmagic.scheduler; package us.codecraft.webmagic.scheduler;
import org.apache.commons.codec.digest.DigestUtils;
import redis.clients.jedis.Jedis; import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig; import redis.clients.jedis.JedisPoolConfig;
...@@ -7,6 +8,8 @@ import us.codecraft.webmagic.Request; ...@@ -7,6 +8,8 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.schedular.Scheduler; import us.codecraft.webmagic.schedular.Scheduler;
import java.io.IOException;
/** /**
* 使用redis管理url,构建一个分布式的爬虫。<br> * 使用redis管理url,构建一个分布式的爬虫。<br>
* *
...@@ -22,6 +25,8 @@ public class RedisScheduler implements Scheduler { ...@@ -22,6 +25,8 @@ public class RedisScheduler implements Scheduler {
private static final String SET_PREFIX = "set_"; private static final String SET_PREFIX = "set_";
private static final String ITEM_PREFIX = "item_";
public RedisScheduler(String host) { public RedisScheduler(String host) {
pool = new JedisPool(new JedisPoolConfig(), host); pool = new JedisPool(new JedisPoolConfig(), host);
} }
...@@ -34,6 +39,15 @@ public class RedisScheduler implements Scheduler { ...@@ -34,6 +39,15 @@ public class RedisScheduler implements Scheduler {
//使用List保存队列 //使用List保存队列
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
if (request.getExtras() != null) {
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
try {
byte[] serialize = HessianSerializer.INSTANCE.serialize(request);
jedis.set(key.getBytes(), serialize);
} catch (IOException e) {
e.printStackTrace();
}
}
} }
pool.returnResource(jedis); pool.returnResource(jedis);
} }
...@@ -42,8 +56,16 @@ public class RedisScheduler implements Scheduler { ...@@ -42,8 +56,16 @@ public class RedisScheduler implements Scheduler {
public synchronized Request poll(Task task) { public synchronized Request poll(Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
byte[] bytes = jedis.get(key.getBytes());
try {
Object o = HessianSerializer.INSTANCE.deSerialize(bytes);
return (Request)o;
} catch (Exception e) {
e.printStackTrace();
}
pool.returnResource(jedis); pool.returnResource(jedis);
if (url==null){ if (url == null) {
return null; return null;
} }
return new Request(url); return new Request(url);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment