Commit d141541e authored by yihua.huang's avatar yihua.huang

add retry

parent a1ef2523
...@@ -17,6 +17,8 @@ public class Request implements Serializable { ...@@ -17,6 +17,8 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L; private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
private String url; private String url;
/** /**
......
...@@ -30,6 +30,8 @@ public class Site { ...@@ -30,6 +30,8 @@ public class Site {
private int retryTimes = 0; private int retryTimes = 0;
private int cycleRetryTimes = 0;
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>(); private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET; private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
...@@ -200,7 +202,7 @@ public class Site { ...@@ -200,7 +202,7 @@ public class Site {
} }
/** /**
* Get retry times when download fail, 0 by default.<br> * Get retry times when download fail immediately, 0 by default.<br>
* *
* @return retry times when download fail * @return retry times when download fail
*/ */
...@@ -218,6 +220,25 @@ public class Site { ...@@ -218,6 +220,25 @@ public class Site {
return this; return this;
} }
/**
* Get the cycle retry times, 0 by default.<br>
* When the value is greater than 0, a request whose download has failed
* (after the immediate retries are used up) is added back to the scheduler
* so that it is downloaded again later instead of being dropped.
*
* @return the configured cycle retry times
*/
public int getCycleRetryTimes() {
return cycleRetryTimes;
}
/**
* Set the cycle retry times used when a download fails, 0 by default.<br>
* NOTE: according to the original author this only works with RedisScheduler.
*
* @param cycleRetryTimes the cycle retry times to use; 0 leaves cycle retry disabled
* @return this
*/
public Site setCycleRetryTimes(int cycleRetryTimes) {
this.cycleRetryTimes = cycleRetryTimes;
return this;
}
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) return true;
......
...@@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader {
* @param url * @param url
* @return html * @return html
*/ */
public Html download(String url,String charset) { public Html download(String url, String charset) {
Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
return (Html) page.getHtml(); return (Html) page.getHtml();
} }
...@@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader { ...@@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
if (tried > retryTimes) { if (tried > retryTimes) {
logger.warn("download page " + request.getUrl() + " error", e); logger.warn("download page " + request.getUrl() + " error", e);
// Cycle retry: immediate retries are exhausted, so instead of dropping the
// request, hand it back (via an otherwise empty Page) to be scheduled again.
if (site.getCycleRetryTimes() > 0) {
    Page page = new Page();
    // Number of cycle retries already recorded on this request, if any.
    Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
    int cycleTriedTimes = 1;
    if (cycleTriedTimesObject != null) {
        cycleTriedTimes = (Integer) cycleTriedTimesObject + 1;
        if (cycleTriedTimes >= site.getCycleRetryTimes()) {
            // Retry budget used up: give up on this request.
            return null;
        }
    }
    // Store the UPDATED counter. Writing a constant 1 here (as before) meant the
    // counter never advanced, so the limit above was never reached for
    // cycleRetryTimes > 2 and a failing URL was re-queued forever.
    page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
    return page;
}
return null; return null;
} }
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
......
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.EnvironmentUtil;
...@@ -15,6 +16,8 @@ import java.util.List; ...@@ -15,6 +16,8 @@ import java.util.List;
*/ */
public class Html extends PlainText { public class Html extends PlainText {
private Logger logger = Logger.getLogger(getClass());
/** /**
* Store parsed document for better performance when only one text exist. * Store parsed document for better performance when only one text exist.
*/ */
...@@ -26,7 +29,11 @@ public class Html extends PlainText { ...@@ -26,7 +29,11 @@ public class Html extends PlainText {
public Html(String text) { public Html(String text) {
super(text); super(text);
this.document = Jsoup.parse(text); try {
this.document = Jsoup.parse(text);
} catch (Exception e) {
logger.warn("parse document error ", e);
}
} }
public Html(Document document) { public Html(Document document) {
...@@ -108,7 +115,7 @@ public class Html extends PlainText { ...@@ -108,7 +115,7 @@ public class Html extends PlainText {
} }
public String getText() { public String getText() {
if (strings!=null&&strings.size()>0){ if (strings != null && strings.size() > 0) {
return strings.get(0); return strings.get(0);
} }
return document.html(); return document.html();
......
...@@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler { ...@@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
public synchronized void push(Request request, Task task) { public synchronized void push(Request request, Task task) {
Jedis jedis = pool.getResource(); Jedis jedis = pool.getResource();
try { try {
//使用Set进行url去重 // if cycleRetriedTimes is set, allow duplicated.
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
//使用List保存队列 // use set to remove duplicate url
if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
// use list to store queue
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
if (request.getExtras() != null) { if (request.getExtras() != null) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment