Commit 2b34dc9d authored by yihua.huang

add retry

parent 5c79550f
target/* target/*
*.iml *.iml
out/
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.2.1</version> <version>4.2.4</version>
</dependency> </dependency>
<dependency> <dependency>
......
...@@ -24,6 +24,8 @@ public class Site { ...@@ -24,6 +24,8 @@ public class Site {
private int sleepTime = 3000; private int sleepTime = 3000;
private int retryTimes = 0;
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>(); private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET; private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
...@@ -183,6 +185,23 @@ public class Site { ...@@ -183,6 +185,23 @@ public class Site {
return sleepTime; return sleepTime;
} }
/**
 * Gets the number of times a download is retried; the default is 0.
 * @return the number of download retries
 */
public int getRetryTimes() {
return retryTimes;
}
/**
 * Sets the number of times a download is retried; the default is 0.
 * @param retryTimes the number of download retries
 * @return this
 */
public Site setRetryTimes(int retryTimes) {
this.retryTimes = retryTimes;
return this;
}
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) return true;
......
...@@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.Html; ...@@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:15 * Time: 下午12:15
*/ */
public class HttpClientDownloader implements Downloader { public class HttpClientDownloader implements Downloader {
...@@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader { ...@@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader {
String charset = site.getCharset(); String charset = site.getCharset();
try { try {
HttpGet httpGet = new HttpGet(request.getUrl()); HttpGet httpGet = new HttpGet(request.getUrl());
HttpResponse httpResponse = httpClient.execute(httpGet); HttpResponse httpResponse = null;
int tried = 0;
boolean retry;
do {
try {
httpResponse = httpClient.execute(httpGet);
retry = false;
} catch (IOException e) {
tried++;
if (tried > site.getRetryTimes()) {
logger.warn("download page " + request.getUrl() + " error", e);
return null;
}
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
retry = true;
}
} while (retry);
int statusCode = httpResponse.getStatusLine().getStatusCode(); int statusCode = httpResponse.getStatusLine().getStatusCode();
if (site.getAcceptStatCode().contains(statusCode)) { if (site.getAcceptStatCode().contains(statusCode)) {
//charset //charset
if (charset == null){ if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
} }
...@@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader {
page.setRequest(request); page.setRequest(request);
return page; return page;
} else { } else {
logger.warn("code error " + statusCode); logger.warn("code error " + statusCode + "\t" + request.getUrl());
} }
} catch (Exception e) { } catch (Exception e) {
logger.warn("download page " + request.getUrl() + " error", e); logger.warn("download page " + request.getUrl() + " error", e);
......
...@@ -39,6 +39,25 @@ ...@@ -39,6 +39,25 @@
<target>1.6</target> <target>1.6</target>
</configuration> </configuration>
</plugin> </plugin>
<!-- During the package phase, copy the project's dependencies into ${project.build.directory}/lib. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
</configuration>
</execution>
</executions>
</plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId> <artifactId>maven-resources-plugin</artifactId>
...@@ -70,6 +89,19 @@ ...@@ -70,6 +89,19 @@
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<!-- Write a manifest Class-Path (entries prefixed with ./lib/) and a Main-Class entry into the built jar. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>./lib/</classpathPrefix>
<mainClass>us.codecraft.webmagic.samples.DianpingIndexProcessor</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId> <artifactId>maven-release-plugin</artifactId>
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Sample crawler for www.dianping.com: starts at the city list, follows city index pages to
 * their search pages, and reports search pages whose content no longer exists.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21 Time: 8:08 PM
 */
public class DianpingIndexProcessor implements PageProcessor {

    /** Start page listing all cities. */
    private static final String CITY_LIST_URL = "http://www.dianping.com/citylist";

    /** Matches a city index page; compiled once instead of on every processed page. */
    private static final Pattern CITY_INDEX_URL = Pattern.compile("http://www\\.dianping\\.com/\\w+");

    /** Matches a search result page; compiled once instead of on every processed page. */
    private static final Pattern SEARCH_URL = Pattern.compile("http://www\\.dianping\\.com/search/.*");

    /**
     * Site configuration, built once and shared by every {@link #getSite()} call.
     * Previously each call re-ran the builder chain (including {@code setSleepTime(0)}),
     * so the sleep time configured in {@link #main(String[])} was never retained.
     */
    private final Site site = Site.me().setDomain("www.dianping.com").addStartUrl(CITY_LIST_URL)
            .setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang");

    /**
     * Queues follow-up requests depending on which kind of page was downloaded.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (url.equals(CITY_LIST_URL)) {
            // City list: follow every link that points at a city index page.
            page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings());
            return;
        }
        if (CITY_INDEX_URL.matcher(url).matches()) {
            // City index: follow the search links found in the term list.
            page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings());
        } else if (SEARCH_URL.matcher(url).matches()) {
            // The marker text means "the content you want to view does not exist".
            String result = page.getHtml().regex("您要查看的内容不存在").toString();
            if (result != null) {
                System.err.println("No!Url not exist!" + page.getUrl());
            }
        }
    }

    /**
     * Returns the site configuration used by this processor.
     *
     * @return the same {@link Site} instance on every call, so changes made to it persist
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the crawler with 10 threads.
     *
     * @param args optional; {@code args[0]} is the sleep time passed to
     *             {@link Site#setSleepTime(int)}, defaulting to 0 when absent
     * @throws NumberFormatException if {@code args[0]} is not a valid integer
     */
    public static void main(String[] args) {
        int sleepTime = 0;
        if (args.length > 0) {
            sleepTime = Integer.parseInt(args[0]);
        }
        DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor();
        dianpingProcessor.getSite().setSleepTime(sleepTime);
        Spider.create(dianpingProcessor).thread(10).run();
    }
}
package us.codecraft.webmagic.samples; package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -9,30 +9,36 @@ import java.util.List; ...@@ -9,30 +9,36 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午8:08 * Time: 下午8:08
*/ */
public class DianpingProcessor implements PageProcessor { public class DianpingProcessor implements PageProcessor {
private Site site;
@Override @Override
public void process(Page page) { public void process(Page page) {
List<String> requests = page.getHtml().links().regex(".*shop.*").toStrings(); List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
requests = page.getHtml().regex(".*search/category/.*").toStrings();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("shop")) {
page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().smartContent());
}
} }
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). if (site == null) {
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0").
setSleepTime(100).
setUserAgent("I'm a performance tester created by yihua.huang");
}
return site;
} }
public static void main(String[] args) { public static void main(String[] args) {
int sleepTime = 0;
if (args.length > 0) {
sleepTime = Integer.parseInt(args[0]);
}
DianpingProcessor dianpingProcessor = new DianpingProcessor(); DianpingProcessor dianpingProcessor = new DianpingProcessor();
dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10);
Spider.create(dianpingProcessor).run(); Spider.create(dianpingProcessor).run();
} }
} }
...@@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor { ...@@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
final List<String> requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings(); final List<String> requests = page.getHtml().links().toStrings();
page.addTargetRequests(requests); page.addTargetRequests(requests);
} }
...@@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor { ...@@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor {
@Override @Override
public Site getSite() { public Site getSite() {
if (site==null){ if (site==null){
site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); site = Site.me().setDomain("www.2345.com")
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
} }
return site; return site;
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new GlobalProcessor()).thread(10) Spider.create(new GlobalProcessor()).thread(10)
.scheduler(new FileCacheQueueScheduler("/data/webmagic/github")) .scheduler(new FileCacheQueueScheduler("/data/webmagic/test"))
.downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader())) .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader()))
.pipeline(new FilePipeline("/data/webmagic/douban")) .pipeline(new FilePipeline("/data/webmagic/test"))
.run(); .run();
} }
} }
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/**
 * Sample entry point that crawls www.guoxue123.cn with a {@link SimplePageProcessor},
 * using the same directory for the file pipeline and the file-cache scheduler.
 *
 * @author yihua.huang@dianping.com <br>
 * Date: 13-7-14 <br>
 * Time: 8:33 AM <br>
 */
public class GuoxueProcessor {

    /**
     * Configures the processor (GBK charset, sleep time of 500) and runs the spider.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Both the pipeline and the scheduler are pointed at this directory.
        final String storageDir = "/data/webmagic/";

        SimplePageProcessor processor =
                new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
        processor.getSite()
                .setCharset("GBK")
                .setSleepTime(500);

        Spider.create(processor)
                .pipeline(new FilePipeline(storageDir))
                .scheduler(new FileCacheQueueScheduler(storageDir))
                .run();
    }
}
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
...@@ -17,7 +16,6 @@ import java.io.IOException; ...@@ -17,7 +16,6 @@ import java.io.IOException;
*/ */
public class DiaoyuwengProcessorTest { public class DiaoyuwengProcessorTest {
@Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment