Commit c13110c4 authored by yihua.huang's avatar yihua.huang

fix samples

parent c51ac601
package us.codecraft.webmagic;
import org.apache.http.HttpHost;
import org.apache.http.auth.UsernamePasswordCredentials;
import us.codecraft.webmagic.proxy.ProxyProvider;
import java.util.*;
/**
......@@ -41,12 +37,6 @@ public class Site {
private Map<String, String> headers = new HashMap<String, String>();
private HttpHost httpProxy;
private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
private ProxyProvider httpProxyPool;
private boolean useGzip = true;
/**
......
......@@ -479,7 +479,9 @@ public class Spider implements Runnable, Task {
public <T> List<T> getAll(Collection<String> urls) {
destroyWhenExit = false;
spawnUrl = false;
if (startRequests!=null){
startRequests.clear();
}
for (Request request : UrlUtils.convertToRequests(urls)) {
addRequest(request);
}
......
......@@ -95,12 +95,12 @@ public class HttpClientGenerator {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site != null && site.getUserAgent() != null) {
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
if (site == null || site.isUseGzip()) {
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
......@@ -117,16 +117,12 @@ public class HttpClientGenerator {
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
if (site != null) {
socketConfigBuilder.setSoTimeout(site.getTimeOut());
}
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
if (site != null) {
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
}
return httpClientBuilder.build();
}
......
......@@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
......@@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor {
private Site site;
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl));
public SimplePageProcessor(String urlPattern) {
this.site = Site.me();
//compile "*" expression to regex
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
......
......@@ -19,12 +19,12 @@ public class SpiderTest {
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
System.out.println(1);
}
}).thread(1);
}).thread(1).addUrl("http://www.oschina.net/");
spider.start();
Thread.sleep(10000);
spider.stop();
......
package us.codecraft.webmagic.downloader;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.*;
/**
 * File-backed page cache usable as a spider's downloader, pipeline and page processor.<br>
 * As a pipeline it writes each result to
 * {@code <path>/<task uuid>/<md5 of url>.html}; as a downloader it reads that same file
 * back and only delegates to {@code downloaderWhenFileMiss} when no usable entry exists.<br>
 * Cache file layout: the first line is {@code url:\t<url>}, the remainder is
 * {@code html:\t<html>}.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.1
 */
@Experimental
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {

    /** Prefix of the first line of a cache file; it is followed by the page url. */
    private static final String URL_PREFIX = "url:\t";

    /** Prefix of the second line of a cache file; it is followed by the page html. */
    private static final String HTML_PREFIX = "html:\t";

    /** Suffix of every cache file; shared by the read and the write side so they agree. */
    private static final String CACHE_FILE_SUFFIX = ".html";

    private Downloader downloaderWhenFileMiss;

    private final PageProcessor pageProcessor;

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Creates a cache stored under {@code /data/webmagic/temp/}.
     *
     * @param startUrl   start url handed to the underlying {@link SimplePageProcessor}
     * @param urlPattern url pattern handed to the underlying {@link SimplePageProcessor}
     */
    public FileCache(String startUrl, String urlPattern) {
        this(startUrl, urlPattern, "/data/webmagic/temp/");
    }

    /**
     * Creates a cache stored under the given directory.
     *
     * @param startUrl   start url handed to the underlying {@link SimplePageProcessor}
     * @param urlPattern url pattern handed to the underlying {@link SimplePageProcessor}
     * @param path       base directory of the cache files
     */
    public FileCache(String startUrl, String urlPattern, String path) {
        this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
        setPath(path);
        downloaderWhenFileMiss = new HttpClientDownloader();
    }

    /**
     * Replaces the downloader used when a page is not in the cache.
     *
     * @param downloaderWhenFileMiss fallback downloader; {@code null} disables the fallback
     * @return this
     */
    public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
        this.downloaderWhenFileMiss = downloaderWhenFileMiss;
        return this;
    }

    /**
     * Returns the cached page for the request, or the fallback downloader's result on a miss.
     * A missing, empty, unreadable or mismatching cache file counts as a miss.
     *
     * @param request request to serve
     * @param task    task whose UUID selects the cache directory
     * @return the page, or {@code null} when it is neither cached nor downloadable
     */
    @Override
    public Page download(Request request, Task task) {
        Page page = null;
        BufferedReader bufferedReader = null;
        try {
            bufferedReader = new BufferedReader(new FileReader(getCacheFile(request.getUrl(), task)));
            String line = bufferedReader.readLine();
            // Constant-first equals: readLine() yields null for an empty file.
            if ((URL_PREFIX + request.getUrl()).equals(line)) {
                final String html = getHtml(bufferedReader);
                if (html != null) {
                    page = new Page();
                    page.setRequest(request);
                    page.setUrl(PlainText.create(request.getUrl()));
                    page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
                }
            }
        } catch (FileNotFoundException e) {
            logger.info("File not exist for url " + request.getUrl());
        } catch (IOException e) {
            logger.warn("File read error for url " + request.getUrl(), e);
        } finally {
            // Always release the file handle, including on the error paths.
            closeQuietly(bufferedReader);
        }
        if (page == null) {
            page = downloadWhenMiss(request, task);
        }
        return page;
    }

    /**
     * No-op: the thread count is ignored by this implementation.
     *
     * @param thread ignored
     */
    @Override
    public void setThread(int thread) {
    }

    /**
     * Resolves the cache file of a url. Both {@link #download(Request, Task)} and
     * {@link #process(ResultItems, Task)} go through here so they address the same file.
     */
    private File getCacheFile(String url, Task task) {
        String directory = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        return getFile(directory + DigestUtils.md5Hex(url) + CACHE_FILE_SUFFIX);
    }

    /**
     * Reads the html part of a cache file; the reader must be positioned after the url line.
     *
     * @return the html with its line breaks restored as {@code \n}, or {@code null} when the
     *         file ends before the html line
     */
    private String getHtml(BufferedReader bufferedReader) throws IOException {
        String line = bufferedReader.readLine();
        if (line == null) {
            return null;
        }
        StringBuilder htmlBuilder = new StringBuilder(StringUtils.removeStart(line, HTML_PREFIX));
        while ((line = bufferedReader.readLine()) != null) {
            // readLine() strips the terminator; put one back so adjacent lines do not merge.
            htmlBuilder.append('\n').append(line);
        }
        return htmlBuilder.toString();
    }

    /** Delegates to the fallback downloader, or returns {@code null} when none is set. */
    private Page downloadWhenMiss(Request request, Task task) {
        Page page = null;
        if (downloaderWhenFileMiss != null) {
            page = downloaderWhenFileMiss.download(request, task);
        }
        return page;
    }

    /** Closes the resource if present; a failure to close is only logged. */
    private void closeQuietly(Closeable closeable) {
        if (closeable == null) {
            return;
        }
        try {
            closeable.close();
        } catch (IOException e) {
            logger.warn("close file error", e);
        }
    }

    /**
     * Stores the {@code "html"} field of the result in the cache file of its request url.
     * Results without an {@code "html"} field are not cached.
     *
     * @param resultItems extraction result holding the request and the {@code "html"} field
     * @param task        task whose UUID selects the cache directory
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.getRequest().getUrl();
        Object html = resultItems.get("html");
        if (html == null) {
            // Writing the text "null" would poison the cache for this url.
            logger.debug("No html extracted for url " + url + ", skip caching");
            return;
        }
        PrintWriter printWriter = null;
        try {
            printWriter = new PrintWriter(new FileWriter(getCacheFile(url, task)));
            printWriter.println(URL_PREFIX + url);
            printWriter.println(HTML_PREFIX + html);
            // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
            if (printWriter.checkError()) {
                logger.warn("write file error for url " + url);
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        } finally {
            if (printWriter != null) {
                printWriter.close();
            }
        }
    }

    /**
     * Delegates page extraction to the wrapped page processor.
     *
     * @param page page to process
     */
    @Override
    public void process(Page page) {
        pageProcessor.process(page);
    }

    /**
     * @return the site configuration of the wrapped page processor
     */
    @Override
    public Site getSite() {
        return pageProcessor.getSite();
    }
}
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
/**
 * Manual run of a spider that uses one {@link FileCache} instance as page processor,
 * downloader and pipeline. Ignored by default because it takes long.
 *
 * @author code4crafter@gmail.com <br>
 */
public class FileCacheTest {

    @Ignore("takes long")
    @Test
    public void test() {
        final String startUrl = "http://my.oschina.net/flashsword/blog";
        final String urlPattern = "http://my.oschina.net/flashsword/blog/*";
        FileCache cache = new FileCache(startUrl, urlPattern);

        // The same cache object plays all three roles of the spider.
        Spider spider = Spider.create(cache).downloader(cache).pipeline(cache);
        spider.run();
    }
}
......@@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
return Site.me();
}
@Test
......
......@@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor {
public Site getSite() {
//site定义抽取配置,以及开始url等
if (site == null) {
site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
site = Site.me().setDomain("progressdaily.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
......
......@@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site==null){
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
site= Site.me().setDomain("www.diaoyuweng.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
}
return site;
}
public static void main(String[] args) {
Spider.create(new DiaoyuwengProcessor()).run();
Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
}
}
......@@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
}
public static void main(String[] args) {
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run();
}
}
......@@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
return Site.me().setDomain("www.huxiu.com");
}
public static void main(String[] args) {
Spider.create(new HuxiuProcessor()).run();
Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run();
}
}
......@@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
......@@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.thread(5)
.addUrl("http://www.infoq.com/cn/minibooks")
.run();
}
}
......@@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
site = Site.me().setDomain("yanghaoli.iteye.com");
}
return site;
}
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).run();
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
}
}
......@@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
return Site.me().setDomain("kaichiba.com").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new KaichibaProcessor()).run();
Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run();
}
}
......@@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
return Site.me().setDomain("meican.com").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new MeicanProcessor()).run();
Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
return Site.me().setDomain("bbs.nju.edu.cn");
}
public static void main(String[] args) {
Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.management.JMException;
import java.util.List;
/**
 * Sample processor for the blog at {@code my.oschina.net/flashsword}: queues every post
 * link found on a page and extracts the title, content and tags of the page.
 *
 * @author code4crafter@gmail.com <br>
 */
public class OschinaBlogPageProcesser implements PageProcessor {

    /** Regex matching the urls of single blog posts. */
    private static final String POST_LINK_REGEX = "http://my\\.oschina\\.net/flashsword/blog/\\d+";

    private static final String TITLE_XPATH = "//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()";

    private static final String CONTENT_XPATH = "//div[@class='BlogContent']/tidyText()";

    private static final String TAGS_XPATH = "//div[@class='BlogTags']/a/text()";

    private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");

    /**
     * Adds all post links of the page as target requests, then stores the fields
     * {@code title}, {@code content} and {@code tags}.
     *
     * @param page page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> postLinks = page.getHtml().links().regex(POST_LINK_REGEX).all();
        page.addTargetRequests(postLinks);

        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        page.putField("title", title);

        String content = page.getHtml().xpath(CONTENT_XPATH).toString();
        page.putField("content", content);

        List<String> tags = page.getHtml().xpath(TAGS_XPATH).all();
        page.putField("tags", tags);
    }

    /**
     * @return the site configuration held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the sample with a queue scheduler that de-duplicates through a bloom filter
     * and registers the spider with the spider monitor before starting it.
     */
    public static void main(String[] args) throws JMException {
        OschinaBlogPageProcesser processor = new OschinaBlogPageProcesser();
        BloomFilterDuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(2000);
        Spider spider = Spider.create(processor)
                .setScheduler(new QueueScheduler().setDuplicateRemover(duplicateRemover));
        SpiderMonitor.instance().register(spider);
        spider.run();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for {@code www.oschina.net}: queues the question links found on a page
 * and stores the question title and content selections.
 *
 * @author code4crafter@gmail.com <br>
 */
public class OschinaPageProcesser implements PageProcessor {

    /** Regex capturing the href of anchors that point to a question page. */
    private static final String QUESTION_LINK_REGEX =
            "<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}";

    private static final String TITLE_XPATH = "//div[@class='QTitle']/h1/a";

    private static final String CONTENT_XPATH = "//div[@class='Question']//div[@class='Content']/div[@class='detail']";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Adds the matched question links as target requests, then stores the fields
     * {@code title} and {@code content}.
     *
     * @param page page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> questionLinks = page.getHtml().regex(QUESTION_LINK_REGEX).all();
        page.addTargetRequests(questionLinks);

        page.putField("title", page.getHtml().xpath(TITLE_XPATH));
        page.putField("content", page.getHtml().xpath(CONTENT_XPATH));
    }

    /**
     * Builds a new site configuration on every call.
     *
     * @return site with domain, start url and user agent set
     */
    @Override
    public Site getSite() {
        Site site = Site.me().setDomain("www.oschina.net");
        site = site.addStartUrl("http://www.oschina.net/");
        return site.setUserAgent(USER_AGENT);
    }
}
......@@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
return Site.me().setDomain("www.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}
......@@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment