Commit 5c79550f authored by yihua.huang

add offline cache and process

parent a7316a1f
......@@ -18,7 +18,7 @@ public class Site {
private Map<String, String> cookies = new LinkedHashMap<String, String>();
private String encoding;
private String charset;
private List<String> startUrls = new ArrayList<String>();
......@@ -107,11 +107,11 @@ public class Site {
* 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
* 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
*
* @param encoding 编码格式,主要是"utf-8"、"gbk"两种
* @param charset 编码格式,主要是"utf-8"、"gbk"两种
* @return this
*/
public Site setEncoding(String encoding) {
this.encoding = encoding;
public Site setCharset(String charset) {
this.charset = charset;
return this;
}
......@@ -120,8 +120,8 @@ public class Site {
*
* @return 已设置的domain
*/
public String getEncoding() {
return encoding;
public String getCharset() {
return charset;
}
/**
......@@ -194,18 +194,32 @@ public class Site {
return false;
if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
}
/**
 * Wraps this site in a {@link Task} view.
 * <p>
 * The returned task reports this site's domain as its UUID and returns this
 * site itself from {@code getSite()}.
 *
 * @return a task backed by this site
 */
public Task toTask() {
    final Site self = this;
    return new Task() {
        @Override
        public Site getSite() {
            return self;
        }

        @Override
        public String getUUID() {
            return self.getDomain();
        }
    };
}
@Override
public int hashCode() {
int result = domain.hashCode();
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
return result;
}
......
......@@ -126,6 +126,12 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Sets the downloader this spider uses to fetch pages.
 * <p>
 * NOTE(review): checkIfNotRunning() is defined outside this view; presumably it
 * rejects configuration changes once the spider has started -- confirm.
 *
 * @param downloader the downloader to store on this spider
 * @return this spider, so calls can be chained
 */
public Spider downloader(Downloader downloader) {
checkIfNotRunning();
this.downloader = downloader;
return this;
}
@Override
public void run() {
......@@ -180,7 +186,7 @@ public class Spider implements Runnable, Task {
}
private void processRequest(Request request) {
Page page = downloader.download(request, site);
Page page = downloader.download(request, this);
if (page == null) {
sleep(site.getSleepTime());
return;
......@@ -216,12 +222,7 @@ public class Spider implements Runnable, Task {
}
public void runAsync(){
Thread thread = new Thread(){
@Override
public void run() {
Spider.this.run();
}
};
Thread thread = new Thread(this);
thread.setDaemon(false);
thread.start();
}
......@@ -252,4 +253,9 @@ public class Spider implements Runnable, Task {
}
return null;
}
/**
 * Returns the site stored in this spider's {@code site} field.
 *
 * @return the site held by this spider
 */
@Override
public Site getSite() {
return site;
}
}
......@@ -14,4 +14,10 @@ public interface Task {
*/
public String getUUID();
/**
* Returns the site information that this task crawls.
* @return site
*/
public Site getSite();
}
......@@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
......@@ -16,8 +16,8 @@ public interface Downloader {
* 下载页面,并保存信息到Page对象中。
*
* @param request
* @param site
* @param task
* @return page
*/
public Page download(Request request, Site site);
public Page download(Request request, Task task);
}
package us.codecraft.webmagic.downloader;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import java.io.*;
/**
 * Downloader that serves pages from files on disk and can fall back to another
 * downloader when no usable file exists.
 * <p>
 * A cached page is looked up at {@code path + "/" + task UUID + "/" + md5(url)}.
 * The file must start with a line made of the text "url:", a tab and the request
 * URL. The page body starts at the first following line that begins with the
 * text "html:" and a tab; any lines in between (for example extracted fields
 * written by a pipeline) are skipped.
 * <p>
 * Files are read with the platform default charset (that is what
 * {@link FileReader} uses), so they must have been written the same way.
 *
 * @author code4crafter@gmail.com
 * Date: 13-6-24
 * Time: 7:24 AM
 */
public class FileDownloader implements Downloader {

    /** Cache directory used when none is given. */
    private static final String DEFAULT_PATH = "/data/temp/webmagic/";

    /** Prefix of the first line of a cache file, followed by the page URL. */
    private static final String URL_PREFIX = "url:\t";

    /** Prefix of the line on which the cached HTML starts. */
    private static final String HTML_PREFIX = "html:\t";

    private String path = DEFAULT_PATH;

    private Downloader downloaderWhenFileMiss;

    private Logger logger = Logger.getLogger(getClass());

    /**
     * Creates a downloader reading from the default directory, with no fallback.
     */
    public FileDownloader() {
        this(DEFAULT_PATH, null);
    }

    /**
     * Creates a downloader reading from {@code path}, with no fallback.
     *
     * @param path root directory of the cached files
     */
    public FileDownloader(String path) {
        this(path, null);
    }

    /**
     * Creates a downloader reading from {@code path} that delegates to
     * {@code downloaderWhenFileMiss} when no usable file is found.
     *
     * @param path                   root directory of the cached files
     * @param downloaderWhenFileMiss fallback downloader, or {@code null} for none
     */
    public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
        this.path = path;
        this.downloaderWhenFileMiss = downloaderWhenFileMiss;
    }

    /**
     * Loads the page for {@code request} from its cache file. If the file is
     * missing, unreadable, empty, recorded for a different URL, or has no HTML
     * section, the fallback downloader (if any) is used instead.
     *
     * @param request the request whose URL identifies the cache file
     * @param task    the task whose UUID names the cache sub-directory
     * @return the cached or freshly downloaded page, or {@code null} when the
     *         file is unusable and no fallback produced a page
     */
    @Override
    public Page download(Request request, Task task) {
        String directory = this.path + "/" + task.getUUID() + "/";
        Page page = null;
        BufferedReader bufferedReader = null;
        try {
            final File file = new File(directory + DigestUtils.md5Hex(request.getUrl()));
            bufferedReader = new BufferedReader(new FileReader(file));
            String firstLine = bufferedReader.readLine();
            // Constant-first comparison: readLine() returns null for an empty file.
            if ((URL_PREFIX + request.getUrl()).equals(firstLine)) {
                final String html = getHtml(bufferedReader);
                if (html != null) {
                    page = new Page();
                    page.setRequest(request);
                    page.setUrl(PlainText.create(request.getUrl()));
                    page.setHtml(Html.create(html));
                }
            }
        } catch (FileNotFoundException e) {
            logger.info("File not exist for url " + request.getUrl());
        } catch (IOException e) {
            logger.warn("File read error for url " + request.getUrl(), e);
        } finally {
            // Always release the file handle, including on read errors.
            closeQuietly(bufferedReader);
        }
        if (page == null) {
            page = downloadWhenMiss(request, task);
        }
        return page;
    }

    /**
     * Reads the HTML section from a reader positioned just after the URL line.
     * Lines before the HTML marker are skipped; line breaks inside the HTML are
     * restored as {@code '\n'} because {@code readLine()} strips them.
     *
     * @param bufferedReader reader positioned after the first line of the file
     * @return the HTML text, or {@code null} if the file has no HTML marker line
     * @throws IOException if reading fails
     */
    private String getHtml(BufferedReader bufferedReader) throws IOException {
        String line = bufferedReader.readLine();
        while (line != null && !line.startsWith(HTML_PREFIX)) {
            line = bufferedReader.readLine();
        }
        if (line == null) {
            return null;
        }
        StringBuilder htmlBuilder = new StringBuilder(StringUtils.removeStart(line, HTML_PREFIX));
        while ((line = bufferedReader.readLine()) != null) {
            htmlBuilder.append('\n').append(line);
        }
        return htmlBuilder.toString();
    }

    /**
     * Delegates to the fallback downloader, if one was configured.
     *
     * @param request the request to download
     * @param task    the task the request belongs to
     * @return the page from the fallback downloader, or {@code null} without one
     */
    private Page downloadWhenMiss(Request request, Task task) {
        Page page = null;
        if (downloaderWhenFileMiss != null) {
            page = downloaderWhenFileMiss.download(request, task);
        }
        return page;
    }

    /**
     * Closes {@code reader} if it was opened, logging instead of propagating a
     * failure so that a close error never hides the result of the read.
     *
     * @param reader the reader to close, may be {@code null}
     */
    private void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            logger.warn("File close error", e);
        }
    }
}
......@@ -11,6 +11,7 @@ import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
......@@ -26,24 +27,25 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass());
@Override
public Page download(Request request, Site site) {
public Page download(Request request, Task task) {
Site site = task.getSite();
logger.info("downloading page " + request.getUrl());
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
String encoding = site.getEncoding();
String charset = site.getCharset();
try {
HttpGet httpGet = new HttpGet(request.getUrl());
HttpResponse httpResponse = httpClient.execute(httpGet);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (site.getAcceptStatCode().contains(statusCode)) {
//charset
if (encoding == null){
if (charset == null){
String value = httpResponse.getEntity().getContentType().getValue();
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
}
//
handleGzip(httpResponse);
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
site.getEncoding());
charset);
Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl()));
......
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Selectable;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
......@@ -20,6 +19,8 @@ public class FilePipeline implements Pipeline {
private String path = "/data/temp/webmagic/";
private Logger logger = Logger.getLogger(getClass());
public FilePipeline() {
}
......@@ -36,15 +37,12 @@ public class FilePipeline implements Pipeline {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
printWriter.println("url:\t" + page.getUrl());
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings());
}
printWriter.println("html:\t" + page.getHtml());
printWriter.close();
} catch (IOException e) {
e.printStackTrace();
logger.warn("write file error",e);
}
}
}
......@@ -75,7 +75,7 @@ public class UrlUtils {
return domain;
}
private static Pattern patternForHref = Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
private static Pattern patternForHref = Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
public static String fixAllRelativeHrefs(String html, String url) {
StringBuilder stringBuilder = new StringBuilder();
......
......@@ -19,7 +19,7 @@ public class HttpClientDownloaderTest {
public void testCookie() {
Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask());
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
}
}
......@@ -34,7 +34,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
public Site getSite() {
if (site==null){
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
}
return site;
}
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.FileDownloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.util.List;
/**
 * Sample processor that queues every link on a page whose URL matches
 * {@code .*book\.douban\.com.*}, starting from http://book.douban.com/.
 *
 * Author yihua.huang@dianping.com
 * Date: 13-6-24
 * Time: 2:12 PM
 */
public class GlobalProcessor implements PageProcessor {

    /** User agent sent with the requests of this sample. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /** Site description, built on the first call to {@link #getSite()}. */
    private Site site;

    /**
     * Adds all links of the page that match the douban book pattern as target requests.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(
                page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings());
    }

    /**
     * Returns the site description, creating it on first use.
     *
     * @return the site for this processor
     */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = Site.me()
                .setDomain("douban.com")
                .addStartUrl("http://book.douban.com/")
                .setUserAgent(USER_AGENT);
        return site;
    }

    /**
     * Runs the sample with 10 threads, a file-backed scheduler, a file downloader
     * that falls back to HTTP, and a file pipeline.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new GlobalProcessor())
                .thread(10)
                .scheduler(new FileCacheQueueScheduler("/data/webmagic/github"))
                .downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader()))
                .pipeline(new FilePipeline("/data/webmagic/douban"))
                .run();
    }
}
......@@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}
......@@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}
......@@ -29,7 +29,7 @@ public class SpiderTest {
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getEncoding());
System.out.println(pageProcessor2.getSite().getCharset());
pageProcessor2.getSite().setSleepTime(500);
Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment