Commit 1148450f authored by yihua.huang's avatar yihua.huang

Update FileCache to be more useful

parent 7829c8fe
...@@ -16,13 +16,11 @@ public class SimplePageProcessor implements PageProcessor { ...@@ -16,13 +16,11 @@ public class SimplePageProcessor implements PageProcessor {
private String urlPattern; private String urlPattern;
private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
private Site site; private Site site;
public SimplePageProcessor(String startUrl, String urlPattern) { public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl). this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); setDomain(UrlUtils.getDomain(startUrl));
//compile "*" expression to regex //compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
......
...@@ -3,42 +3,45 @@ package us.codecraft.webmagic.downloader; ...@@ -3,42 +3,45 @@ package us.codecraft.webmagic.downloader;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.*;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.*; import java.io.*;
/** /**
* 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br> * Download file and saved to file for cache.<br>
* @author code4crafer@gmail.com *
* Date: 13-6-24 *
* Time: 上午7:24 * @author code4crafter@gmail.com
* @since 0.2.1
*/ */
public class FileDownloader implements Downloader { public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
private String path = "/data/temp/webmagic/";
private Downloader downloaderWhenFileMiss; private Downloader downloaderWhenFileMiss;
private final PageProcessor pageProcessor;
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
public FileDownloader() { public FileCache(String startUrl, String urlPattern) {
this("/data/temp/webmagic/", null); this(startUrl, urlPattern, "/data/webmagic/temp/");
} }
public FileDownloader(String path) { public FileCache(String startUrl, String urlPattern, String path) {
this(path, null); this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
setPath(path);
downloaderWhenFileMiss = new HttpClientDownloader();
} }
public FileDownloader(String path, Downloader downloaderWhenFileMiss) { public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
this.downloaderWhenFileMiss = downloaderWhenFileMiss; this.downloaderWhenFileMiss = downloaderWhenFileMiss;
return this;
} }
@Override @Override
...@@ -46,16 +49,15 @@ public class FileDownloader implements Downloader { ...@@ -46,16 +49,15 @@ public class FileDownloader implements Downloader {
String path = this.path + "/" + task.getUUID() + "/"; String path = this.path + "/" + task.getUUID() + "/";
Page page = null; Page page = null;
try { try {
final File file = new File(path + DigestUtils.md5Hex(request.getUrl())); final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
String line = null; String line = bufferedReader.readLine();
line = bufferedReader.readLine();
if (line.equals("url:\t" + request.getUrl())) { if (line.equals("url:\t" + request.getUrl())) {
final String html = getHtml(bufferedReader); final String html = getHtml(bufferedReader);
page = new Page(); page = new Page();
page.setRequest(request); page.setRequest(request);
page.setUrl(PlainText.create(request.getUrl())); page.setUrl(PlainText.create(request.getUrl()));
page.setHtml(Html.create(html)); page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
} }
} catch (IOException e) { } catch (IOException e) {
if (e instanceof FileNotFoundException) { if (e instanceof FileNotFoundException) {
...@@ -77,11 +79,11 @@ public class FileDownloader implements Downloader { ...@@ -77,11 +79,11 @@ public class FileDownloader implements Downloader {
private String getHtml(BufferedReader bufferedReader) throws IOException { private String getHtml(BufferedReader bufferedReader) throws IOException {
String line; String line;
StringBuilder htmlBuilder= new StringBuilder(); StringBuilder htmlBuilder = new StringBuilder();
line = bufferedReader.readLine(); line = bufferedReader.readLine();
line = StringUtils.removeStart(line, "html:\t"); line = StringUtils.removeStart(line, "html:\t");
htmlBuilder.append(line); htmlBuilder.append(line);
while ((line=bufferedReader.readLine())!=null){ while ((line = bufferedReader.readLine()) != null) {
htmlBuilder.append(line); htmlBuilder.append(line);
} }
return htmlBuilder.toString(); return htmlBuilder.toString();
...@@ -94,4 +96,27 @@ public class FileDownloader implements Downloader { ...@@ -94,4 +96,27 @@ public class FileDownloader implements Downloader {
} }
return page; return page;
} }
@Override
/**
 * Pipeline hook: persists the crawled page to a cache file.
 * <p>
 * The file is written under {@code path/<task UUID>/} and is named after the
 * MD5 hex digest of the request URL with an {@code .html} suffix. Its first
 * line is {@code "url:\t" + url} and the remainder is {@code "html:\t"}
 * followed by the value stored under the {@code "html"} key of the result items.
 * <p>
 * I/O failures are logged as warnings and never propagated to the caller.
 * <p>
 * NOTE(review): the download side of this class appears to look the cache file
 * up by the bare MD5 name (no {@code .html} suffix); if so, files written here
 * are never found again. Confirm and align the two names.
 *
 * @param resultItems extraction results; supplies the request URL and the "html" value
 * @param task        the running task; its UUID selects the cache sub-directory
 */
public void process(ResultItems resultItems, Task task) {
    // Read the URL once; it is used for both the file name and the header line.
    final String url = resultItems.getRequest().getUrl();
    final String directory = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    PrintWriter printWriter = null;
    try {
        printWriter = new PrintWriter(new FileWriter(getFile(directory + DigestUtils.md5Hex(url) + ".html")));
        printWriter.println("url:\t" + url);
        printWriter.println("html:\t" + resultItems.get("html"));
        // PrintWriter never throws on write failures; checkError() flushes and
        // reports whether any write went wrong so the failure is not silent.
        if (printWriter.checkError()) {
            logger.warn("write file error: " + url);
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    } finally {
        // Always release the file handle, even if an unchecked exception escapes.
        if (printWriter != null) {
            printWriter.close();
        }
    }
}
/**
 * PageProcessor hook: hands the page to the wrapped page processor
 * unchanged; this class adds no extraction logic of its own.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    this.pageProcessor.process(page);
}
/**
 * Returns the site configuration of the wrapped page processor.
 *
 * @return whatever the wrapped page processor reports as its site
 */
@Override
public Site getSite() {
    final Site delegateSite = this.pageProcessor.getSite();
    return delegateSite;
}
} }
package us.codecraft.webmagic.downloader;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
/**
* @author code4crafter@gmail.com <br>
*/
public class FileCacheTest {

    /**
     * Runs a spider in which a single {@link FileCache} instance plays all three
     * roles: page processor, downloader, and pipeline.
     * The test makes no assertions; it only checks that the run completes.
     */
    // @Ignore("takes long")
    @Test
    public void test() {
        final String startUrl = "http://my.oschina.net/flashsword/blog";
        final String urlPattern = "http://my.oschina.net/flashsword/blog/*";
        final FileCache cache = new FileCache(startUrl, urlPattern);

        Spider.create(cache)
                .downloader(cache)
                .pipeline(cache)
                .run();
    }
}
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
<date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated> <date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated>
</meta> </meta>
<comment> <comment>
<key><![CDATA[us.codecraft.webmagic.downloader.FileDownloader]]></key> <key><![CDATA[us.codecraft.webmagic.downloader.FileCache]]></key>
<data><![CDATA[ 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br> <data><![CDATA[ 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br>
@author code4crafer@gmail.com @author code4crafer@gmail.com
Date: 13-6-24 Date: 13-6-24
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment