Commit 54904851 authored by yihua.huang

add list output support

parent 2bb6f847
...@@ -20,6 +20,15 @@ public class ConsolePipeline implements Pipeline{ ...@@ -20,6 +20,15 @@ public class ConsolePipeline implements Pipeline{
} }
System.out.println("get page: "+resultItems.getRequest().getUrl()); System.out.println("get page: "+resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {
Iterable value = (Iterable) entry.getValue();
System.out.println(entry.getKey() + ":");
for (Object o : value) {
System.out.println(o);
}
} else {
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
System.out.println(entry.getKey()+":\t"+entry.getValue()); System.out.println(entry.getKey()+":\t"+entry.getValue());
} }
} }
......
...@@ -13,6 +13,7 @@ import java.util.Map; ...@@ -13,6 +13,7 @@ import java.util.Map;
/** /**
* 持久化到文件的接口。 * 持久化到文件的接口。
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午6:28 * Time: 下午6:28
...@@ -32,6 +33,7 @@ public class FilePipeline implements Pipeline { ...@@ -32,6 +33,7 @@ public class FilePipeline implements Pipeline {
/** /**
* 新建一个FilePipeline * 新建一个FilePipeline
*
* @param path 文件保存路径 * @param path 文件保存路径
*/ */
public FilePipeline(String path) { public FilePipeline(String path) {
...@@ -45,18 +47,26 @@ public class FilePipeline implements Pipeline { ...@@ -45,18 +47,26 @@ public class FilePipeline implements Pipeline {
if (!file.exists()) { if (!file.exists()) {
file.mkdirs(); file.mkdirs();
} }
if (resultItems.isSkip()){ if (resultItems.isSkip()) {
return; return;
} }
try { try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl()); printWriter.println("url:\t" + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
printWriter.println(entry.getKey()+":\t"+entry.getValue()); if (entry.getValue() instanceof Iterable) {
Iterable value = (Iterable) entry.getValue();
printWriter.println(entry.getKey() + ":");
for (Object o : value) {
printWriter.println(o);
}
} else {
printWriter.println(entry.getKey() + ":\t" + entry.getValue());
}
} }
printWriter.close(); printWriter.close();
} catch (IOException e) { } catch (IOException e) {
logger.warn("write file error",e); logger.warn("write file error", e);
} }
} }
} }
package us.codecraft.webmagic.samples;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
 * Sample {@link PageProcessor} that follows links under the InfoQ China mini-books
 * section and records every link ending in {@code .pdf} found on each page.
 *
 * <p>When a page has at least one PDF link, the list of links is stored in the result
 * items under the key {@code "pdf"}; otherwise the page's result items are flagged as
 * skipped.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class InfoQMiniBookProcessor implements PageProcessor {

    /**
     * Crawl configuration for this processor.
     *
     * <p>Built once when the processor is constructed and never reassigned. The previous
     * lazy {@code if (site == null)} initialisation in {@link #getSite()} was an
     * unsynchronised check-then-act on a non-final field, which is not safe if the
     * getter is reached from several threads ({@link #main(String[])} runs the spider
     * with five threads).
     */
    private final Site site = Site.me()
            .setDomain("www.infoq.com")
            .addStartUrl("http://www.infoq.com/cn/minibooks")
            // NOTE(review): hard-coded cookie value committed to source; if this is a
            // real account token it should be moved to configuration - confirm.
            .addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH")
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Queues further mini-book pages for crawling and extracts the PDF links of the
     * current page.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Keep crawling inside the mini-books section.
        page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all());

        List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
        if (CollectionUtils.isNotEmpty(pdfLinks)) {
            page.putField("pdf", pdfLinks);
        } else {
            // Nothing to record for this page: flag its result items as skipped.
            page.getResultItems().setSkip(true);
        }
    }

    /**
     * Returns the crawl configuration of this processor.
     *
     * @return the same {@link Site} instance on every call
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the sample crawl with a Redis-backed scheduler on {@code localhost}, a
     * {@link FilePipeline} targeting {@code /data/temp/webmagic/}, and five threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new InfoQMiniBookProcessor())
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/temp/webmagic/"))
                .thread(5)
                .run();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment