Commit 19229dd8, authored by yihua.huang

add JsonFilePageModelPipeline

parent a3256b50
package us.codecraft.webmagic.model;
/**
 * Marks a model that supplies its own identifying key.<br>
 * When a model implementing this interface is output, the value of {@link #key()} is used to
 * identify it (for example, as the name of the file persisted by JsonFilePageModelPipeline).<br>
 * If persisted file names come out garbled, add LANG=zh_CN.UTF-8 to the environment variables
 * of the running process.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-10 <br>
 * Time: 7:39 AM <br>
 */
public interface HasKey {

    /**
     * Supplies the key used to identify this model on output (for example, the name of the
     * file persisted by JsonFilePageModelPipeline).
     *
     * @return key
     */
    String key();

}
package us.codecraft.webmagic.pipeline;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
/**
 * Pipeline that persists each extracted model object to a file in JSON format.<br>
 * Each object is written to {@code <path>/<task UUID>/<name>.json}. The name is the model's
 * {@link HasKey#key()} when the model implements {@link HasKey} and the key is usable,
 * otherwise the MD5 hex digest of the object's reflective string form.<br>
 * File content is always encoded as UTF-8. If persisted file names come out garbled, add
 * LANG=zh_CN.UTF-8 to the environment variables of the running process.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 6:28 PM
 */
public class JsonFilePageModelPipeline implements PageModelPipeline {

    private String path = "/data/webmagic/";

    private Logger logger = Logger.getLogger(getClass());

    /**
     * Creates a pipeline that uses the default save path "/data/webmagic/".
     */
    public JsonFilePageModelPipeline() {
    }

    /**
     * Creates a pipeline that saves under the given path.
     *
     * @param path directory to save files under; a trailing separator is appended when
     *             missing. Must not be null.
     */
    public JsonFilePageModelPipeline(String path) {
        if (!path.endsWith("/") && !path.endsWith("\\")) {
            path += "/";
        }
        this.path = path;
    }

    /**
     * Serializes {@code o} to JSON and writes it to a file inside the directory of the task.
     * I/O failures are logged as warnings and never thrown to the caller.
     *
     * @param o    model object to persist
     * @param task task the object belongs to; its UUID names the sub-directory
     */
    @Override
    public void process(Object o, Task task) {
        // File(parent, child) joins the parts without doubling the separator that
        // this.path already ends with.
        File dir = new File(this.path, String.valueOf(task.getUUID()));
        // mkdirs() also returns false when the directory was created concurrently by
        // someone else, so re-check before giving up.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            logger.warn("create directory error: " + dir);
            return;
        }
        File target = new File(dir, fileNameFor(o) + ".json");
        // Serialize before opening the file so a serialization failure neither leaks a
        // writer nor leaves an empty file behind.
        String json = JSON.toJSONString(o);
        try {
            // Explicit UTF-8 so the output does not depend on the platform default charset.
            PrintWriter printWriter = new PrintWriter(target, "UTF-8");
            try {
                printWriter.write(json);
            } finally {
                printWriter.close();
            }
            // PrintWriter never throws on write/close; it only records the failure.
            if (printWriter.checkError()) {
                logger.warn("write file error: " + target);
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }

    /**
     * Chooses the file name (without extension) for a model object.
     * Path separators in a {@link HasKey} key are replaced with '_' so that a key taken from
     * page content (e.g. a title containing "/") cannot point outside the task directory.
     * A null or empty key falls back to the MD5 digest, like objects without a key.
     */
    private static String fileNameFor(Object o) {
        if (o instanceof HasKey) {
            String key = ((HasKey) o).key();
            if (key != null && key.length() > 0) {
                return key.replace('/', '_').replace('\\', '_');
            }
        }
        return DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o));
    }
}
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.List; import java.util.List;
...@@ -14,7 +15,7 @@ import java.util.List; ...@@ -14,7 +15,7 @@ import java.util.List;
* Time: 上午7:52 <br> * Time: 上午7:52 <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog { public class OschinaBlog implements HasKey{
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
...@@ -27,7 +28,23 @@ public class OschinaBlog { ...@@ -27,7 +28,23 @@ public class OschinaBlog {
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new ConsolePageModelPipeline(), OschinaBlog.class).run(); ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
} }
public String getTitle() {
return title;
}
public String getContent() {
return content;
}
public List<String> getTags() {
return tags;
}
@Override
public String key() {
return title;
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment