Commit 96454fd7 authored by yihua.huang

update java doc

parent 81e7f798
...@@ -101,7 +101,7 @@ public class Page { ...@@ -101,7 +101,7 @@ public class Page {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
break; break;
} }
s = UrlUtils.fixRelativeUrl(s, url.toString()); s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s)); targetRequests.add(new Request(s));
} }
} }
...@@ -116,7 +116,7 @@ public class Page { ...@@ -116,7 +116,7 @@ public class Page {
return; return;
} }
synchronized (targetRequests) { synchronized (targetRequests) {
requestString = UrlUtils.fixRelativeUrl(requestString, url.toString()); requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString)); targetRequests.add(new Request(requestString));
} }
} }
......
...@@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader {
//charset //charset
if (charset == null) { if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); charset = UrlUtils.getCharset(value);
} }
// //
handleGzip(httpResponse); handleGzip(httpResponse);
...@@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader { ...@@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader {
Header ceheader = httpResponse.getEntity().getContentEncoding(); Header ceheader = httpResponse.getEntity().getContentEncoding();
if (ceheader != null) { if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements(); HeaderElement[] codecs = ceheader.getElements();
for (int i = 0; i < codecs.length; i++) { for (HeaderElement codec : codecs) {
if (codecs[i].getName().equalsIgnoreCase("gzip")) { if (codec.getName().equalsIgnoreCase("gzip")) {
httpResponse.setEntity( httpResponse.setEntity(
new GzipDecompressingEntity(httpResponse.getEntity())); new GzipDecompressingEntity(httpResponse.getEntity()));
} }
......
...@@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable; ...@@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable;
import java.util.Map; import java.util.Map;
/** /**
* 命令行输出抽取结果。可用于测试。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:45 * Time: 下午1:45
......
...@@ -11,6 +11,7 @@ import java.io.IOException; ...@@ -11,6 +11,7 @@ import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
/** /**
* 持久化到文件的接口。
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午6:28 * Time: 下午6:28
...@@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline { ...@@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
/**
* 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
*/
public FilePipeline() { public FilePipeline() {
} }
/**
* 新建一个FilePipeline
* @param path 文件保存路径
*/
public FilePipeline(String path) { public FilePipeline(String path) {
this.path = path; this.path = path;
} }
......
...@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page; ...@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
/** /**
* Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:39 * Time: 下午1:39
......
...@@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page; ...@@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
/** /**
* 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
* Implement this interface to build various spiders.<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午11:42 * Time: 上午11:42
...@@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site; ...@@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site;
public interface PageProcessor { public interface PageProcessor {
/** /**
* extends the class to implements variaty spiders * 定义如何处理页面,包括链接提取、内容抽取等。
* @param page * @param page
*/ */
public void process(Page page); public void process(Page page);
/** /**
* the site the processor for * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
* @return site * @return site
*/ */
public Site getSite(); public Site getSite();
......
...@@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils; ...@@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List; import java.util.List;
/** /**
* 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-22 * Date: 13-4-22
* Time: 下午9:15 * Time: 下午9:15
...@@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor { ...@@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor {
public SimplePageProcessor(String startUrl, String urlPattern) { public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl). this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
//compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
} }
......
...@@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean; ...@@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:13 * Time: 下午1:13
...@@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler { ...@@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile(); readCursorFile();
readUrlFile(); readUrlFile();
} catch (IOException e) { } catch (IOException e) {
logger.error("init file error",e);
} }
} }
...@@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler { ...@@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler {
private void readCursorFile() throws IOException { private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
String line = null; String line;
//read the last number //read the last number
while ((line = fileCursorReader.readLine()) != null) { while ((line = fileCursorReader.readLine()) != null) {
cursor = new AtomicInteger(NumberUtils.toInt(line)); cursor = new AtomicInteger(NumberUtils.toInt(line));
......
...@@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue; ...@@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
/** /**
* 内存队列实现的线程安全Scheduler。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:13 * Time: 下午1:13
......
...@@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request; ...@@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
/** /**
* 包含url管理和调度的接口。包括url抓取队列,url去重等功能。<br>
* Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午1:12 * Time: 下午1:12
*/ */
public interface Scheduler { public interface Scheduler {
/**
* 加入一个待抓取的链接
* @param request 待抓取的链接
* @param task 定义的任务,以满足单Scheduler多Task的情况
*/
public void push(Request request,Task task); public void push(Request request,Task task);
/**
* 返回下一个要抓取的链接
* @param task 定义的任务,以满足单Scheduler多Task的情况
* @return 下一个要抓取的链接
*/
public Request poll(Task task); public Request poll(Task task);
} }
<html> <html>
<body> <body>
包含url管理和调度的接口Schedular及它的几个实现类。 包含url管理和调度的接口Scheduler及它的几个实现类。
</body> </body>
</html> </html>
...@@ -10,6 +10,7 @@ import java.util.ArrayList; ...@@ -10,6 +10,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* css风格的选择器。包装了Jsoup。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午9:39 * Time: 上午9:39
......
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
/** /**
* 封装正则表达式抽取接口的类。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午7:39 * Time: 上午7:39
......
...@@ -18,7 +18,7 @@ public interface Selectable { ...@@ -18,7 +18,7 @@ public interface Selectable {
public Selectable xpath(String xpath); public Selectable xpath(String xpath);
/** /**
* select list with jquery selector * select list with css selector
* *
* @param * @param
* @return * @return
......
...@@ -6,6 +6,7 @@ import java.util.ArrayList; ...@@ -6,6 +6,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* xpath的选择器。包装了HtmlCleaner。<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午9:39 * Time: 上午9:39
...@@ -52,12 +53,12 @@ public class XpathSelector implements Selector { ...@@ -52,12 +53,12 @@ public class XpathSelector implements Selector {
try { try {
Object[] objects = tagNode.evaluateXPath(xpathStr); Object[] objects = tagNode.evaluateXPath(xpathStr);
if (objects != null && objects.length >= 1) { if (objects != null && objects.length >= 1) {
for (int i = 0; i < objects.length; i++) { for (Object object : objects) {
if (objects[i] instanceof TagNode) { if (object instanceof TagNode) {
TagNode tagNode1 = (TagNode) objects[i]; TagNode tagNode1 = (TagNode) object;
results.add(htmlCleaner.getInnerHtml(tagNode1)); results.add(htmlCleaner.getInnerHtml(tagNode1));
} else { } else {
results.add(objects[i].toString()); results.add(object.toString());
} }
} }
} }
......
...@@ -14,7 +14,13 @@ public class UrlUtils { ...@@ -14,7 +14,13 @@ public class UrlUtils {
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
public static String fixRelativeUrl(String url, String refer) { /**
* 将url相对地址转化为绝对地址
* @param url url地址
* @param refer url地址来自哪个页面
* @return 转化后的绝对地址
*/
public static String canonicalizeUrl(String url, String refer) {
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
return url; return url;
} }
...@@ -62,12 +68,12 @@ public class UrlUtils { ...@@ -62,12 +68,12 @@ public class UrlUtils {
private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); private static Pattern patternForProtocal = Pattern.compile("[\\w]+://");
public static String removeProtocal(String url) { public static String removeProtocol(String url) {
return patternForProtocal.matcher(url).replaceAll(""); return patternForProtocal.matcher(url).replaceAll("");
} }
public static String getDomain(String url) { public static String getDomain(String url) {
String domain = removeProtocal(url); String domain = removeProtocol(url);
int i = StringUtils.indexOf(domain, "/", 1); int i = StringUtils.indexOf(domain, "/", 1);
if (i > 0) { if (i > 0) {
domain = StringUtils.substring(domain, 0, i); domain = StringUtils.substring(domain, 0, i);
...@@ -84,7 +90,7 @@ public class UrlUtils { ...@@ -84,7 +90,7 @@ public class UrlUtils {
while (matcher.find()) { while (matcher.find()) {
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
stringBuilder.append(matcher.group(1)); stringBuilder.append(matcher.group(1));
stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\""); stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
lastEnd = matcher.end(); lastEnd = matcher.end();
} }
stringBuilder.append(StringUtils.substring(html, lastEnd)); stringBuilder.append(StringUtils.substring(html, lastEnd));
......
...@@ -12,18 +12,18 @@ public class UrlUtilsTest { ...@@ -12,18 +12,18 @@ public class UrlUtilsTest {
@Test @Test
public void testFixRelativeUrl() { public void testFixRelativeUrl() {
String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com"); String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
System.out.println("fix: " + fixrelativeurl); System.out.println("fix: " + fixrelativeurl);
Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); // fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com");
// System.out.println("fix: " + fixrelativeurl); // System.out.println("fix: " + fixrelativeurl);
...@@ -628,7 +628,6 @@ public class UrlUtilsTest { ...@@ -628,7 +628,6 @@ public class UrlUtilsTest {
"\t\t\t<script src=\"http://discuz.gtimg.cn/cloud/scripts/discuz_tips.js?v=1\" type=\"text/javascript\" charset=\"UTF-8\"></script></body>\n" + "\t\t\t<script src=\"http://discuz.gtimg.cn/cloud/scripts/discuz_tips.js?v=1\" type=\"text/javascript\" charset=\"UTF-8\"></script></body>\n" +
"</html>\n"; "</html>\n";
String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/");
String text = "<a class=\"xu_subscribe\" href=\"home.php?mod=spacecp&amp;ac=profile&amp;op=info\" >订阅<span >虎嗅</span></a>";
Assert.assertTrue(html.contains("<a href=\"article")); Assert.assertTrue(html.contains("<a href=\"article"));
Assert.assertFalse(newHtml.contains("<a href=\"article")); Assert.assertFalse(newHtml.contains("<a href=\"article"));
} }
......
...@@ -14,6 +14,6 @@ public class FreemarkerPipelineTest { ...@@ -14,6 +14,6 @@ public class FreemarkerPipelineTest {
@Test @Test
public void testTemplateLoad() throws IOException { public void testTemplateLoad() throws IOException {
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl"); new FreemarkerPipeline("wordpress.ftl");
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment