Commit 96454fd7 authored by yihua.huang's avatar yihua.huang

update java doc

parent 81e7f798
......@@ -101,7 +101,7 @@ public class Page {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
break;
}
s = UrlUtils.fixRelativeUrl(s, url.toString());
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
}
......@@ -116,7 +116,7 @@ public class Page {
return;
}
synchronized (targetRequests) {
requestString = UrlUtils.fixRelativeUrl(requestString, url.toString());
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString));
}
}
......
......@@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader {
//charset
if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue();
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
charset = UrlUtils.getCharset(value);
}
//
handleGzip(httpResponse);
......@@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader {
Header ceheader = httpResponse.getEntity().getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
for (int i = 0; i < codecs.length; i++) {
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
for (HeaderElement codec : codecs) {
if (codec.getName().equalsIgnoreCase("gzip")) {
httpResponse.setEntity(
new GzipDecompressingEntity(httpResponse.getEntity()));
}
......
......@@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable;
import java.util.Map;
/**
* 命令行输出抽取结果。可用于测试。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:45
......
......@@ -11,6 +11,7 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
* 持久化到文件的接口。
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午6:28
......@@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
/**
 * Creates a FilePipeline that uses the default save path "/data/temp/webmagic/".
 */
public FilePipeline() {
}
/**
 * Creates a FilePipeline.
 *
 * @param path the path under which files are saved
 */
public FilePipeline(String path) {
this.path = path;
}
......
......@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
/**
* Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:39
......
......@@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
* 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
* Implement this interface to build various kinds of spiders.<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午11:42
......@@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site;
public interface PageProcessor {
/**
* extends the class to implements variaty spiders
* 定义如何处理页面,包括链接提取、内容抽取等。
* @param page
*/
public void process(Page page);
/**
* the site the processor for
* 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
* @return site
*/
public Site getSite();
......
......@@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
/**
* 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-22
* Time: 下午9:15
......@@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor {
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
//compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
}
......
......@@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:13
......@@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile();
readUrlFile();
} catch (IOException e) {
logger.error("init file error",e);
}
}
......@@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler {
private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
String line = null;
String line;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
cursor = new AtomicInteger(NumberUtils.toInt(line));
......
......@@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
* 内存队列实现的线程安全Scheduler。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:13
......
......@@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Interface for url management and scheduling, covering the crawl queue, url de-duplication and similar features.<br>
 * Its methods take a Task parameter, which is reserved for sharing a single Scheduler among multiple Tasks (a Spider is a Task).<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 1:12 PM
 */
public interface Scheduler {
/**
 * Adds a link to be crawled.
 * @param request the link to be crawled
 * @param task the task definition, to support a single Scheduler serving multiple Tasks
 */
public void push(Request request,Task task);
/**
 * Returns the next link to crawl.
 * @param task the task definition, to support a single Scheduler serving multiple Tasks
 * @return the next request to crawl
 */
public Request poll(Task task);
}
<html>
<body>
包含url管理和调度的接口Schedular及它的几个实现类。
包含url管理和调度的接口Scheduler及它的几个实现类。
</body>
</html>
......@@ -10,6 +10,7 @@ import java.util.ArrayList;
import java.util.List;
/**
* css风格的选择器。包装了Jsoup。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午9:39
......
package us.codecraft.webmagic.selector;
/**
* 封装正则表达式抽取接口的类。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午7:39
......
......@@ -18,7 +18,7 @@ public interface Selectable {
public Selectable xpath(String xpath);
/**
* select list with jquery selector
* select list with css selector
*
* @param
* @return
......
......@@ -6,6 +6,7 @@ import java.util.ArrayList;
import java.util.List;
/**
* xpath的选择器。包装了HtmlCleaner。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午9:39
......@@ -52,12 +53,12 @@ public class XpathSelector implements Selector {
try {
Object[] objects = tagNode.evaluateXPath(xpathStr);
if (objects != null && objects.length >= 1) {
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof TagNode) {
TagNode tagNode1 = (TagNode) objects[i];
for (Object object : objects) {
if (object instanceof TagNode) {
TagNode tagNode1 = (TagNode) object;
results.add(htmlCleaner.getInnerHtml(tagNode1));
} else {
results.add(objects[i].toString());
results.add(object.toString());
}
}
}
......
......@@ -14,7 +14,13 @@ public class UrlUtils {
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
public static String fixRelativeUrl(String url, String refer) {
/**
* 将url相对地址转化为绝对地址
* @param url url地址
* @param refer url地址来自哪个页面
* @return
*/
public static String canonicalizeUrl(String url, String refer) {
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
return url;
}
......@@ -62,12 +68,12 @@ public class UrlUtils {
private static Pattern patternForProtocal = Pattern.compile("[\\w]+://");
public static String removeProtocal(String url) {
public static String removeProtocol(String url) {
return patternForProtocal.matcher(url).replaceAll("");
}
public static String getDomain(String url) {
String domain = removeProtocal(url);
String domain = removeProtocol(url);
int i = StringUtils.indexOf(domain, "/", 1);
if (i > 0) {
domain = StringUtils.substring(domain, 0, i);
......@@ -84,7 +90,7 @@ public class UrlUtils {
while (matcher.find()) {
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
stringBuilder.append(matcher.group(1));
stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\"");
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
lastEnd = matcher.end();
}
stringBuilder.append(StringUtils.substring(html, lastEnd));
......
......@@ -12,18 +12,18 @@ public class UrlUtilsTest {
@Test
public void testFixRelativeUrl() {
String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com");
String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
System.out.println("fix: " + fixrelativeurl);
Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com");
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com");
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com");
// System.out.println("fix: " + fixrelativeurl);
......@@ -628,7 +628,6 @@ public class UrlUtilsTest {
"\t\t\t<script src=\"http://discuz.gtimg.cn/cloud/scripts/discuz_tips.js?v=1\" type=\"text/javascript\" charset=\"UTF-8\"></script></body>\n" +
"</html>\n";
String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/");
String text = "<a class=\"xu_subscribe\" href=\"home.php?mod=spacecp&amp;ac=profile&amp;op=info\" >订阅<span >虎嗅</span></a>";
Assert.assertTrue(html.contains("<a href=\"article"));
Assert.assertFalse(newHtml.contains("<a href=\"article"));
}
......
......@@ -14,6 +14,6 @@ public class FreemarkerPipelineTest {
@Test
public void testTemplateLoad() throws IOException {
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
new FreemarkerPipeline("wordpress.ftl");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment