Commit db67db81 authored by yihua.huang

#523 remove fixAllRelativeHrefs by default, get absolute urls for links()

parent abd020b4
package us.codecraft.webmagic; package us.codecraft.webmagic;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Map; import java.util.Map;
/** /**
...@@ -76,7 +73,7 @@ public class Page { ...@@ -76,7 +73,7 @@ public class Page {
*/ */
public Html getHtml() { public Html getHtml() {
if (html == null) { if (html == null) {
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl())); html = new Html(rawText, request.getUrl());
} }
return html; return html;
} }
......
...@@ -44,6 +44,16 @@ public class Html extends HtmlNode { ...@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
*/ */
private Document document; private Document document;
/**
 * Builds an Html by parsing {@code text} with jsoup, passing {@code url} to the parser
 * as the document's base URI.
 * If anything goes wrong while parsing, the failure is logged at warn level and the
 * document is left {@code null} instead of the exception being propagated.
 *
 * @param text raw html text to parse
 * @param url  value handed to {@code Jsoup.parse} as the base URI
 */
public Html(String text, String url) {
    Document parsed = null;
    try {
        // Must run before parsing, same order as the single-argument constructor.
        disableJsoupHtmlEntityEscape();
        parsed = Jsoup.parse(text, url);
    } catch (Exception e) {
        logger.warn("parse document error ", e);
    }
    this.document = parsed;
}
public Html(String text) { public Html(String text) {
try { try {
disableJsoupHtmlEntityEscape(); disableJsoupHtmlEntityEscape();
......
...@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable { ...@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
@Override @Override
public Selectable links() { public Selectable links() {
return xpath("//a/@href"); return selectElements(new LinksSelector());
} }
@Override @Override
......
package us.codecraft.webmagic.selector;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Links selector based on jsoup. Collects the {@code href} value of every anchor found
 * below an element, using the absolute url whenever the anchor has a base URI. <br>
 * Only {@link #selectList(Element)} is implemented; the single-value and element-returning
 * variants throw {@link UnsupportedOperationException}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.7.0
 */
public class LinksSelector extends BaseElementSelector {

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String select(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the link targets of all anchors under {@code element}, in document order.
     *
     * @param element root element to search
     * @return one entry per anchor that carries an {@code href} attribute; the absolute url
     *         when the anchor has a non-blank base URI, the raw attribute value otherwise
     */
    @Override
    public List<String> selectList(Element element) {
        // Match "a[href]" rather than "a": an anchor without an href (e.g. <a name="top">)
        // is not a link and would otherwise contribute an empty string to the result.
        Elements anchors = element.select("a[href]");
        List<String> links = new ArrayList<String>(anchors.size());
        for (Element anchor : anchors) {
            // Without a base URI "abs:href" cannot be resolved, so fall back to the raw value.
            String attributeKey = StringUtil.isBlank(anchor.baseUri()) ? "href" : "abs:href";
            links.add(anchor.attr(attributeKey));
        }
        return links;
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Element selectElement(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<Element> selectElements(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Always {@code true}.
     * NOTE(review): presumably signals that this selector yields attribute values rather
     * than elements - confirm against BaseElementSelector.
     */
    @Override
    public boolean hasAttribute() {
        return true;
    }
}
...@@ -48,4 +48,14 @@ public class HtmlTest { ...@@ -48,4 +48,14 @@ public class HtmlTest {
Selectable selectable = html.xpath("//a[1]").nodes().get(0); Selectable selectable = html.xpath("//a[1]").nodes().get(0);
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx"); assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
} }
/**
 * Checks that {@code abs:}-prefixed attribute lookups return absolute urls, both when the
 * base url is passed to the constructor and when it comes from a {@code <base>} tag.
 */
@Test
public void testGetHrefsByJsoup() {
    final String expectedLink = "https://github.com/code4craft/webmagic/issues";
    final String expectedImage = "https://github.com/code4craft/webmagic/webmagic.jpg";

    // Scenario 1: base url supplied as the second constructor argument.
    Html withBaseArgument = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>", "https://github.com/code4craft/webmagic/");
    assertThat(withBaseArgument.xpath("//a[1]/@abs:href").get()).isEqualTo(expectedLink);
    assertThat(withBaseArgument.xpath("//img/@abs:src").get()).isEqualTo(expectedImage);

    // Scenario 2: base url declared inside the markup itself.
    Html withBaseTag = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
    assertThat(withBaseTag.xpath("//a[1]/@abs:href").get()).isEqualTo(expectedLink);
    assertThat(withBaseTag.xpath("//img/@abs:src").get()).isEqualTo(expectedImage);
}
} }
package us.codecraft.webmagic.selector;
import org.junit.Test;
import java.util.List;
/**
* @author code4crafter@gmail.com
* Date: 17/4/8
* Time: 下午9:41
*/
public class LinksSelectorTest {
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
@Test
public void testLinks() throws Exception {
List<String> links = new LinksSelector().selectList(html);
System.out.println(links);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment