Commit 55368919 authored by yihua.huang

add attribute 'text' support for CssSelector #66

parent 88b50d41
......@@ -89,7 +89,7 @@
<dependency>
<!-- NOTE(review): two <version> elements appear below (0.1.0 and 0.2.0). They look like the removed and added lines of a diff hunk; confirm that only one of them is meant to remain. -->
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.1.0</version>
<version>0.2.0</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
......
......@@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor {
/**
 * Populates the "name" and "description" fields of the given page from its HTML.
 *
 * @param page the page whose HTML is queried and on which the fields are set
 */
@Override
public void process(Page page) {
// NOTE(review): the next two statements both set the "name" field with the same
// selector and the "text" attribute, once through $(...) and once through css(...).
// They look like the before/after lines of a diff hunk; confirm only one should remain.
page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString());
// "description" is set from the XPath result for the 'para' divs under lemmaContent-0.
page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
}
......
......@@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import java.util.ArrayList;
......@@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector {
return element.outerHtml();
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
return element.html();
} else if ("text".equalsIgnoreCase(attrName)) {
return getText(element);
} else if ("allText".equalsIgnoreCase(attrName)) {
return element.text();
} else {
return element.attr(attrName);
}
}
/**
 * Builds a string from the text of the element's immediate {@link TextNode} children.
 *
 * <p>Child nodes that are not {@code TextNode} instances are skipped. The {@code text()}
 * value of each remaining child is appended, with no separator, in the order that
 * {@code childNodes()} yields them.
 *
 * @param element the element whose direct child nodes are inspected
 * @return the concatenated text; an empty string when no child is a {@code TextNode}
 */
protected String getText(Element element) {
    final StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
@Override
public String select(Element element) {
Elements elements = element.select(selectorText);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment