Commit 55368919 authored by yihua.huang

add attribute 'text' support for CssSelector #66

parent 88b50d41
...@@ -89,7 +89,7 @@ ...@@ -89,7 +89,7 @@
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId> <artifactId>xsoup</artifactId>
<version>0.1.0</version> <version>0.2.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sf.saxon</groupId> <groupId>net.sf.saxon</groupId>
......
...@@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor { ...@@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString()); page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString());
page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
} }
......
...@@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector; ...@@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector { ...@@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector {
return element.outerHtml(); return element.outerHtml();
} else if ("innerHtml".equalsIgnoreCase(attrName)) { } else if ("innerHtml".equalsIgnoreCase(attrName)) {
return element.html(); return element.html();
} else if ("text".equalsIgnoreCase(attrName)) {
return getText(element);
} else if ("allText".equalsIgnoreCase(attrName)) {
return element.text();
} else { } else {
return element.attr(attrName); return element.attr(attrName);
} }
} }
/**
 * Builds a string from the text nodes that are direct children of the given element.
 *
 * <p>Only children that are {@link TextNode} instances contribute; their text is
 * appended in child order with no separator. Children of any other node type are
 * skipped entirely, so text nested inside them is not part of the result.
 *
 * @param element the element whose direct child nodes are examined
 * @return the concatenated text of the direct text-node children; an empty string
 *         when there are none
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            // Non-text children are ignored here.
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
@Override @Override
public String select(Element element) { public String select(Element element) {
Elements elements = element.select(selectorText); Elements elements = element.select(selectorText);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment