Commit 5e9e8b25 authored by yihua.huang's avatar yihua.huang

add TextContentSelector

parent 0cc0ccee
...@@ -47,32 +47,44 @@ public class Html extends PlainText { ...@@ -47,32 +47,44 @@ public class Html extends PlainText {
@Override @Override
public Selectable smartContent() { public Selectable smartContent() {
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, strings); return select(smartContentSelector, strings);
} }
@Override @Override
public Selectable links() { public Selectable links() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); XpathSelector xpathSelector = Selectors.xpath("//a/@href");
return selectList(xpathSelector, strings); return selectList(xpathSelector, strings);
} }
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); XpathSelector xpathSelector = Selectors.xpath(xpath);
return selectList(xpathSelector, strings); return selectList(xpathSelector, strings);
} }
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
CssSelector cssSelector = new CssSelector(selector); CssSelector cssSelector = Selectors.$(selector);
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
@Override @Override
public Selectable $(String selector, String attrName) { public Selectable $(String selector, String attrName) {
CssSelector cssSelector = new CssSelector(selector, attrName); CssSelector cssSelector = Selectors.$(selector, attrName);
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
/**
 * Extracts the text content of this html by running a
 * {@link TextContentSelector} (obtained from {@link Selectors#text()})
 * over the current strings.
 *
 * @return the result of the selection
 */
@Override
public Selectable text() {
    return select(Selectors.text(), strings);
}
/**
 * Extracts the text content of this html by running a
 * {@link TextContentSelector} configured with the given separator
 * (obtained from {@link Selectors#text(String)}) over the current strings.
 *
 * @param newlineSeparator separator handed to the text content selector
 * @return the result of the selection
 */
@Override
public Selectable text(String newlineSeparator) {
    return select(Selectors.text(newlineSeparator), strings);
}
} }
...@@ -57,13 +57,13 @@ public class PlainText implements Selectable { ...@@ -57,13 +57,13 @@ public class PlainText implements Selectable {
@Override @Override
public Selectable regex(String regex) { public Selectable regex(String regex) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); RegexSelector regexSelector = Selectors.regex(regex);
return selectList(regexSelector, strings); return selectList(regexSelector, strings);
} }
@Override @Override
public Selectable regex(String regex, int group) { public Selectable regex(String regex, int group) {
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group); RegexSelector regexSelector = Selectors.regex(regex, group);
return selectList(regexSelector, strings); return selectList(regexSelector, strings);
} }
...@@ -106,4 +106,21 @@ public class PlainText implements Selectable { ...@@ -106,4 +106,21 @@ public class PlainText implements Selectable {
return null; return null;
} }
} }
/**
 * No-op for plain text: no extraction is performed here.
 *
 * @return this instance, unchanged
 */
@Override
public Selectable text() {
// Nothing to extract at this level; the Html subclass overrides this.
return this;
}
/**
 * No-op for plain text: no extraction is performed and the separator is ignored.
 *
 * @param newlineSeparator unused in this implementation
 * @return this instance, unchanged
 */
@Override
public Selectable text(String newlineSeparator) {
// Nothing to extract at this level; the Html subclass overrides this.
return this;
}
/**
 * Tells whether this selection holds any result.
 *
 * @return {@code true} if {@code strings} is non-null and has at least one element
 */
@Override
public boolean match() {
    if (strings == null) {
        return false;
    }
    return strings.size() > 0;
}
} }
...@@ -82,6 +82,27 @@ public interface Selectable { ...@@ -82,6 +82,27 @@ public interface Selectable {
*/ */
public String toString(); public String toString();
/**
 * Select the text content of html.
 *
 * @return a selection holding the extracted text
 */
public Selectable text();
/**
 * Select the text content of html, using a caller-supplied newline separator.
 *
 * @param newlineSeparator separator passed to the text extraction;
 *                         implementations that perform no extraction may ignore it
 * @return a selection holding the extracted text
 */
public Selectable text(String newlineSeparator);
/**
 * Tells whether the selection produced any result.
 *
 * @return true if at least one result exists
 */
public boolean match();
/** /**
* multi string result * multi string result
* *
......
...@@ -16,6 +16,10 @@ public abstract class Selectors { ...@@ -16,6 +16,10 @@ public abstract class Selectors {
return SelectorFactory.getInstatnce().newRegexSelector(expr, group); return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
} }
/**
 * Shortcut for obtaining a {@link SmartContentSelector} from the {@link SelectorFactory}.
 *
 * @return the selector returned by the factory
 */
public static SmartContentSelector smartContent() {
return SelectorFactory.getInstatnce().newSmartContentSelector();
}
public static CssSelector $(String expr) { public static CssSelector $(String expr) {
return new CssSelector(expr); return new CssSelector(expr);
} }
...@@ -36,6 +40,14 @@ public abstract class Selectors { ...@@ -36,6 +40,14 @@ public abstract class Selectors {
return new OrSelector(selectors); return new OrSelector(selectors);
} }
/**
 * Creates a {@link TextContentSelector} using its no-argument constructor.
 *
 * @return a new text content selector
 */
public static TextContentSelector text() {
return new TextContentSelector();
}
/**
 * Creates a {@link TextContentSelector} with a custom newline separator.
 *
 * @param newlineSeperator separator passed to the selector's constructor
 * @return a new text content selector
 */
public static TextContentSelector text(String newlineSeperator) {
return new TextContentSelector(newlineSeperator);
}
public static void main(String[] args) { public static void main(String[] args) {
String s = "a"; String s = "a";
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s); or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
......
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
/**
 * Extract text content in html.<br>
 * Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
 * <p>
 * The html is parsed with Jsoup and walked in document order. Text nodes are
 * concatenated, the content of non-visible tags (see {@code TAGS_TO_IGNORE})
 * is dropped, and the configured separator is appended after every
 * block-level tag (see {@code TAGS_IN_NEWLINE}).
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelector implements Selector {

    /** Tags after whose content the newline separator is appended. */
    private static final Set<String> TAGS_IN_NEWLINE = new HashSet<String>(
            Arrays.asList("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"));

    /** Tags whose whole subtree contributes no text. */
    private static final Set<String> TAGS_TO_IGNORE = new HashSet<String>(
            Arrays.asList("head", "style", "script", "noscript", "option"));

    /** Separator appended after each tag listed in {@link #TAGS_IN_NEWLINE}. */
    private final String newLineSeparator;

    /**
     * Creates a selector that separates blocks with {@code "\n"}.
     */
    public TextContentSelector() {
        this("\n");
    }

    /**
     * Creates a selector with a custom block separator.
     *
     * @param newLineSeparator text appended after each block-level tag
     */
    public TextContentSelector(String newLineSeparator) {
        this.newLineSeparator = newLineSeparator;
    }

    /**
     * Parses the given html and returns its text content.
     *
     * @param text html source
     * @return extracted text
     */
    @Override
    public String select(String text) {
        Document doc = Jsoup.parse(text);
        return select0(doc);
    }

    /**
     * Recursively collects the text below {@code element}.
     *
     * @param element root of the subtree to extract
     * @return text of the subtree, or an empty string for ignored tags
     */
    protected String select0(Element element) {
        String tagName = element.tagName().toLowerCase(Locale.ROOT);
        if (TAGS_TO_IGNORE.contains(tagName)) {
            return "";
        }
        StringBuilder textBuilder = new StringBuilder();
        // Element.text() already contains the text of all descendants, so
        // combining it with recursion would emit nested text once per level.
        // Walk the direct child nodes instead: this yields every text node
        // exactly once and preserves document order.
        for (Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                textBuilder.append(((TextNode) child).text());
            } else if (child instanceof Element) {
                textBuilder.append(select0((Element) child));
            }
            // Other node types (comments, data nodes, ...) carry no visible text.
        }
        if (TAGS_IN_NEWLINE.contains(tagName)) {
            textBuilder.append(newLineSeparator);
        }
        return textBuilder.toString();
    }

    /**
     * Not supported: this selector yields a single text result.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<String> selectList(String text) {
        throw new UnsupportedOperationException("TextContentSelector does not support selectList; use select");
    }
}
package us.codecraft.webmagic.selector;
import junit.framework.Assert;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
/**
 * Tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Extracts text from a small nested html fragment using a custom separator.
     */
    @Test
    public void test() {
        String html = "<div class=\"edit-comment-hide\">\n" +
                " <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n" +
                " <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n" +
                " </div>\n" +
                " </div>";
        TextContentSelector textContentSelector = new TextContentSelector("<br>");
        String text = textContentSelector.select(html);
        Assert.assertNotNull(text);
        // The link text must survive extraction.
        Assert.assertTrue(text.contains("http://www.elias.cn/En/ExtMainText"));
    }

    /**
     * Downloads a live page and extracts its text; ignored because it needs network access.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
                .smartContent().text().toString();
        // The original asserted on an undefined identifier `text`; the result variable is `s`.
        Assert.assertNotNull(s);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment