Commit c1471718 authored by yihua.huang

extractors

parent 20705b34
...@@ -43,6 +43,8 @@ public class CssSelector implements Selector { ...@@ -43,6 +43,8 @@ public class CssSelector implements Selector {
private String getValue(Element element) { private String getValue(Element element) {
if (attrName == null) { if (attrName == null) {
return element.outerHtml(); return element.outerHtml();
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
return element.html();
} else { } else {
return element.attr(attrName); return element.attr(attrName);
} }
......
...@@ -26,9 +26,9 @@ public class OrSelector implements Selector { ...@@ -26,9 +26,9 @@ public class OrSelector implements Selector {
@Override @Override
public String select(String text) { public String select(String text) {
for (Selector selector : selectors) { for (Selector selector : selectors) {
text = selector.select(text); String result = selector.select(text);
if (text != null) { if (result != null) {
return text; return result;
} }
} }
return null; return null;
......
...@@ -20,7 +20,9 @@ public class RegexSelector implements Selector { ...@@ -20,7 +20,9 @@ public class RegexSelector implements Selector {
private Pattern regex; private Pattern regex;
public RegexSelector(String regexStr) { private int group = 1;
public RegexSelector(String regexStr, int group) {
if (StringUtils.isBlank(regexStr)) { if (StringUtils.isBlank(regexStr)) {
throw new IllegalArgumentException("regex must not be empty"); throw new IllegalArgumentException("regex must not be empty");
} }
...@@ -36,11 +38,16 @@ public class RegexSelector implements Selector { ...@@ -36,11 +38,16 @@ public class RegexSelector implements Selector {
} catch (PatternSyntaxException e) { } catch (PatternSyntaxException e) {
throw new IllegalArgumentException("invalid regex", e); throw new IllegalArgumentException("invalid regex", e);
} }
this.group = group;
}
public RegexSelector(String regexStr) {
this(regexStr, 1);
} }
@Override @Override
public String select(String text) { public String select(String text) {
return selectGroup(text).get(1); return selectGroup(text).get(group);
} }
@Override @Override
...@@ -48,7 +55,7 @@ public class RegexSelector implements Selector { ...@@ -48,7 +55,7 @@ public class RegexSelector implements Selector {
List<String> strings = new ArrayList<String>(); List<String> strings = new ArrayList<String>();
List<RegexResult> results = selectGroupList(text); List<RegexResult> results = selectGroupList(text);
for (RegexResult result : results) { for (RegexResult result : results) {
strings.add(result.get(1)); strings.add(result.get(group));
} }
return strings; return strings;
} }
......
...@@ -27,7 +27,11 @@ public class SelectorFactory { ...@@ -27,7 +27,11 @@ public class SelectorFactory {
} }
public RegexSelector newRegexSelector(String regex, int group) { public RegexSelector newRegexSelector(String regex, int group) {
return newSelector(RegexSelector.class, regex, String.valueOf(group)); String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
if (innerCache.get(cacheKey) != null) {
return (RegexSelector) innerCache.get(cacheKey);
}
return new RegexSelector(regex, group);
} }
public ReplaceSelector newReplaceSelector(String regex, String replacement) { public ReplaceSelector newReplaceSelector(String regex, String replacement) {
......
package us.codecraft.webmagic.selector;
/**
 * Static shortcuts for obtaining selectors, intended for static import.<br>
 * Regex and XPath selectors are requested from {@link SelectorFactory};
 * CSS, "and" and "or" selectors are constructed directly on every call.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public abstract class Selectors {

    /**
     * Single access point to the selector factory used by the shortcuts below.
     *
     * @return whatever {@code SelectorFactory.getInstatnce()} returns at call time
     */
    private static SelectorFactory factory() {
        return SelectorFactory.getInstatnce();
    }

    /**
     * Regex selector obtained from the factory's one-argument overload.
     *
     * @param expr regular expression
     * @return selector supplied by the factory
     */
    public static RegexSelector regex(String expr) {
        return factory().newRegexSelector(expr);
    }

    /**
     * Regex selector obtained from the factory for an explicit capture group.
     *
     * @param expr  regular expression
     * @param group index of the capture group to extract
     * @return selector supplied by the factory
     */
    public static RegexSelector regex(String expr, int group) {
        return factory().newRegexSelector(expr, group);
    }

    /**
     * CSS selector built from the expression alone.
     *
     * @param expr CSS query
     * @return a newly constructed selector
     */
    public static CssSelector $(String expr) {
        return new CssSelector(expr);
    }

    /**
     * CSS selector built from the expression and an attribute name.
     *
     * @param expr     CSS query
     * @param attrName attribute name handed to the {@code CssSelector} constructor
     * @return a newly constructed selector
     */
    public static CssSelector $(String expr, String attrName) {
        return new CssSelector(expr, attrName);
    }

    /**
     * XPath selector obtained from the factory.
     *
     * @param expr XPath expression
     * @return selector supplied by the factory
     */
    public static XpathSelector xpath(String expr) {
        return factory().newXpathSelector(expr);
    }

    /**
     * Wraps the given selectors in an {@code AndSelector}.
     *
     * @param selectors selectors to combine
     * @return a newly constructed combined selector
     */
    public static AndSelector and(Selector... selectors) {
        return new AndSelector(selectors);
    }

    /**
     * Wraps the given selectors in an {@code OrSelector}.
     *
     * @param selectors selectors to combine
     * @return a newly constructed combined selector
     */
    public static OrSelector or(Selector... selectors) {
        return new OrSelector(selectors);
    }

    /**
     * Ad-hoc usage example: combines a regex, an XPath and a CSS selector with
     * {@link #or(Selector...)} and runs the result against a fixed input.
     * The selection result is discarded.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String input = "a";
        Selector byRegex = regex("<title>(.*)</title>");
        Selector byXpath = xpath("//title");
        Selector byCss = $("title");
        or(byRegex, byXpath, byCss).select(input);
    }
}
\ No newline at end of file
package us.codecraft.webmagic.selector;
import junit.framework.Assert;
import org.junit.Test;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Exercises the static shortcuts in {@code Selectors}: each selector kind on
 * its own, then the "and" / "or" combinators.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ExtractorsTest {

    String html = "<div><h1>test<a href=\"xxx\">aabbcc</a></h1></div>";

    String html2 = "<title>aabbcc</title>";

    /**
     * Runs the selector against the source text and compares the outcome
     * with the expected string.
     */
    private void assertSelected(String expected, Selector selector, String source) {
        String actual = selector.select(source);
        Assert.assertEquals(expected, actual);
    }

    @Test
    public void testEach() {
        // CSS: no attribute name, a real attribute, then the "innerHtml" pseudo-attribute.
        assertSelected("<a href=\"xxx\">aabbcc</a>", $("div h1 a"), html);
        assertSelected("xxx", $("div h1 a", "href"), html);
        assertSelected("aabbcc", $("div h1 a", "innerHtml"), html);

        // XPath attribute lookup.
        assertSelected("xxx", xpath("//a/@href"), html);

        // Regex: default group first, then an explicitly requested second group.
        assertSelected("xxx", regex("a href=\"(.*)\""), html);
        assertSelected("xxx", regex("(a href)=\"(.*)\"", 2), html);
    }

    @Test
    public void testCombo() {
        // "and": CSS selector combined with a regex, applied to the title document.
        assertSelected("bb", and($("title"), regex("aa(bb)cc")), html2);

        // "or": the same combined selector is reused against both documents.
        OrSelector eitherSelector = or($("div h1 a", "innerHtml"), xpath("//title"));
        assertSelected("aabbcc", eitherSelector, html);
        assertSelected("aabbcc", eitherSelector, html2);
    }
}
...@@ -5,8 +5,6 @@ import org.junit.Test; ...@@ -5,8 +5,6 @@ import org.junit.Test;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午7:13
*/ */
public class RegexSelectorTest { public class RegexSelectorTest {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment