Commit 59ad4cad authored by yihua.huang's avatar yihua.huang

#42 Add jsonpath in annotation mode for json result

parent c2d6d495
...@@ -9,7 +9,7 @@ import java.util.ArrayList; ...@@ -9,7 +9,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* Selectable plain text.<br> * Selectable html.<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0 * @since 0.1.0
...@@ -23,16 +23,28 @@ public class Html extends PlainText { ...@@ -23,16 +23,28 @@ public class Html extends PlainText {
*/ */
private Document document; private Document document;
private boolean init = false;
public Html(List<String> strings) { public Html(List<String> strings) {
super(strings); super(strings);
} }
public Html(String text) { public Html(String text) {
super(text); super(text);
try { }
this.document = Jsoup.parse(text);
} catch (Exception e) { /**
logger.warn("parse document error ", e); * lazy init
*/
private void initDocument() {
if (this.document == null && !init) {
init = true;
//just init once whether the parsing succeeds or not
try {
this.document = Jsoup.parse(getText());
} catch (Exception e) {
logger.warn("parse document error ", e);
}
} }
} }
...@@ -47,6 +59,7 @@ public class Html extends PlainText { ...@@ -47,6 +59,7 @@ public class Html extends PlainText {
@Override @Override
protected Selectable select(Selector selector, List<String> strings) { protected Selectable select(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
for (String string : strings) { for (String string : strings) {
String result = selector.select(string); String result = selector.select(string);
...@@ -59,6 +72,7 @@ public class Html extends PlainText { ...@@ -59,6 +72,7 @@ public class Html extends PlainText {
@Override @Override
protected Selectable selectList(Selector selector, List<String> strings) { protected Selectable selectList(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
for (String string : strings) { for (String string : strings) {
List<String> result = selector.selectList(string); List<String> result = selector.selectList(string);
...@@ -69,6 +83,7 @@ public class Html extends PlainText { ...@@ -69,6 +83,7 @@ public class Html extends PlainText {
@Override @Override
public Selectable smartContent() { public Selectable smartContent() {
initDocument();
SmartContentSelector smartContentSelector = Selectors.smartContent(); SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, strings); return select(smartContentSelector, strings);
} }
......
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
/**
 * Example page model for annotation mode with JsonPath expressions: reads the
 * name and description of an app from the JSON returned by the iTunes lookup URL
 * used in {@link #main(String[])}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class AppStore {

    /** Value selected by the JsonPath expression {@code $..trackName}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
    private String trackName;

    /** Value selected by the JsonPath expression {@code $..description}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
    private String description;

    /**
     * Fetches a single lookup result and prints the extracted fields to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
        // NOTE(review): get(...) presumably yields null when the download or the
        // extraction produces nothing - confirm against OOSpider. Guarding here turns
        // a bare NullPointerException into a readable message.
        if (appStore == null) {
            System.err.println("No AppStore result could be extracted from the lookup response.");
            return;
        }
        System.out.println(appStore.trackName);
        System.out.println(appStore.description);
    }
}
...@@ -239,7 +239,7 @@ class PageModelExtractor { ...@@ -239,7 +239,7 @@ class PageModelExtractor {
} else { } else {
if (objectExtractor.multi) { if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>(); List<Object> os = new ArrayList<Object>();
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString()); List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
for (String s : list) { for (String s : list) {
Object o = processSingle(page, s, false); Object o = processSingle(page, s, false);
if (o != null) { if (o != null) {
...@@ -248,7 +248,7 @@ class PageModelExtractor { ...@@ -248,7 +248,7 @@ class PageModelExtractor {
} }
return os; return os;
} else { } else {
String select = objectExtractor.getSelector().select(page.getHtml().toString()); String select = objectExtractor.getSelector().select(page.getRawText());
Object o = processSingle(page, select, false); Object o = processSingle(page, select, false);
return o; return o;
} }
......
...@@ -24,7 +24,7 @@ public @interface ExtractBy { ...@@ -24,7 +24,7 @@ public @interface ExtractBy {
/** /**
* types of extractor expressions * types of extractor expressions
*/ */
public static enum Type {XPath, Regex, Css} public static enum Type {XPath, Regex, Css, JsonPath}
/** /**
* Extractor type, support XPath, CSS Selector and regex. * Extractor type, support XPath, CSS Selector and regex.
......
...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention; ...@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* Define an extractor for url. Only regex can be used. <br> * Define an extractor to extract data in the URL of the current page. Only regex can be used. <br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
......
...@@ -27,6 +27,9 @@ public class ExtractorUtils { ...@@ -27,6 +27,9 @@ public class ExtractorUtils {
case XPath: case XPath:
selector = getXpathSelector(value); selector = getXpathSelector(value);
break; break;
case JsonPath:
selector = new JsonPathSelector(value);
break;
default: default:
selector = getXpathSelector(value); selector = getXpathSelector(value);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment