Commit 194518fd authored by yihua.huang

add switch

parent 326b97c6
...@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline; ...@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import us.codecraft.webmagic.utils.ThreadUtils; import us.codecraft.webmagic.utils.ThreadUtils;
import java.io.Closeable; import java.io.Closeable;
...@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task { ...@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
/**
 * Switch off xsoup by delegating to {@code EnvironmentUtil.setUseXsoup(false)}.
 * <p>
 * This method is static: it flips a global switch rather than configuring a
 * single spider instance.
 */
public static void xsoupOff(){
EnvironmentUtil.setUseXsoup(false);
}
@Override @Override
public String getUUID() { public String getUUID() {
if (uuid != null) { if (uuid != null) {
......
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
 * Cache parsed element for extract.
 * <p>
 * Keeps the text form and the parsed {@link Element} form of the same content
 * side by side, so a selector can be applied to whichever form it accepts.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public class CacheElement {

    /** Text form of the cached content; passed to selectors that are not {@link ElementSelector}s. */
    public String text;

    /** Parsed form of the cached content; passed to {@link ElementSelector}s. */
    public Element element;

    /**
     * @return the cached text, or {@code null} if none has been assigned
     */
    public String getText() {
        return text;
    }

    /**
     * @return the cached parsed element, or {@code null} if none has been assigned
     */
    public Element getElement() {
        return element;
    }

    /**
     * Apply a selector and return a single result.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 the parsed element, any other selector against the text
     * @return whatever the selector returns for the chosen input
     */
    public String select(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.select(getElement());
        }
        return selector.select(getText());
    }

    /**
     * Apply a selector and return all results.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 the parsed element, any other selector against the text
     * @return whatever the selector returns for the chosen input
     */
    public List<String> selectList(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.selectList(getElement());
        }
        return selector.selectList(getText());
    }
}
...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector; ...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -72,17 +73,22 @@ public class Html extends PlainText { ...@@ -72,17 +73,22 @@ public class Html extends PlainText {
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
if (EnvironmentUtil.useXsoup()) {
XsoupSelector xsoupSelector = new XsoupSelector(xpath); XsoupSelector xsoupSelector = new XsoupSelector(xpath);
if (document!=null){ if (document != null) {
return new Html(xsoupSelector.selectList(document)); return new Html(xsoupSelector.selectList(document));
} }
return selectList(xsoupSelector, strings); return selectList(xsoupSelector, strings);
} else {
XpathSelector xpathSelector = new XpathSelector(xpath);
return selectList(xpathSelector, strings);
}
} }
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
CssSelector cssSelector = Selectors.$(selector); CssSelector cssSelector = Selectors.$(selector);
if (document!=null){ if (document != null) {
return new Html(cssSelector.selectList(document)); return new Html(cssSelector.selectList(document));
} }
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
...@@ -91,7 +97,7 @@ public class Html extends PlainText { ...@@ -91,7 +97,7 @@ public class Html extends PlainText {
@Override @Override
public Selectable $(String selector, String attrName) { public Selectable $(String selector, String attrName) {
CssSelector cssSelector = Selectors.$(selector, attrName); CssSelector cssSelector = Selectors.$(selector, attrName);
if (document!=null){ if (document != null) {
return new Html(cssSelector.selectList(document)); return new Html(cssSelector.selectList(document));
} }
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
...@@ -102,15 +108,17 @@ public class Html extends PlainText { ...@@ -102,15 +108,17 @@ public class Html extends PlainText {
} }
public String getText() { public String getText() {
if (strings!=null&&strings.size()>0){
return strings.get(0);
}
return document.html(); return document.html();
} }
/** /**
*
* @param selector * @param selector
* @return * @return
*/ */
public String select(Selector selector) { public String selectDocument(Selector selector) {
if (selector instanceof ElementSelector) { if (selector instanceof ElementSelector) {
ElementSelector elementSelector = (ElementSelector) selector; ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.select(getDocument()); return elementSelector.select(getDocument());
...@@ -119,7 +127,7 @@ public class Html extends PlainText { ...@@ -119,7 +127,7 @@ public class Html extends PlainText {
} }
} }
public List<String> selectList(Selector selector) { public List<String> selectDocumentForList(Selector selector) {
if (selector instanceof ElementSelector) { if (selector instanceof ElementSelector) {
ElementSelector elementSelector = (ElementSelector) selector; ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.selectList(getDocument()); return elementSelector.selectList(getDocument());
......
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.BooleanUtils;
import java.util.Properties;
/**
* @author code4crafter@gmail.com
* @since 0.2.2
*/
public abstract class EnvironmentUtil {
private static final String USE_XSOUP = "xsoup";
public static boolean useXsoup() {
Properties properties = System.getProperties();
Object o = properties.get(USE_XSOUP);
if (o == null) {
return true;
}
return BooleanUtils.toBoolean(((String) o).toLowerCase());
}
public static void setUseXsoup(boolean useXsoup) {
Properties properties = System.getProperties();
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
}
}
package us.codecraft.webmagic.utils;
import org.junit.Test;
import static junit.framework.Assert.*;
/**
 * Checks the default value of the xsoup switch and that it can be turned off.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /** System property that {@code EnvironmentUtil} uses for the xsoup switch. */
    private static final String XSOUP_PROPERTY = "xsoup";

    @Test
    public void test() {
        // The switch lives in a JVM-wide system property: start from a known
        // (unset) state and put the previous value back afterwards so other
        // tests running in the same JVM are not affected.
        String previous = System.getProperty(XSOUP_PROPERTY);
        System.clearProperty(XSOUP_PROPERTY);
        try {
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty(XSOUP_PROPERTY);
            } else {
                System.setProperty(XSOUP_PROPERTY, previous);
            }
        }
    }
}
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.selector.*;
...@@ -185,13 +184,13 @@ class PageModelExtractor { ...@@ -185,13 +184,13 @@ class PageModelExtractor {
return null; return null;
} }
if (objectExtractor == null) { if (objectExtractor == null) {
return processSingle(page, page.getHtml().toString()); return processSingle(page, null, false);
} else { } else {
if (objectExtractor.multi) { if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>(); List<Object> os = new ArrayList<Object>();
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString()); List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) { for (String s : list) {
Object o = processSingle(page, s); Object o = processSingle(page, s, false);
if (o != null) { if (o != null) {
os.add(o); os.add(o);
} }
...@@ -199,19 +198,13 @@ class PageModelExtractor { ...@@ -199,19 +198,13 @@ class PageModelExtractor {
return os; return os;
} else { } else {
String select = objectExtractor.getSelector().select(page.getHtml().toString()); String select = objectExtractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select); Object o = processSingle(page, select, false);
return o; return o;
} }
} }
} }
private List<String> select(Selector selector,Element element,String html){ private Object processSingle(Page page, String html, boolean isRaw) {
if (selector instanceof ElementSelector){
}
}
private Object processSingle(Page page, String html) {
Object o = null; Object o = null;
try { try {
o = clazz.newInstance(); o = clazz.newInstance();
...@@ -220,10 +213,14 @@ class PageModelExtractor { ...@@ -220,10 +213,14 @@ class PageModelExtractor {
List<String> value; List<String> value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case RawHtml: case RawHtml:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break; break;
case Html: case Html:
if (isRaw) {
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().selectList(html); value = fieldExtractor.getSelector().selectList(html);
}
break; break;
case Url: case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
...@@ -239,10 +236,14 @@ class PageModelExtractor { ...@@ -239,10 +236,14 @@ class PageModelExtractor {
String value; String value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case RawHtml: case RawHtml:
value = fieldExtractor.getSelector().select(page.getHtml().toString()); value = page.getHtml().selectDocument(fieldExtractor.getSelector());
break; break;
case Html: case Html:
if (isRaw) {
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().select(html); value = fieldExtractor.getSelector().select(html);
}
break; break;
case Url: case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString()); value = fieldExtractor.getSelector().select(page.getUrl().toString());
......
...@@ -8,6 +8,7 @@ import java.util.List; ...@@ -8,6 +8,7 @@ import java.util.List;
/** /**
* Tools for annotation converting. <br> * Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.1 * @since 0.2.1
*/ */
...@@ -24,17 +25,27 @@ public class ExtractorUtils { ...@@ -24,17 +25,27 @@ public class ExtractorUtils {
selector = new RegexSelector(value); selector = new RegexSelector(value);
break; break;
case XPath: case XPath:
selector = new XsoupSelector(value); selector = getXpathSelector(value);
break; break;
default: default:
selector = getXpathSelector(value);
}
return selector;
}
private static Selector getXpathSelector(String value) {
Selector selector;
if (EnvironmentUtil.useXsoup()) {
selector = new XsoupSelector(value); selector = new XsoupSelector(value);
} else {
selector = new XpathSelector(value);
} }
return selector; return selector;
} }
public static List<Selector> getSelectors(ExtractBy[] extractBies) { public static List<Selector> getSelectors(ExtractBy[] extractBies) {
List<Selector> selectors = new ArrayList<Selector>(); List<Selector> selectors = new ArrayList<Selector>();
if (extractBies==null){ if (extractBies == null) {
return selectors; return selectors;
} }
for (ExtractBy extractBy : extractBies) { for (ExtractBy extractBy : extractBies) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment