Commit e06b0c1a authored by yihua.huang's avatar yihua.huang

Merge branch 'xsoup'

parents b9eeb88f aefd0569
......@@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.2.2-SNAPSHOT</version>
<version>0.3.0-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
......
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.2.2-SNAPSHOT</version>
<version>0.3.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......@@ -25,6 +25,12 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
......
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
......@@ -28,7 +29,7 @@ public class Page {
private ResultItems resultItems = new ResultItems();
private Selectable html;
private Html html;
private Selectable url;
......@@ -58,11 +59,11 @@ public class Page {
*
* @return html
*/
public Selectable getHtml() {
public Html getHtml() {
return html;
}
public void setHtml(Selectable html) {
public void setHtml(Html html) {
this.html = html;
}
......
......@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import us.codecraft.webmagic.utils.ThreadUtils;
import java.io.Closeable;
......@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return this;
}
/**
* Switches off the Xsoup-based XPath implementation by calling
* {@code EnvironmentUtil.setUseXsoup(false)}.
* <p>
* EnvironmentUtil stores the flag as a system property, so the setting applies to the
* whole JVM and not only to a single Spider instance.
*/
public static void xsoupOff(){
EnvironmentUtil.setUseXsoup(false);
}
@Override
public String getUUID() {
if (uuid != null) {
......
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import java.util.List;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public abstract class BaseElementSelector implements Selector,ElementSelector {
@Override
public String select(String text) {
return select(Jsoup.parse(text));
}
@Override
public List<String> selectList(String text) {
return selectList(Jsoup.parse(text));
}
}
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
......@@ -15,7 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
public class CssSelector implements Selector {
public class CssSelector extends BaseElementSelector {
private String selectorText;
......@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
this.attrName = attrName;
}
@Override
public String select(String text) {
Document doc = Jsoup.parse(text);
Elements elements = doc.select(selectorText);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
return getValue(elements.get(0));
}
private String getValue(Element element) {
if (attrName == null) {
return element.outerHtml();
......@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
}
@Override
public List<String> selectList(String text) {
public String select(Element element) {
Elements elements = element.select(selectorText);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
return getValue(elements.get(0));
}
@Override
public List<String> selectList(Element doc) {
List<String> strings = new ArrayList<String>();
Document doc = Jsoup.parse(text);
Elements elements = doc.select(selectorText);
if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) {
......
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
* Selector (extractor) that works on an already parsed jsoup {@link Element}
* instead of on raw html text.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.3.0
*/
public interface ElementSelector {
/**
* Extracts a single result from the given element.<br>
* If there is more than one result, only the first one is returned.
*
* @param element the parsed element (or document) to search in
* @return the first result. NOTE(review): CssSelector returns {@code null} when nothing
*         matches; other implementations presumably do the same - confirm.
*/
public String select(Element element);
/**
* Extracts all results from the given element.<br>
*
* @param element the parsed element (or document) to search in
* @return all results found in the element
*/
public List<String> selectList(Element element);
}
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import java.util.ArrayList;
import java.util.List;
......@@ -11,12 +15,23 @@ import java.util.List;
*/
public class Html extends PlainText {
/**
* Store parsed document for better performance when only one text exist.
*/
private Document document;
public Html(List<String> strings) {
super(strings);
}
public Html(String text) {
super(text);
this.document = Jsoup.parse(text);
}
public Html(Document document) {
super(document.html());
this.document = document;
}
public static Html create(String text) {
......@@ -53,38 +68,71 @@ public class Html extends PlainText {
@Override
public Selectable links() {
XpathSelector xpathSelector = Selectors.xpath("//a/@href");
return selectList(xpathSelector, strings);
return xpath("//a/@href");
}
@Override
public Selectable xpath(String xpath) {
XpathSelector xpathSelector = Selectors.xpath(xpath);
if (EnvironmentUtil.useXsoup()) {
XsoupSelector xsoupSelector = new XsoupSelector(xpath);
if (document != null) {
return new Html(xsoupSelector.selectList(document));
}
return selectList(xsoupSelector, strings);
} else {
XpathSelector xpathSelector = new XpathSelector(xpath);
return selectList(xpathSelector, strings);
}
}
/**
 * Selects with a css expression.
 * Uses the stored parsed document when there is one, otherwise falls back to
 * selecting on the plain strings.
 *
 * @param selector css selector expression
 * @return the selected results
 */
@Override
public Selectable $(String selector) {
    final CssSelector css = Selectors.$(selector);
    if (document == null) {
        return selectList(css, strings);
    }
    List<String> matched = css.selectList(document);
    return new Html(matched);
}
/**
 * Selects with a css expression and an attribute name.
 * Uses the stored parsed document when there is one, otherwise falls back to
 * selecting on the plain strings.
 *
 * @param selector css selector expression
 * @param attrName attribute name handed to the css selector
 * @return the selected results
 */
@Override
public Selectable $(String selector, String attrName) {
    final CssSelector css = Selectors.$(selector, attrName);
    if (document == null) {
        return selectList(css, strings);
    }
    List<String> matched = css.selectList(document);
    return new Html(matched);
}
@Override
public Selectable text() {
TextContentSelector selector = Selectors.text();
return select(selector, strings);
public Document getDocument() {
return document;
}
@Override
public Selectable text(String newlineSeparator) {
TextContentSelector selector = Selectors.text(newlineSeparator);
return select(selector, strings);
/**
 * Returns the html text this object represents.
 *
 * @return the first stored string when there is one, otherwise the serialized parsed
 *         document, or {@code null} when neither is available
 */
public String getText() {
    if (strings != null && !strings.isEmpty()) {
        return strings.get(0);
    }
    // An Html created from an (empty) List<String> never gets a parsed document,
    // so guard against dereferencing null here.
    return document == null ? null : document.html();
}
/**
 * Extracts a single result from this html with the given selector.<br>
 * An {@link ElementSelector} is applied to the stored parsed document so the html does not
 * have to be parsed again; any other selector is applied to the text.
 *
 * @param selector selector to apply
 * @return the result produced by the selector
 */
public String selectDocument(Selector selector) {
    Document parsed = getDocument();
    // Only the String/Document constructors store a document. An Html built from a
    // List<String> has none, so fall back to the text-based path instead of handing
    // null to the element selector.
    if (parsed != null && selector instanceof ElementSelector) {
        return ((ElementSelector) selector).select(parsed);
    }
    return selector.select(getText());
}
/**
 * Extracts all results from this html with the given selector.<br>
 * An {@link ElementSelector} is applied to the stored parsed document so the html does not
 * have to be parsed again; any other selector is applied to the text.
 *
 * @param selector selector to apply
 * @return the results produced by the selector
 */
public List<String> selectDocumentForList(Selector selector) {
    Document parsed = getDocument();
    // Only the String/Document constructors store a document. An Html built from a
    // List<String> has none, so fall back to the text-based path instead of handing
    // null to the element selector.
    if (parsed != null && selector instanceof ElementSelector) {
        return ((ElementSelector) selector).selectList(parsed);
    }
    return selector.selectList(getText());
}
}
......@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
@Override
public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement);
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
return select(replaceSelector, strings);
}
......@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
}
}
@Override
public Selectable text() {
//do nothing
return this;
}
@Override
public Selectable text(String newlineSeparator) {
//do nothing
return this;
}
@Override
public boolean match() {
return strings != null && strings.size() > 0;
......
......@@ -82,20 +82,6 @@ public interface Selectable {
*/
public String toString();
/**
* select text content of html
*
* @return text
*/
public Selectable text();
/**
* select text content of html
*
* @return text
*/
public Selectable text(String newlineSeparator);
/**
* if result exist for select
*
......
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.lang.reflect.Constructor;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Selector factory with some inner cache.<br>
 * Selectors are created reflectively through a public constructor that takes only
 * {@code String} arguments. {@link #newAndCacheSelector(Class, String...)} additionally
 * keeps the created instance and returns it again for the same class and arguments.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SelectorFactory {

    /** Created selectors, keyed by selector class plus constructor arguments. */
    private final Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();

    private static final SelectorFactory INSTANCE = new SelectorFactory();

    /**
     * Returns the shared factory. The misspelled method name is kept because callers use it.
     *
     * @return the singleton factory
     */
    public static SelectorFactory getInstatnce() {
        return INSTANCE;
    }

    /**
     * @param regex regular expression
     * @return a new, uncached regex selector
     */
    public RegexSelector newRegexSelector(String regex) {
        return newSelector(RegexSelector.class, regex);
    }

    /**
     * Returns a cached selector for the regex and group when there is one, otherwise a new
     * (uncached) instance.
     *
     * @param regex regular expression
     * @param group capture group to extract
     * @return regex selector
     */
    public RegexSelector newRegexSelector(String regex, int group) {
        String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
        // Look up once: two separate get() calls could observe different states.
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return (RegexSelector) cached;
        }
        return new RegexSelector(regex, group);
    }

    /**
     * @param regex       regular expression
     * @param replacement replacement text
     * @return a new, uncached replace selector
     */
    public ReplaceSelector newReplaceSelector(String regex, String replacement) {
        return newSelector(ReplaceSelector.class, regex, replacement);
    }

    /**
     * @param xpath xpath expression
     * @return a new, uncached xpath selector
     */
    public XpathSelector newXpathSelector(String xpath) {
        return newSelector(XpathSelector.class, xpath);
    }

    /**
     * @return a new, uncached smart content selector
     */
    public SmartContentSelector newSmartContentSelector() {
        return newSelector(SmartContentSelector.class);
    }

    /**
     * Returns the cached selector for the class and arguments, creating and caching it on
     * first use.
     *
     * @param clazz selector class
     * @param param constructor arguments
     * @return cached or newly created selector
     * @throws IllegalArgumentException if the selector can not be created
     */
    public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
        // The key must contain the requested class. Keying every entry by
        // RegexSelector.class made different selector types with equal arguments share an
        // entry, so the wrong type could be returned.
        String cacheKey = getCacheKey(clazz, param);
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return clazz.cast(cached);
        }
        T selector = newSelector(clazz, param);
        if (selector != null) {
            innerCache.put(cacheKey, selector);
        }
        return selector;
    }

    /**
     * Creates a selector through the public constructor of {@code clazz} that takes
     * {@code param.length} String arguments.
     *
     * @param clazz selector class
     * @param param constructor arguments
     * @return new selector instance
     * @throws IllegalArgumentException if no such constructor exists or it fails
     */
    public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
        try {
            // Build the (String, String, ...) signature matching the number of arguments
            // instead of hard-coding the 0/1/2 argument cases.
            Class<?>[] parameterTypes = new Class<?>[param.length];
            for (int i = 0; i < parameterTypes.length; i++) {
                parameterTypes[i] = String.class;
            }
            Constructor<T> constructor = clazz.getConstructor(parameterTypes);
            return constructor.newInstance((Object[]) param);
        } catch (Exception e) {
            throw new IllegalArgumentException("init object error", e);
        }
    }

    /** Builds the cache key from the class and the constructor arguments. */
    private String getCacheKey(Class<?> clazz, String... param) {
        return clazz.toString() + "_" + StringUtils.join(param, "_");
    }
}
......@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
public abstract class Selectors {
public static RegexSelector regex(String expr) {
return SelectorFactory.getInstatnce().newRegexSelector(expr);
return new RegexSelector(expr);
}
public static RegexSelector regex(String expr, int group) {
return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
return new RegexSelector(expr,group);
}
public static SmartContentSelector smartContent() {
return SelectorFactory.getInstatnce().newSmartContentSelector();
return new SmartContentSelector();
}
public static CssSelector $(String expr) {
......@@ -29,7 +29,11 @@ public abstract class Selectors {
}
public static XpathSelector xpath(String expr) {
return SelectorFactory.getInstatnce().newXpathSelector(expr);
return new XpathSelector(expr);
}
public static XsoupSelector xsoup(String expr) {
return new XsoupSelector(expr);
}
public static AndSelector and(Selector... selectors) {
......@@ -40,14 +44,6 @@ public abstract class Selectors {
return new OrSelector(selectors);
}
public static TextContentSelector text() {
return new TextContentSelector();
}
public static TextContentSelector text(String newlineSeperator) {
return new TextContentSelector(newlineSeperator);
}
public static void main(String[] args) {
String s = "a";
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
......
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Extract text content in html.<br>
* Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.2
*/
public class TextContentSelector implements Selector {
private String newLineSeperator = "\n";
public TextContentSelector() {
}
public TextContentSelector(String newLineSeperator) {
this.newLineSeperator = newLineSeperator;
}
private final static Set<String> TAGS_IN_NEWLINE = new HashSet<String>();
private final static Set<String> TAGS_TO_IGNORE = new HashSet<String>();
static {
TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"}));
TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"}));
}
@Override
public String select(String text) {
Document doc = Jsoup.parse(text);
return select0(doc);
}
protected String select0(Element element) {
String tagName = element.tagName().toLowerCase();
if (TAGS_TO_IGNORE.contains(tagName)) {
return "";
}
StringBuilder textBuilder = new StringBuilder();
textBuilder.append(element.text());
if (element.children() != null) {
for (Element child : element.children()) {
textBuilder.append(select0(child));
}
}
if (TAGS_IN_NEWLINE.contains(tagName)) {
textBuilder.append(newLineSeperator);
}
return textBuilder.toString();
}
@Override
public List<String> selectList(String text) {
throw new UnsupportedOperationException();
}
}
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
import java.util.List;
/**
 * XPath selector based on Xsoup.<br>
 * The xpath expression is compiled once in the constructor and the compiled evaluator is
 * reused for every selection.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public class XsoupSelector extends BaseElementSelector {

    /** Compiled form of the xpath expression. */
    private final XPathEvaluator compiledXpath;

    /**
     * @param xpathStr xpath expression to compile
     */
    public XsoupSelector(String xpathStr) {
        compiledXpath = Xsoup.compile(xpathStr);
    }

    /**
     * Evaluates the xpath on the element and returns the single result.
     *
     * @param element element to evaluate the xpath on
     * @return result of {@code get()} on the evaluation
     */
    @Override
    public String select(Element element) {
        return compiledXpath
                .evaluate(element)
                .get();
    }

    /**
     * Evaluates the xpath on the element and returns all results.
     *
     * @param element element to evaluate the xpath on
     * @return result of {@code list()} on the evaluation
     */
    @Override
    public List<String> selectList(Element element) {
        return compiledXpath
                .evaluate(element)
                .list();
    }
}
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.BooleanUtils;
import java.util.Properties;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public abstract class EnvironmentUtil {
private static final String USE_XSOUP = "xsoup";
public static boolean useXsoup() {
Properties properties = System.getProperties();
Object o = properties.get(USE_XSOUP);
if (o == null) {
return true;
}
return BooleanUtils.toBoolean(((String) o).toLowerCase());
}
public static void setUseXsoup(boolean useXsoup) {
Properties properties = System.getProperties();
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
}
}
package us.codecraft.webmagic.selector;
import junit.framework.Assert;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
/**
 * Tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Extracts the text of a small html fragment and checks that the visible text and the
     * configured separator end up in the result.
     */
    @Test
    public void test() {
        String html = "<div class=\"edit-comment-hide\">\n" +
                " <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n" +
                " <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n" +
                " </div>\n" +
                " </div>";
        TextContentSelector textContentSelector = new TextContentSelector("<br>");
        String text = textContentSelector.select(html);
        Assert.assertNotNull(text);
        // A non-null check alone also passes for an empty result, so pin the content too.
        Assert.assertTrue(text.contains("Add more powerful selector for content text extract"));
        Assert.assertTrue(text.contains("http://www.elias.cn/En/ExtMainText"));
        // div and p are block-level tags, so the custom separator has to show up.
        Assert.assertTrue(text.contains("<br>"));
    }

    /**
     * Downloads a real page and extracts its text; ignored because it needs the network.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
                .smartContent().text().toString();
        Assert.assertNotNull(s);
    }
}
package us.codecraft.webmagic.utils;
import org.junit.Test;
import static junit.framework.Assert.*;
/**
 * Tests for {@link EnvironmentUtil}.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /** Name of the system property EnvironmentUtil reads and writes. */
    private static final String USE_XSOUP = "xsoup";

    /**
     * Checks the default value and that the switch can be turned off and on again.
     * The system property is JVM-wide, so it is cleared before the test (to make the result
     * independent of earlier tests) and restored afterwards (so later tests are not affected).
     */
    @Test
    public void test() {
        String previous = System.getProperty(USE_XSOUP);
        try {
            System.clearProperty(USE_XSOUP);
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(true);
            assertTrue(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty(USE_XSOUP);
            } else {
                System.setProperty(USE_XSOUP, previous);
            }
        }
    }
}
......@@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.2.2-SNAPSHOT</version>
<version>0.3.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -34,7 +34,7 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors;
private Extractor extractor;
private Extractor objectExtractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
......@@ -169,7 +169,7 @@ class PageModelExtractor {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
}
......@@ -183,28 +183,28 @@ class PageModelExtractor {
if (!matched) {
return null;
}
if (extractor == null) {
return processSingle(page, page.getHtml().toString());
if (objectExtractor == null) {
return processSingle(page, null, false);
} else {
if (extractor.multi) {
if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>();
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) {
Object o = processSingle(page, s);
Object o = processSingle(page, s, false);
if (o != null) {
os.add(o);
}
}
return os;
} else {
String select = extractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select);
String select = objectExtractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select, false);
return o;
}
}
}
private Object processSingle(Page page, String html) {
private Object processSingle(Page page, String html, boolean isRaw) {
Object o = null;
try {
o = clazz.newInstance();
......@@ -213,10 +213,14 @@ class PageModelExtractor {
List<String> value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().selectList(html);
}
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
......@@ -232,10 +236,14 @@ class PageModelExtractor {
String value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().select(html);
}
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
......
......@@ -18,7 +18,7 @@ import java.io.PrintWriter;
* Otherwise use SHA1 as file name.
*
* @author code4crafter@gmail.com <br>
* @since 0.2.2
* @since 0.3.0
*/
public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
......
package us.codecraft.webmagic.utils;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.selector.CssSelector;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import us.codecraft.webmagic.selector.*;
import java.util.ArrayList;
import java.util.List;
/**
* Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
......@@ -27,9 +25,19 @@ public class ExtractorUtils {
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
selector = getXpathSelector(value);
break;
default:
selector = getXpathSelector(value);
}
return selector;
}
private static Selector getXpathSelector(String value) {
Selector selector;
if (EnvironmentUtil.useXsoup()) {
selector = new XsoupSelector(value);
} else {
selector = new XpathSelector(value);
}
return selector;
......@@ -37,7 +45,7 @@ public class ExtractorUtils {
public static List<Selector> getSelectors(ExtractBy[] extractBies) {
List<Selector> selectors = new ArrayList<Selector>();
if (extractBies==null){
if (extractBies == null) {
return selectors;
}
for (ExtractBy extractBy : extractBies) {
......
......@@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.2.1</version>
<version>0.3.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
......
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.PlainText;
......@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
}
......@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
}
return site;
}
public static void main(String[] args) {
Spider.create(new DiaoyuwengProcessor()).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
page.addTargetRequests(strings);
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
page.putField("body",page.getHtml().xpath("//dd"));
}
@Override
public Site getSite() {
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
}
public static void main(String[] args) {
Spider.create(new F58PageProcesser()).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -14,10 +15,9 @@ import java.util.List;
public class HuxiuProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
List<String> requests = page.getHtml().links().regex(".*article.*").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
page.putField("content",page.getHtml().smartContent());
}
......@@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new HuxiuProcessor()).run();
}
}
......@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
......@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.scheduler(new RedisScheduler("localhost"))
.pipeline(new FilePipeline("/data/temp/webmagic/"))
.thread(5)
.run();
}
......
......@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
......@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run();
Spider.create(new IteyeBlogProcessor()).thread(5).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
......@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new KaichibaProcessor()).run();
}
}
......@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
}
page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
}
@Override
......@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new MeicanProcessor()).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
......@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
......@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
Spider.create(new OschinaBlogPageProcesser()).run();
}
}
......@@ -17,6 +17,11 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
......
package us.codecraft.webmagic.selector;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
/**
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
......@@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
Html html1 = new Html(html);
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
Selectors.xpath("/abc/").select("");
}
@Test
......@@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
xpath2Selector.selectList(html);
}
System.out.println(System.currentTimeMillis()-time);
XpathSelector xpathSelector = new XpathSelector("//a");
time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpathSelector.selectList(html);
}
System.out.println(System.currentTimeMillis()-time);
time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpath2Selector.selectList(html);
}
System.out.println(System.currentTimeMillis() - time);
CssSelector cssSelector = new CssSelector("a");
time =System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
cssSelector.selectList(html);
}
System.out.println("css "+(System.currentTimeMillis()-time));
}
/**
* Manual benchmark that prints elapsed milliseconds for parsing the test html and for
* evaluating "all links" queries with HtmlCleaner, jsoup and Xsoup.
* Each measured step runs 2000 times. Nothing is asserted, the numbers are only printed.
*/
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
System.out.println(html.length());
HtmlCleaner htmlCleaner = new HtmlCleaner();
// parse once up front so the query loops below measure evaluation only
TagNode tagNode = htmlCleaner.clean(html);
Document document = Jsoup.parse(html);
// HtmlCleaner: parsing
long time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
htmlCleaner.clean(html);
}
System.out.println(System.currentTimeMillis()-time);
// HtmlCleaner: xpath evaluation on the already parsed tree
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
tagNode.evaluateXPath("//a");
}
System.out.println(System.currentTimeMillis()-time);
System.out.println("=============");
// jsoup: parsing
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
Jsoup.parse(html);
}
System.out.println(System.currentTimeMillis()-time);
// jsoup: css selection on the already parsed document
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
document.select("a");
}
System.out.println(System.currentTimeMillis()-time);
System.out.println("=============");
// HtmlCleaner again, same two measurements as the first section
// NOTE(review): presumably repeated to see the effect of JIT warm-up - confirm
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
htmlCleaner.clean(html);
}
System.out.println(System.currentTimeMillis()-time);
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
tagNode.evaluateXPath("//a");
}
System.out.println(System.currentTimeMillis()-time);
System.out.println("=============");
// Xsoup: evaluation of a pre-compiled xpath on the jsoup document
XPathEvaluator compile = Xsoup.compile("//a");
time =System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
compile.evaluate(document);
}
System.out.println(System.currentTimeMillis()-time);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment