Commit f9825c21 authored by yihua.huang

refactor selectable for html fragment #113

parent 03d26c16
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Skeleton {@link Selectable} that keeps its results as a list of strings and
 * implements the string-based operations (regex, replace, get/all, match) on
 * top of that list. Results of {@code select}/{@code selectList} are wrapped
 * in a {@link PlainText}.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.2
 */
public abstract class AbstractSelectable implements Selectable {

    /** Results held by this selectable; stored as passed in and may be {@code null}. */
    protected List<String> strings;

    /**
     * Creates a selectable holding exactly one result.
     *
     * @param text the single result; it is added to a new list as-is
     */
    public AbstractSelectable(String text) {
        List<String> results = new ArrayList<String>();
        results.add(text);
        this.strings = results;
    }

    /**
     * Creates a selectable backed by the given list.
     *
     * @param strings the results; the reference is stored without copying
     */
    public AbstractSelectable(List<String> strings) {
        this.strings = strings;
    }

    /**
     * Alias that delegates to {@code $(selector)}.
     *
     * @param selector css selector expression
     * @return whatever {@code $(selector)} returns
     */
    @Override
    public Selectable css(String selector) {
        return $(selector);
    }

    /**
     * Alias that delegates to {@code $(selector, attrName)}.
     *
     * @param selector css selector expression
     * @param attrName attribute name passed through unchanged
     * @return whatever {@code $(selector, attrName)} returns
     */
    @Override
    public Selectable css(String selector, String attrName) {
        return $(selector, attrName);
    }

    /**
     * Applies {@code selector.select} to every input string and collects the
     * non-null results.
     *
     * @param selector selector to apply
     * @param strings  inputs to run the selector against
     * @return a {@link PlainText} holding the collected results
     */
    protected Selectable select(Selector selector, List<String> strings) {
        List<String> results = new ArrayList<String>();
        for (String string : strings) {
            String result = selector.select(string);
            if (result != null) {
                results.add(result);
            }
        }
        return new PlainText(results);
    }

    /**
     * Applies {@code selector.selectList} to every input string and
     * concatenates the returned lists in input order.
     *
     * @param selector selector to apply
     * @param strings  inputs to run the selector against
     * @return a {@link PlainText} holding the concatenated results
     */
    protected Selectable selectList(Selector selector, List<String> strings) {
        List<String> results = new ArrayList<String>();
        for (String string : strings) {
            List<String> result = selector.selectList(string);
            // Same tolerance as select(): a selector that signals "nothing found"
            // with null must not abort the whole batch with a NullPointerException.
            if (result != null) {
                results.addAll(result);
            }
        }
        return new PlainText(results);
    }

    /**
     * @return the backing list itself (not a copy)
     */
    @Override
    public List<String> all() {
        return strings;
    }

    /**
     * Not supported by this base class.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Selectable jsonPath(String jsonPath) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the first element of {@link #all()}, or {@code null} when
     *         {@link #all()} is null or empty
     */
    @Override
    public String get() {
        if (CollectionUtils.isNotEmpty(all())) {
            return all().get(0);
        } else {
            return null;
        }
    }

    /**
     * Runs {@link #select(Selector, List)} against this selectable's own results.
     */
    @Override
    public Selectable select(Selector selector) {
        return select(selector, strings);
    }

    /**
     * Runs {@link #selectList(Selector, List)} against this selectable's own results.
     */
    @Override
    public Selectable selectList(Selector selector) {
        return selectList(selector, strings);
    }

    /**
     * Applies a regex selector built by {@code Selectors.regex(regex)} to all results.
     *
     * @param regex regular expression
     * @return the matches, wrapped as described in {@link #selectList(Selector, List)}
     */
    @Override
    public Selectable regex(String regex) {
        RegexSelector regexSelector = Selectors.regex(regex);
        return selectList(regexSelector, strings);
    }

    /**
     * Applies a regex selector built by {@code Selectors.regex(regex, group)} to all results.
     *
     * @param regex regular expression
     * @param group group argument handed to {@code Selectors.regex}
     * @return the matches, wrapped as described in {@link #selectList(Selector, List)}
     */
    @Override
    public Selectable regex(String regex, int group) {
        RegexSelector regexSelector = Selectors.regex(regex, group);
        return selectList(regexSelector, strings);
    }

    /**
     * Applies a {@link ReplaceSelector} built from the arguments to every result.
     *
     * @param regex       pattern handed to the {@link ReplaceSelector}
     * @param replacement replacement handed to the {@link ReplaceSelector}
     * @return the non-null results, wrapped as described in {@link #select(Selector, List)}
     */
    @Override
    public Selectable replace(String regex, String replacement) {
        ReplaceSelector replaceSelector = new ReplaceSelector(regex, replacement);
        return select(replaceSelector, strings);
    }

    /**
     * @return the same value as {@link #get()}, which is {@code null} when there are no results
     */
    @Override
    public String toString() {
        return get();
    }

    /**
     * @return {@code true} when the backing list is non-null and has at least one element
     */
    @Override
    public boolean match() {
        return strings != null && !strings.isEmpty();
    }
}
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -28,4 +30,23 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { ...@@ -28,4 +30,23 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
} }
} }
/**
 * Parses the text with Jsoup and delegates to {@link #selectElement(Element)}.
 *
 * @param text markup to parse; may be {@code null}
 * @return the element selected from the parsed text, or {@code null} when {@code text} is null
 */
public Element selectElement(String text) {
    return text == null ? null : selectElement(Jsoup.parse(text));
}
/**
 * Parses the text with Jsoup and delegates to {@link #selectElements(Element)}.
 *
 * @param text markup to parse; may be {@code null}
 * @return the elements selected from the parsed text, or a new empty
 *         {@link Elements} when {@code text} is null
 */
public Elements selectElements(String text) {
    if (text == null) {
        return new Elements();
    }
    return selectElements(Jsoup.parse(text));
}
/**
 * Selects a single element starting from the given element.
 *
 * @param element the element to select from
 * @return the selected element; NOTE(review): the CssSelector and XpathSelector
 *         implementations in this change return {@code null} when nothing
 *         matches - confirm this is the intended contract for all subclasses
 */
public abstract Element selectElement(Element element);
/**
 * Selects all matching elements starting from the given element.
 *
 * @param element the element to select from
 * @return the selected elements
 */
public abstract Elements selectElements(Element element);
} }
...@@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector { ...@@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
@Override @Override
public String select(Element element) { public String select(Element element) {
Elements elements = element.select(selectorText); Elements elements = selectElements(element);
if (CollectionUtils.isEmpty(elements)) { if (CollectionUtils.isEmpty(elements)) {
return null; return null;
} }
...@@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector { ...@@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
@Override @Override
public List<String> selectList(Element doc) { public List<String> selectList(Element doc) {
List<String> strings = new ArrayList<String>(); List<String> strings = new ArrayList<String>();
Elements elements = doc.select(selectorText); Elements elements = selectElements(doc);
if (CollectionUtils.isNotEmpty(elements)) { if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) { for (Element element : elements) {
String value = getValue(element); String value = getValue(element);
...@@ -78,4 +78,18 @@ public class CssSelector extends BaseElementSelector { ...@@ -78,4 +78,18 @@ public class CssSelector extends BaseElementSelector {
} }
return strings; return strings;
} }
/**
 * Runs the css selector text against the element and returns the first hit.
 *
 * @param element the element to select from
 * @return the first matching element, or {@code null} when there is no match
 */
@Override
public Element selectElement(Element element) {
    Elements matched = element.select(selectorText);
    if (CollectionUtils.isEmpty(matched)) {
        return null;
    }
    return matched.get(0);
}
/**
 * Runs the css selector text against the element.
 *
 * @param element the element to select from
 * @return the result of {@code element.select(selectorText)}
 */
@Override
public Elements selectElements(Element element) {
    Elements matched = element.select(selectorText);
    return matched;
}
} }
...@@ -142,6 +142,13 @@ public class Html extends PlainText { ...@@ -142,6 +142,13 @@ public class Html extends PlainText {
return document.html(); return document.html();
} }
/**
 * Returns this object as its only node.
 *
 * @return a new mutable list whose single element is {@code this}
 */
@Override
public List<Selectable> nodes() {
    List<Selectable> self = new ArrayList<Selectable>();
    self.add(this);
    return self;
}
/** /**
* @param selector * @param selector
* @return * @return
......
package us.codecraft.webmagic.selector;
/**
 * Empty class: it currently declares no fields, constructors or methods.
 * NOTE(review): presumably a placeholder for the html-fragment support named
 * in the commit subject - confirm before relying on it.
 *
 * @author code4crafer@gmail.com
 */
public class HtmlFragment {
}
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -12,18 +10,14 @@ import java.util.List; ...@@ -12,18 +10,14 @@ import java.util.List;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0 * @since 0.1.0
*/ */
public class PlainText implements Selectable { public class PlainText extends AbstractSelectable {
protected List<String> strings;
public PlainText(List<String> strings) { public PlainText(List<String> strings) {
this.strings = strings; super(strings);
} }
public PlainText(String text) { public PlainText(String text) {
List<String> results = new ArrayList<String>(); super(text);
results.add(text);
this.strings = results;
} }
public static PlainText create(String text) { public static PlainText create(String text) {
...@@ -45,16 +39,6 @@ public class PlainText implements Selectable { ...@@ -45,16 +39,6 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public Selectable css(String selector) {
return $(selector);
}
@Override
public Selectable css(String selector, String attrName) {
return $(selector, attrName);
}
@Override @Override
public Selectable smartContent() { public Selectable smartContent() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
...@@ -66,79 +50,12 @@ public class PlainText implements Selectable { ...@@ -66,79 +50,12 @@ public class PlainText implements Selectable {
} }
@Override @Override
public Selectable regex(String regex) { public List<Selectable> nodes() {
RegexSelector regexSelector = Selectors.regex(regex); List<Selectable> nodes = new ArrayList<Selectable>(strings.size());
return selectList(regexSelector, strings);
}
@Override
public Selectable regex(String regex, int group) {
RegexSelector regexSelector = Selectors.regex(regex, group);
return selectList(regexSelector, strings);
}
protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>();
for (String string : strings) {
String result = selector.select(string);
if (result != null) {
results.add(result);
}
}
return new PlainText(results);
}
protected Selectable selectList(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>();
for (String string : strings) { for (String string : strings) {
List<String> result = selector.selectList(string); nodes.add(PlainText.create(string));
results.addAll(result);
}
return new PlainText(results);
}
@Override
public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
return select(replaceSelector, strings);
}
@Override
public List<String> all() {
return strings;
}
@Override
public Selectable jsonPath(String jsonPath) {
throw new UnsupportedOperationException();
}
@Override
public String get() {
if (CollectionUtils.isNotEmpty(all())) {
return all().get(0);
} else {
return null;
} }
return nodes;
} }
@Override
public Selectable select(Selector selector) {
return select(selector, strings);
}
@Override
public Selectable selectList(Selector selector) {
return selectList(selector, strings);
}
@Override
public String toString() {
return get();
}
@Override
public boolean match() {
return strings != null && strings.size() > 0;
}
} }
...@@ -143,4 +143,10 @@ public interface Selectable { ...@@ -143,4 +143,10 @@ public interface Selectable {
* @return * @return
*/ */
public Selectable selectList(Selector selector); public Selectable selectList(Selector selector);
/**
 * Get all nodes of this selectable, each wrapped as its own {@link Selectable}.
 *
 * @return the list of nodes
 */
public List<Selectable> nodes();
} }
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup; import us.codecraft.xsoup.Xsoup;
...@@ -29,4 +31,18 @@ public class XpathSelector extends BaseElementSelector { ...@@ -29,4 +31,18 @@ public class XpathSelector extends BaseElementSelector {
public List<String> selectList(Element element) { public List<String> selectList(Element element) {
return xPathEvaluator.evaluate(element).list(); return xPathEvaluator.evaluate(element).list();
} }
/**
 * Returns the first element produced by {@link #selectElements(Element)}.
 *
 * @param element the element to evaluate the xpath against
 * @return the first selected element, or {@code null} when none was selected
 */
@Override
public Element selectElement(Element element) {
    Elements matched = selectElements(element);
    return CollectionUtils.isEmpty(matched) ? null : matched.get(0);
}
/**
 * Evaluates the xpath evaluator against the element and returns the
 * elements of the evaluation result.
 *
 * @param element the element to evaluate the xpath against
 * @return the value of {@code getElements()} on the evaluation result
 */
@Override
public Elements selectElements(Element element) {
return xPathEvaluator.evaluate(element).getElements();
}
} }
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.samples.pipeline.OneFilePipeline;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
/**
* @author code4crafer@gmail.com
*/
public class MamacnPageProcessor implements PageProcessor {

    private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);

    /**
     * Extracts the "img", "title" and "url" fields from the page, marks the
     * page as skipped when no title was found, and queues every link that
     * matches the photo-page pattern.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        Selectable thumbItems = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li");

        // "img" is read relative to the thumbnail items, "title" from the whole page.
        String imageSrcPath = "//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src";
        page.putField("img", thumbItems.xpath(imageSrcPath).get());

        String imageAltPath = "//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt";
        page.putField("title", page.getHtml().xpath(imageAltPath).get());

        page.putField("url", page.getUrl().toString());

        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            page.setSkip(true);
        }

        String photoPagePattern = "http://www\\.mama\\.cn/photo/.*\\.html";
        page.addTargetRequests(page.getHtml().links().regex(photoPagePattern).all());
    }

    /**
     * @return the site configuration created when this processor was constructed
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a five-thread spider on the photo index page, using a file-cache
     * scheduler and writing results through a {@link OneFilePipeline}.
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        String schedulerDir = "/data/webmagic/mamacn";
        String startUrl = "http://www.mama.cn/photo/t1-p1.html";
        String outputFile = "/data/webmagic/mamacn/data";

        Spider.create(new MamacnPageProcessor())
                .setScheduler(new FileCacheQueueScheduler(schedulerDir))
                .addUrl(startUrl)
                .addPipeline(new OneFilePipeline(outputFile))
                .thread(5)
                .run();
    }
}
package us.codecraft.webmagic.samples.pipeline;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.*;
import java.util.Map;
/**
* @author code4crafer@gmail.com
*/
public class OneFilePipeline extends FilePersistentBase implements Pipeline, Closeable {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Writer over the single output file; opened once in the constructor. */
    private final PrintWriter printWriter;

    /**
     * Create a OneFilePipeline with the default path "/data/webmagic/".
     *
     * @throws FileNotFoundException        if the output file cannot be opened for writing
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
        this("/data/webmagic/");
    }

    /**
     * Create a OneFilePipeline that writes every result, UTF-8 encoded, to one file.
     *
     * @param path passed to {@code setPath} and to {@code getFile} to obtain the output file
     * @throws FileNotFoundException        if the output file cannot be opened for writing
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
        setPath(path);
        printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
    }

    /**
     * Writes the request url followed by every result field, then flushes.
     * Fields whose value is an {@link Iterable} are written as a key line
     * followed by one line per element; all other fields are written as
     * {@code key:<TAB>value}. Synchronized so that concurrent callers do not
     * interleave the lines of different results.
     *
     * @param resultItems the extracted results to write
     * @param task        not used by this pipeline
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        printWriter.println("url:\t" + resultItems.getRequest().getUrl());
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            Object value = entry.getValue();
            if (value instanceof Iterable) {
                printWriter.println(entry.getKey() + ":");
                // Wildcard instead of the raw type: elements are only printed.
                for (Object o : (Iterable<?>) value) {
                    printWriter.println(o);
                }
            } else {
                printWriter.println(entry.getKey() + ":\t" + value);
            }
        }
        printWriter.flush();
    }

    /**
     * Closes the underlying writer and with it the output file. Without this
     * the file handle opened in the constructor could never be released.
     */
    @Override
    public synchronized void close() {
        printWriter.close();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment