Commit 8b35d795 authored by yihua.huang

Do not cache document in Selectable for selected Html element #73

parent 6201fd69
...@@ -7,3 +7,25 @@ CREATE TABLE `DynamicClass` ( ...@@ -7,3 +7,25 @@ CREATE TABLE `DynamicClass` (
PRIMARY KEY (`Id`), PRIMARY KEY (`Id`),
UNIQUE KEY `un_class_name` (`ClassName`) UNIQUE KEY `un_class_name` (`ClassName`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8; ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Spider: one row per spider definition. The three *Id columns hold
-- identifiers of other records (presumably rows of the PageProcessor,
-- pipeline and scheduler tables -- confirm against the application code).
CREATE TABLE `Spider` (
  -- Surrogate primary key; the only AUTO_INCREMENT column, since MySQL
  -- permits at most one such column per table.
  `Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Reference columns are supplied by the caller, so they must not be
  -- AUTO_INCREMENT.
  `PageProcessorId` int(11) unsigned NOT NULL,
  `PipelineId` int(11) unsigned NOT NULL,
  `SchedulerId` int(11) unsigned NOT NULL,
  `Config` text NOT NULL,
  `AddTime` datetime NOT NULL,
  `UpdateTime` datetime NOT NULL,
  -- The former UNIQUE KEY on `ClassName` was removed: this table has no
  -- `ClassName` column, so the key definition made the statement fail.
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- PageProcessor: one row per class name, with its parameters and
-- creation/update timestamps.
CREATE TABLE `PageProcessor` (
  `Id`         INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
  `ClassName`  VARCHAR(200)     NOT NULL,
  `Params`     TEXT             NOT NULL,
  `AddTime`    DATETIME         NOT NULL,
  `UpdateTime` DATETIME         NOT NULL,
  PRIMARY KEY (`Id`),
  -- Each class name may appear at most once.
  UNIQUE KEY `un_class_name` (`ClassName`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
\ No newline at end of file
package us.codecraft.webmagic.avalon.web;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.servlet.ModelAndView;
/**
 * Controller registered under the bean name {@code "dashboard"} and mapped to {@code "/"}.
 *
 * @author code4crafter@gmail.com
 */
@Controller("dashboard")
@RequestMapping("/")
public class DashBoardController {

    /**
     * Handles requests that reach the controller's own mapping.
     *
     * @return a {@link ModelAndView} whose view name is {@code "dashboard"}
     */
    @RequestMapping
    public ModelAndView index() {
        return new ModelAndView("dashboard");
    }
}
...@@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap; ...@@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
/** /**
* Container of Spiders.
*
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class Worker { public class Worker {
......
package us.codecraft.webmagic.avalon.web; package us.codecraft.webmagic.worker.controller;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller; import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody; import org.springframework.web.bind.annotation.ResponseBody;
import us.codecraft.webmagic.worker.Worker;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -10,15 +13,19 @@ import java.util.Map; ...@@ -10,15 +13,19 @@ import java.util.Map;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
@Controller("spider") @Controller
@RequestMapping("spider") @RequestMapping("spider")
public class SpiderController { public class SpiderController {
@Autowired
private Worker worker;
@RequestMapping("create") @RequestMapping("create")
@ResponseBody @ResponseBody
public Map<String, Object> create() { public Map<String, Object> create(@RequestParam("id") String id) {
HashMap<String, Object> map = new HashMap<String, Object>(); HashMap<String, Object> map = new HashMap<String, Object>();
map.put("code", 200); map.put("code", 200);
return map; return map;
} }
} }
...@@ -23,7 +23,7 @@ public class Html extends PlainText { ...@@ -23,7 +23,7 @@ public class Html extends PlainText {
*/ */
private Document document; private Document document;
private boolean init = false; private boolean needInitCache = true;
public Html(List<String> strings) { public Html(List<String> strings) {
super(strings); super(strings);
...@@ -33,12 +33,22 @@ public class Html extends PlainText { ...@@ -33,12 +33,22 @@ public class Html extends PlainText {
super(text); super(text);
} }
public Html(List<String> strings, boolean needInitCache) {
super(strings);
this.needInitCache = needInitCache;
}
public Html(String text, boolean needInitCache) {
super(text);
this.needInitCache = needInitCache;
}
/** /**
* lazy init * lazy init
*/ */
private void initDocument() { private void initDocument() {
if (this.document == null && !init) { if (this.document == null && needInitCache) {
init = true; needInitCache = false;
//just init once whether the parsing succeeds or not //just init once whether the parsing succeeds or not
try { try {
this.document = Jsoup.parse(getText()); this.document = Jsoup.parse(getText());
...@@ -67,7 +77,7 @@ public class Html extends PlainText { ...@@ -67,7 +77,7 @@ public class Html extends PlainText {
results.add(result); results.add(result);
} }
} }
return new Html(results); return new Html(results, false);
} }
@Override @Override
...@@ -78,7 +88,7 @@ public class Html extends PlainText { ...@@ -78,7 +88,7 @@ public class Html extends PlainText {
List<String> result = selector.selectList(string); List<String> result = selector.selectList(string);
results.addAll(result); results.addAll(result);
} }
return new Html(results); return new Html(results, false);
} }
@Override @Override
...@@ -95,9 +105,9 @@ public class Html extends PlainText { ...@@ -95,9 +105,9 @@ public class Html extends PlainText {
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
XpathSelector xpathSelector = new XpathSelector(xpath); XpathSelector xpathSelector = Selectors.xpath(xpath);
if (document != null) { if (document != null) {
return new Html(xpathSelector.selectList(document)); return new Html(xpathSelector.selectList(document), false);
} }
return selectList(xpathSelector, strings); return selectList(xpathSelector, strings);
} }
...@@ -106,7 +116,7 @@ public class Html extends PlainText { ...@@ -106,7 +116,7 @@ public class Html extends PlainText {
public Selectable $(String selector) { public Selectable $(String selector) {
CssSelector cssSelector = Selectors.$(selector); CssSelector cssSelector = Selectors.$(selector);
if (document != null) { if (document != null) {
return new Html(cssSelector.selectList(document)); return new Html(cssSelector.selectList(document), false);
} }
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
...@@ -115,7 +125,7 @@ public class Html extends PlainText { ...@@ -115,7 +125,7 @@ public class Html extends PlainText {
public Selectable $(String selector, String attrName) { public Selectable $(String selector, String attrName) {
CssSelector cssSelector = Selectors.$(selector, attrName); CssSelector cssSelector = Selectors.$(selector, attrName);
if (document != null) { if (document != null) {
return new Html(cssSelector.selectList(document)); return new Html(cssSelector.selectList(document), false);
} }
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
......
package us.codecraft.webmagic.selector;
import org.junit.Test;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Checks that chained selection on {@link Html} yields a stable number of links.
 *
 * @author code4crafter@gmail.com
 */
public class SelectorTest {

    private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";

    /**
     * Extracts links directly from the page, then twice from the result of
     * {@code xpath("//div")}, and asserts that all three lists have the same size.
     */
    @Test
    public void testChain() throws Exception {
        Html page = new Html(html);

        // Keep this call order: direct extraction first, then the chained selection.
        List<String> directLinks = page.links().all();
        Selectable divs = page.xpath("//div");
        List<String> chainedFirst = divs.links().all();
        List<String> chainedSecond = divs.links().all();

        assertThat(directLinks).hasSameSizeAs(chainedFirst);
        assertThat(chainedFirst).hasSameSizeAs(chainedSecond);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment