Commit 81e7f798 authored by yihua.huang's avatar yihua.huang

introduce jsoup and CssSelector

parent c7330460
......@@ -52,6 +52,12 @@
<version>2.4</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
......
......@@ -154,9 +154,11 @@ public class Spider implements Runnable, Task {
request = scheduler.poll(this);
}
} else {
//multi thread
final AtomicInteger threadAlive = new AtomicInteger(0);
while (true) {
if (request == null) {
//when no request found but some thread is alive, sleep a while.
try {
Thread.sleep(100);
} catch (InterruptedException e) {
......
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Selector that evaluates a CSS (jQuery-style) query against HTML text using jsoup
 * and returns the outer HTML of the matching elements.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 9:39 AM
 */
public class CssSelector implements Selector {

    /** The CSS query passed to jsoup's {@code select}. */
    private final String selectorText;

    /**
     * Creates a selector for the given CSS query.
     *
     * @param selectorText the CSS query to evaluate
     */
    public CssSelector(String selectorText) {
        this.selectorText = selectorText;
    }

    /**
     * Parses {@code text} as HTML and returns the first element matching the query.
     *
     * @param text the HTML to search
     * @return the outer HTML of the first matching element, or {@code null} when
     *         nothing matches
     */
    @Override
    public String select(String text) {
        Document doc = Jsoup.parse(text);
        Elements elements = doc.select(selectorText);
        // Bail out when there is NO match; the previous check was inverted and
        // returned null on a match while indexing into an empty result otherwise.
        if (CollectionUtils.isEmpty(elements)) {
            return null;
        }
        return elements.get(0).outerHtml();
    }

    /**
     * Parses {@code text} as HTML and returns every element matching the query.
     *
     * @param text the HTML to search
     * @return the outer HTML of each matching element, in the order jsoup yields
     *         them; an empty list when nothing matches
     */
    @Override
    public List<String> selectList(String text) {
        List<String> strings = new ArrayList<String>();
        Document doc = Jsoup.parse(text);
        Elements elements = doc.select(selectorText);
        if (CollectionUtils.isNotEmpty(elements)) {
            for (Element element : elements) {
                strings.add(element.outerHtml());
            }
        }
        return strings;
    }
}
......@@ -62,4 +62,10 @@ public class Html extends PlainText {
return selectList(xpathSelector, strings);
}
/**
 * Runs a CSS selector over this object's {@code strings} by wrapping the
 * expression in a {@link CssSelector} and delegating to {@code selectList}.
 *
 * @param selector the CSS query
 * @return the result of {@code selectList} for that selector
 */
@Override
public Selectable $(String selector) {
    return selectList(new CssSelector(selector), strings);
}
}
......@@ -33,6 +33,11 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException();
}
/**
 * CSS selection is not provided by this implementation; the call always fails.
 *
 * @param selector the CSS query (ignored)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable $(String selector) {
throw new UnsupportedOperationException();
}
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException();
......
......@@ -17,6 +17,14 @@ public interface Selectable {
*/
public Selectable xpath(String xpath);
/**
 * Select a list of results using a jQuery-style (CSS) selector.
 *
 * @param selector the jQuery-style selector expression
 * @return a Selectable representing the selection result
 */
public Selectable $(String selector);
/**
* select smart content with ReadAbility algorithm
*
......
......@@ -17,4 +17,5 @@ public class HtmlTest {
Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
}
}
......@@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
// Exercises XPath selection, regex + link extraction, and CSS-then-XPath chaining
// against the `html` fixture (defined outside this hunk).
public void testOschina() {
Html html1 = new Html(html);
// The title link text is located via an XPath on the 'QTitle' class.
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
// Debug output only: prints the links found inside the <body> section.
System.out.println(html1.regex("(<body>.*?</body>)").links().toStrings());
// Chains the CSS selector with an XPath for href attributes.
// NOTE(review): only checks the list is non-null, not its contents — consider a stronger assertion.
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment