Commit 42508af0 authored by yihua.huang

add huaban processor

parent fe224cbf
...@@ -15,6 +15,9 @@ public class ConsolePipeline implements Pipeline{ ...@@ -15,6 +15,9 @@ public class ConsolePipeline implements Pipeline{
@Override @Override
public void process(ResultItems resultItems,Task task) { public void process(ResultItems resultItems,Task task) {
if (resultItems.isSkip()){
return;
}
System.out.println("get page: "+resultItems.getRequest().getUrl()); System.out.println("get page: "+resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
System.out.println(entry.getKey()+":\t"+entry.getValue()); System.out.println(entry.getKey()+":\t"+entry.getValue());
......
...@@ -45,6 +45,9 @@ public class FilePipeline implements Pipeline { ...@@ -45,6 +45,9 @@ public class FilePipeline implements Pipeline {
if (!file.exists()) { if (!file.exists()) {
file.mkdirs(); file.mkdirs();
} }
if (resultItems.isSkip()){
return;
}
try { try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl()); printWriter.println("url:\t" + resultItems.getRequest().getUrl());
......
...@@ -53,6 +53,7 @@ public class SeleniumDownloader implements Downloader,Destroyable { ...@@ -53,6 +53,7 @@ public class SeleniumDownloader implements Downloader,Destroyable {
logger.warn("interrupted", e); logger.warn("interrupted", e);
return null; return null;
} }
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl()); webDriver.get(request.getUrl());
WebDriver.Options manage = webDriver.manage(); WebDriver.Options manage = webDriver.manage();
Site site = task.getSite(); Site site = task.getSite();
......
...@@ -7,8 +7,6 @@ import org.openqa.selenium.WebDriver; ...@@ -7,8 +7,6 @@ import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement; import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriver;
import java.util.List;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br> * @date: 13-7-26 <br>
...@@ -18,14 +16,12 @@ public class SeleniumTest { ...@@ -18,14 +16,12 @@ public class SeleniumTest {
@Ignore("need chrome driver") @Ignore("need chrome driver")
@Test @Test
public void test(){ public void testSelenium() {
System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver"); System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver");
WebDriver webDriver = new ChromeDriver(); WebDriver webDriver = new ChromeDriver();
webDriver.get("http://huaban.com/"); webDriver.get("http://huaban.com/");
List<WebElement> elements = webDriver.findElements(By.xpath("/html")); WebElement webElement = webDriver.findElement(By.xpath("/html"));
for (WebElement element : elements) { System.out.println(webElement.getAttribute("outerHTML"));
System.out.println(element.getAttribute("outerHTML"));
}
webDriver.close(); webDriver.close();
} }
} }
...@@ -22,6 +22,11 @@ ...@@ -22,6 +22,11 @@
<artifactId>webmagic-misc</artifactId> <artifactId>webmagic-misc</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
/**
 * Sample page processor for huaban.com.
 * <p>
 * Every link matching the huaban.com pattern is queued for crawling. Pages
 * whose URL contains "pins" contribute an "img" field; all other pages are
 * flagged as skipped on their result items.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-7-26 <br>
 * Time: 4:08 PM <br>
 */
public class HuabanProcessor implements PageProcessor {

    /** Site configuration, created on the first call to {@link #getSite()}. */
    private Site site;

    /**
     * Queues matching links from the page and extracts the pin image source.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Follow every link that stays on huaban.com.
        page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

        String currentUrl = page.getUrl().toString();
        boolean isPinPage = currentUrl.contains("pins");
        if (!isPinPage) {
            // Not a pin page: flag the result items as skipped.
            page.getResultItems().setSkip(true);
            return;
        }
        page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString());
    }

    /**
     * Returns the site configuration, building it on first use.
     *
     * @return the site with domain "huaban.com" and the home page as start URL
     */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/");
        return site;
    }

    /**
     * Launches the sample crawler asynchronously with a Redis scheduler, a
     * file pipeline and a Selenium downloader.
     * NOTE(review): the Redis host, output directory and driver path are
     * hard-coded for the author's machine; adjust before running.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HuabanProcessor processor = new HuabanProcessor();
        Spider.create(processor)
                .scheduler(new RedisScheduler("localhost"))
                .pipeline(new FilePipeline("/data/webmagic/test/"))
                .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
                .runAsync();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment