Commit 1e486d20 authored by shenjunlin's avatar shenjunlin

修改代码

parent 6f611a1f
......@@ -51,7 +51,6 @@
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-scripts/</module>
<module>webmagic-selenium</module>
<module>webmagic-saxon</module>
<module>webmagic-samples</module>
</modules>
......
......@@ -16,7 +16,7 @@ public class ZhihuPageProcessor implements PageProcessor {
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());
page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
page.putField("title", page.getHtml().xpath("//*[@id=\"zh-recommend-list\"]/div/h2/a/text()").toString());
page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString());
page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());
if (page.getResultItems().get("title")==null){
......
......@@ -8,6 +8,9 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-extension</artifactId>
<properties>
<webdrivermanager.version>2.1.0</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
......@@ -18,8 +21,6 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>us.codecraft.duiba</groupId>
......@@ -30,6 +31,31 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>io.github.bonigarcia</groupId>
<artifactId>webdrivermanager</artifactId>
<version>${webdrivermanager.version}</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.4.3</version>
<exclusions>
<exclusion>
<artifactId>selenium-remote-driver</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
<exclusion>
<artifactId>selenium-api</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package us.codecraft.webmagic.downloader.selenium;
package us.codecraft.webmagic.downloader;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
......@@ -10,7 +10,6 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
......@@ -99,8 +98,8 @@ public class SeleniumDownloader implements Downloader, Closeable {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
//webDriverPool.returnToPool(webDriver);
webDriverPool.closeAll();
webDriverPool.returnToPool(webDriver);
//webDriverPool.closeAll();
return page;
}
......
package us.codecraft.webmagic.downloader.selenium;
package us.codecraft.webmagic.downloader;
import io.github.bonigarcia.wdm.*;
import org.openqa.selenium.WebDriver;
......
package us.codecraft.webmagic.downloader.selenium;
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
......
package us.codecraft.webmagic.downloader.selenium;
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
......
......@@ -44,6 +44,18 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>com.belerweb</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.6</version>
</dependency>
</dependencies>
......
package us.codecraft.webmagic.model.samples;
/**
 * Value object for one mobile phone record.
 *
 * <p>Every property is a plain, nullable {@link String}; the class performs no
 * validation. {@code DBUtils.add(MobileVO)} writes the six properties to the
 * {@code mobile} table.
 */
public class MobileVO {

    private String oldModel;
    private String brandName;
    private String brandPinyin;
    private String mobileName;
    private String model;
    private String price;

    // ---- getters ----

    /** @return the value previously passed to {@link #setOldModel(String)}, or {@code null} */
    public String getOldModel() {
        return this.oldModel;
    }

    /** @return the brand name, or {@code null} if unset */
    public String getBrandName() {
        return this.brandName;
    }

    /** @return the brand pinyin, or {@code null} if unset */
    public String getBrandPinyin() {
        return this.brandPinyin;
    }

    /** @return the mobile name, or {@code null} if unset */
    public String getMobileName() {
        return this.mobileName;
    }

    /** @return the model, or {@code null} if unset */
    public String getModel() {
        return this.model;
    }

    /** @return the price as text, or {@code null} if unset */
    public String getPrice() {
        return this.price;
    }

    // ---- setters ----

    /** @param oldModel new value of the {@code oldModel} property */
    public void setOldModel(String oldModel) {
        this.oldModel = oldModel;
    }

    /** @param brandName new value of the {@code brandName} property */
    public void setBrandName(String brandName) {
        this.brandName = brandName;
    }

    /** @param brandPinyin new value of the {@code brandPinyin} property */
    public void setBrandPinyin(String brandPinyin) {
        this.brandPinyin = brandPinyin;
    }

    /** @param mobileName new value of the {@code mobileName} property */
    public void setMobileName(String mobileName) {
        this.mobileName = mobileName;
    }

    /** @param model new value of the {@code model} property */
    public void setModel(String model) {
        this.model = model;
    }

    /** @param price new value of the {@code price} property */
    public void setPrice(String price) {
        this.price = price;
    }

    /**
     * Renders all six properties as {@code MobileVO{name='value', ...}}.
     * Unset properties are rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("MobileVO{");
        text.append("oldModel='").append(oldModel).append('\'');
        text.append(", brandName='").append(brandName).append('\'');
        text.append(", brandPinyin='").append(brandPinyin).append('\'');
        text.append(", mobileName='").append(mobileName).append('\'');
        text.append(", model='").append(model).append('\'');
        text.append(", price='").append(price).append('\'');
        text.append('}');
        return text.toString();
    }
}
......@@ -73,7 +73,7 @@ public class News163 implements MultiPageModel {
public static void main(String[] args) {
OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
.scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
.addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
}
}
......@@ -73,7 +73,6 @@ public class AnjukeSpider implements PageProcessor {
public static void main(String[] args) throws JMException {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
// httpClientDownloader.setProxyProvider(new SimpleProxyProvider(ProxyUtils.getAllProxy()));
Spider anjuke = Spider.create(new AnjukeSpider()).setDownloader(httpClientDownloader)
.setScheduler(new FileCacheQueueScheduler("spider").setDuplicateRemover(new HashSetDuplicateRemover()))
......
......@@ -34,4 +34,13 @@ public class GithubRepo {
public void setReadme(String readme) {
this.readme = readme;
}
/**
 * Renders the repository as {@code GithubRepo{name='...', author='...', readme='...'}}.
 * Unset fields are rendered as the text {@code null}.
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("GithubRepo{");
    text.append("name='").append(name).append('\'');
    text.append(", author='").append(author).append('\'');
    text.append(", readme='").append(readme).append('\'');
    text.append('}');
    return text.toString();
}
}
\ No newline at end of file
......@@ -27,6 +27,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
} else {
page.putField("repo", githubRepo);
}
}
@Override
......
package us.codecraft.webmagic.samples;
import com.google.common.collect.Lists;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
......@@ -16,7 +18,6 @@ public class IteyeBlogProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all());
page.putField("title",page.getHtml().xpath("//title").toString());
page.putField("content",page.getHtml().smartContent().toString());
}
@Override
......@@ -28,6 +29,11 @@ public class IteyeBlogProcessor implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
final ResultItemsCollectorPipeline resultItemsCollectorPipeline = new ResultItemsCollectorPipeline();
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").addPipeline(resultItemsCollectorPipeline).run();
// Spider.create(new IteyeBlogProcessor()).thread(5).startUrls(Lists.newArrayList("http://yanghaoli.iteye.com/")).addPipeline(resultItemsCollectorPipeline).run();
resultItemsCollectorPipeline.getCollected().stream().forEach(resultItems -> {
System.out.println(resultItems);
});
}
}
......@@ -30,6 +30,7 @@ public class SinaBlogProcessor implements PageProcessor {
//文章页
} else {
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
System.out.println(page.getHtml().xpath("//div[@class='articalTitle']/h2"));
page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("date",
page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
......
......@@ -5,7 +5,7 @@ import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.downloader.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
......
package us.codecraft.webmagic.samples;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Sample crawler that runs one ZOL search per entry of {@code Mobils.models}
 * and prints, for each result page, the searched keyword and the brand link
 * text found on the page.
 */
public class ZolBrandSpider implements PageProcessor {

    // Bound to this class (was AnjukeSpider.class, a copy/paste slip that
    // attributed this spider's log output to another crawler).
    private static final Logger logger = LoggerFactory.getLogger(ZolBrandSpider.class);

    /** Prefix of the ZOL search URL; the search keyword is appended to it. */
    private static final String SEARCH_URL_PREFIX =
            "http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=";

    /** Marker used to recover the keyword from the page URL. */
    private static final String KEYWORD_PARAM = "&keyword=";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(3000).setCycleRetryTimes(3)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
            .addHeader("Referer", "http://www.zol.com.cn/");

    /**
     * Prints the keyword taken from the page URL and the brand text selected
     * from the phone list area of the page.
     *
     * @param page the downloaded search result page
     */
    @Override
    public void process(Page page) {
        // Everything after "&keyword=" in the URL is the model that was searched for.
        String oldBrand = StringUtils.substringAfter(page.getUrl().get(), KEYWORD_PARAM);
        System.out.println(oldBrand);

        // The raw page is only useful while debugging selectors, so it is no
        // longer dumped unconditionally to stdout.
        if (logger.isDebugEnabled()) {
            logger.debug("page content of {}:\n{}", page.getUrl().get(), page.getHtml().get());
        }

        // Phone list area.
        Selectable listItems = page.getHtml().xpath("//*[@class=\"list-item\"]");

        // The expression is a CSS selector, so it has to be evaluated with $()
        // rather than xpath().
        // NOTE(review): "text" extracts the link text; confirm that the brand
        // link is actually located inside the .list-item element.
        String brand = listItems.$("#pro-intro > ul > li.cate > a:nth-child(2)", "text").get();
        System.out.println(brand);

        // TODO: enqueue follow-up (pagination) requests once the selector for
        // them is known.
    }

    /** @return the site settings (retries, sleep time, user agent, referer) used by this spider */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Builds one search URL per model and starts the spider with a single thread.
     *
     * @param args unused
     * @throws JMException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws JMException {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

        List<String> urls = Lists.newArrayList();
        // NOTE(review): the keyword is appended without URL encoding; if
        // Mobils.models can contain spaces or non-ASCII text this should be
        // encoded - confirm against the model list.
        for (String model : Mobils.models) {
            urls.add(SEARCH_URL_PREFIX + model);
        }

        Spider zol = Spider.create(new ZolBrandSpider())
                .setDownloader(httpClientDownloader)
                .addUrl(urls.toArray(new String[0]))
                .thread(1);
        zol.start();
    }
}
package us.codecraft.webmagic.samples;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.model.samples.AnjuKeVO;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.DBUtils;
import javax.management.JMException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Sample crawler for the ZOL mobile phone list page. For every list entry it
 * prints {@code brandPinyin--brandName--model--price} to standard output.
 */
public class ZolMobileSpider implements PageProcessor {

    // Bound to this class (was AnjukeSpider.class, a copy/paste slip that
    // attributed this spider's log output to another crawler).
    private static final Logger logger = LoggerFactory.getLogger(ZolMobileSpider.class);

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(3000).setCycleRetryTimes(3)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /** Known brand names; a phone title is split into brand and model using these. */
    private final Set<String> mobileBrandNames = Sets.newHashSet("vivo","OPPO","华为","三星","苹果","荣耀","金立","魅族","中兴","Moto","努比亚",
            "一加","锤子科技","360","国美手机","小米","夏普","华硕","美图","诺基亚","HTC","8848","SUGAR","黑莓","海信","AGM",
            "索尼移动","酷派","LG","联想","联想ZUK","谷歌","飞利浦","朵唯","大神","酷比","天语","微软","小辣椒","TCL","长虹",
            "康佳","中国移动","YotaPhone","雷蛇","MANN","纽曼","邦华","海尔","VEB","惠普","乐目","格力","云创通","COMIO",
            "小格雷","sonim","神舟","先锋","BDV","imoo","innos","蓝魔","汇威","柯达","富可视","Acer宏碁","PPTV","松下",
            "manta","TP-LINK","索野","同洲","达闼","奇酷","乐视","明基","UT斯达康","大可乐","ivvi","青橙","守护宝","21克",
            "克里特","保千里","新石器","GEMRY","云狐","阿尔卡特","朗界","卡布奇诺","青葱","彩石","首云","领虎","传奇","独影天幕",
            "米蓝","青想","华度","超多维","优豊","百合","铂爵","易百年","全普","泛泰","意龙","阔密","Ant one","途为","VAIO",
            "小宇宙","图灵","VANO","美猴王","垦鑫达","读书郎","IUNI","波导","红鸟","BROR","言信","雅马亚虎","卓普","宝丽来",
            "nibiru","美莱仕","直角","百事","欧恩","亿通","Gigaset金阶");

    /**
     * Extracts every phone entry of the list area and prints its brand pinyin,
     * brand, model and price.
     *
     * @param page the downloaded list page
     */
    @Override
    public void process(Page page) {
        // Phone list area.
        Selectable mobileSelectable = page.getHtml().xpath("/html/body/div[5]/div[1]/div[4]");
        List<Selectable> mobils = mobileSelectable.xpath("//*[@id=\"J_PicMode\"]/li").nodes();
        if (mobils.isEmpty()) {
            logger.warn("no phone entries found on {}", page.getUrl().get());
        }

        for (Selectable selectable : mobils) {
            String title = selectable.xpath("/li/h3/a/text()").get();
            if (title == null) {
                // Entry without a title link: nothing to split into brand/model.
                continue;
            }
            // Drop the trailing "(...)" remark of the title.
            String name = StringUtils.substringBefore(title, "(").trim();

            // Brand and model are resolved per entry, so an entry whose brand
            // is unknown no longer inherits the brand of the previous entry.
            String brandName = findBrand(name);
            String model = brandName == null
                    ? name // unknown brand: keep the full name so nothing is lost
                    : StringUtils.replace(name, brandName, "").trim();

            String price = selectable.xpath("/li/div/span[2]/b[2]/text()").get();
            String pinyin = changeToPinYin(brandName);
            System.out.println(pinyin + "--" + brandName + "--" + model + "--" + price);
        }
        // TODO: 2018/2/5 add the follow-up (pagination) pages as target requests.
    }

    /** @return the site settings (retries, sleep time, user agent) used by this spider */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Finds the known brand contained in {@code name}, preferring the longest
     * match so that e.g. "联想ZUK" wins over "联想" regardless of set iteration order.
     *
     * @param name the cleaned phone title, never {@code null}
     * @return the matching brand, or {@code null} when no known brand occurs in the name
     */
    private String findBrand(String name) {
        String best = null;
        for (String brand : mobileBrandNames) {
            if (name.contains(brand) && (best == null || brand.length() > best.length())) {
                best = brand;
            }
        }
        return best;
    }

    /**
     * Converts a string to toneless pinyin, using the first reading of each character.
     *
     * @param str the text to convert, may be {@code null}
     * @return the concatenated pinyin; {@code str} itself when any character
     *         has no pinyin reading or the conversion fails; {@code null} for {@code null} input
     */
    private String changeToPinYin(String str) {
        if (str == null) {
            return null;
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        StringBuilder fullPrint = new StringBuilder();
        for (char ch : str.toCharArray()) {
            try {
                String[] readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
                if (readings == null || readings.length == 0) {
                    // Character without a pinyin reading: fall back to the input unchanged.
                    return str;
                }
                fullPrint.append(readings[0]);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                // Same fallback as for characters without a reading, instead of
                // silently producing a partial result.
                logger.warn("cannot convert '{}' to pinyin", str, e);
                return str;
            }
        }
        return fullPrint.toString();
    }

    /**
     * Starts the spider on the first ZOL phone list page with two threads.
     *
     * @param args unused
     * @throws JMException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws JMException {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        Spider zol = Spider.create(new ZolMobileSpider())
                .setDownloader(httpClientDownloader)
                .addUrl("http://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html")
                .thread(2);
        zol.start();
    }
}
......@@ -6,6 +6,7 @@ import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import us.codecraft.webmagic.model.samples.AnjuKeVO;
import us.codecraft.webmagic.model.samples.CollegeVO;
import us.codecraft.webmagic.model.samples.MobileVO;
import java.sql.Connection;
import java.sql.SQLException;
......@@ -69,4 +70,31 @@ public class DBUtils {
}
}
/**
 * Inserts one row into the {@code mobile} table from the given value object.
 * A {@link SQLException} is printed to standard error and not rethrown.
 *
 * @param mobileVO source of the six column values; must not be {@code null}
 */
public static void add(MobileVO mobileVO) {
    final String insertSql = "insert into mobile(old_model, brand_name, brand_pinyin, mobile_name,model, price) values(?,?,?,?,?,?)";
    // Values in the same order as the column list of the statement.
    final Object[] values = new Object[] {
            mobileVO.getOldModel(),
            mobileVO.getBrandName(),
            mobileVO.getBrandPinyin(),
            mobileVO.getMobileName(),
            mobileVO.getModel(),
            mobileVO.getPrice()
    };
    final QueryRunner runner = new QueryRunner();
    // The connection is closed automatically when the try block exits.
    try (Connection conn = dataSource.getConnection()) {
        runner.update(conn, insertSql, values);
    } catch (SQLException sqlError) {
        sqlError.printStackTrace();
    }
}
/**
 * Tells whether a model still has to be crawled.
 *
 * @param oldModel value compared against the {@code old_model} column
 * @return {@code false} when the {@code mobile} table already holds at least
 *         one row for {@code oldModel}; {@code true} when it holds none or
 *         when the query fails (the {@link SQLException} is printed to
 *         standard error)
 */
public static boolean isSpiderMobile(String oldModel) {
    final String lookupSql = "select * from mobile where old_model = ?";
    final QueryRunner runner = new QueryRunner(dataSource);
    boolean alreadyStored = false;
    try {
        String[] arguments = {oldModel};
        List<MobileVO> rows = runner.query(lookupSql, new BeanListHandler<>(MobileVO.class), arguments);
        alreadyStored = CollectionUtils.isNotEmpty(rows);
    } catch (SQLException sqlError) {
        sqlError.printStackTrace();
    }
    return !alreadyStored;
}
}
webmagic-extension
-------
webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。
\ No newline at end of file
# What WebDriver to use for the tests
driver=phantomjs
#driver=firefox
#driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
phantomjs_driver_loglevel=DEBUG
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId>
<properties>
<webdrivermanager.version>2.1.0</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>us.codecraft.duiba</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>com.github.detro</groupId>-->
<!--<artifactId>phantomjsdriver</artifactId>-->
<!--<version>1.2.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>io.github.bonigarcia</groupId>
<artifactId>webdrivermanager</artifactId>
<version>${webdrivermanager.version}</version>
</dependency>
<dependency>
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.4.3</version>
<exclusions>
<exclusion>
<artifactId>selenium-remote-driver</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
<exclusion>
<artifactId>selenium-api</artifactId>
<groupId>org.seleniumhq.selenium</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * Manual smoke test that drives a local Chrome through Selenium.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 12:27 PM <br>
 */
public class SeleniumTest {

    /**
     * Opens http://huaban.com/ in Chrome with image loading disabled and
     * prints the outer HTML of the root element.
     *
     * <p>Ignored by default because it needs a chromedriver binary at the
     * hard-coded local path.
     */
    @Ignore("need chrome driver")
    @Test
    public void testSelenium() {
        System.setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver");

        // Content setting "images" = 2 is passed to Chrome via chrome.prefs.
        Map<String, Object> contentSettings = new HashMap<String, Object>();
        contentSettings.put("images", 2);
        Map<String, Object> preferences = new HashMap<String, Object>();
        preferences.put("profile.default_content_settings", contentSettings);

        DesiredCapabilities caps = DesiredCapabilities.chrome();
        caps.setCapability("chrome.prefs", preferences);
        caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));

        WebDriver webDriver = new ChromeDriver(caps);
        try {
            webDriver.get("http://huaban.com/");
            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            System.out.println(webElement.getAttribute("outerHTML"));
        } finally {
            // quit() ends the whole driver session even when navigation or the
            // element lookup throws; a bare close() after the happy path left
            // the browser/driver running on failure.
            webDriver.quit();
        }
    }
}
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
phantomjs_driver_path=../../src/main.js
phantomjs_driver_loglevel=DEBUG
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment