Commit fa792e31 authored by shenjunlin's avatar shenjunlin

添加爬虫

parent 7726aabc
......@@ -8,3 +8,4 @@ out/
bin/
.myeclipse
*.log
spider/
......@@ -177,8 +177,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
......
......@@ -15,15 +15,36 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid-spring-boot-starter</artifactId>
<version>1.1.0</version>
</dependency>
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.16</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
package us.codecraft.webmagic.model.samples;
import lombok.Data;
import lombok.ToString;
/**
 * Value object holding one community listing row scraped by AnjukeSpider and
 * persisted through DBUtils.add(AnjuKeVO).
 * Lombok's {@code @Data} generates the getters/setters used by those classes;
 * it already implies {@code @ToString}, so the explicit annotation is redundant but harmless.
 */
@Data
@ToString
public class AnjuKeVO {
// City text read from the listing page header.
private String city;
// Community name taken from the "h3 > a" text of a listing item.
private String communityName;
// District/area: the part of the address text between "[" and "]".
private String area;
// Street address: the part of the address text after "]".
private String address;
// Price text as displayed on the page (kept as a string, not parsed).
private String price;
// URL of the listing page the row was scraped from; also used for the "already crawled" lookup.
private String url;
}
package us.codecraft.webmagic.model.samples;
/**
 * Plain bean describing one college record. The data is crawled from
 * http://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1.dhtml
 */
public class CollegeVO {

    private String name;
    private String city;
    private String BelongTo;
    private String education;
    private String type;
    private String schoolType;
    private String eduType;
    // The three int fields below look like 0/1 flags — TODO confirm the value range with the crawler.
    private int is_985;
    private int is_211;
    private int is_yan;

    /** @return the college name */
    public String getName() {
        return this.name;
    }

    /** @param name the college name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the city the college is located in */
    public String getCity() {
        return this.city;
    }

    /** @param city the city the college is located in */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the "belong to" value of the college */
    public String getBelongTo() {
        return this.BelongTo;
    }

    /** @param belongTo the "belong to" value of the college */
    public void setBelongTo(String belongTo) {
        this.BelongTo = belongTo;
    }

    /** @return the education value */
    public String getEducation() {
        return this.education;
    }

    /** @param education the education value */
    public void setEducation(String education) {
        this.education = education;
    }

    /** @return the type value */
    public String getType() {
        return this.type;
    }

    /** @param type the type value */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the school type value */
    public String getSchoolType() {
        return this.schoolType;
    }

    /** @param schoolType the school type value */
    public void setSchoolType(String schoolType) {
        this.schoolType = schoolType;
    }

    /** @return the edu type value */
    public String getEduType() {
        return this.eduType;
    }

    /** @param eduType the edu type value */
    public void setEduType(String eduType) {
        this.eduType = eduType;
    }

    /** @return the is_985 value */
    public int getIs_985() {
        return this.is_985;
    }

    /** @param is_985 the is_985 value */
    public void setIs_985(int is_985) {
        this.is_985 = is_985;
    }

    /** @return the is_211 value */
    public int getIs_211() {
        return this.is_211;
    }

    /** @param is_211 the is_211 value */
    public void setIs_211(int is_211) {
        this.is_211 = is_211;
    }

    /** @return the is_yan value */
    public int getIs_yan() {
        return this.is_yan;
    }

    /** @param is_yan the is_yan value */
    public void setIs_yan(int is_yan) {
        this.is_yan = is_yan;
    }

    /**
     * Renders every field as {@code CollegeVO{field='value', ...}}; string fields are
     * single-quoted, int fields are not, and null strings print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CollegeVO{");
        text.append("name='").append(name).append('\'');
        text.append(", city='").append(city).append('\'');
        text.append(", BelongTo='").append(BelongTo).append('\'');
        text.append(", education='").append(education).append('\'');
        text.append(", type='").append(type).append('\'');
        text.append(", schoolType='").append(schoolType).append('\'');
        text.append(", eduType='").append(eduType).append('\'');
        text.append(", is_985=").append(is_985);
        text.append(", is_211=").append(is_211);
        text.append(", is_yan=").append(is_yan);
        text.append('}');
        return text.toString();
    }
}
package us.codecraft.webmagic.model.samples;
import lombok.Data;
import lombok.ToString;
/**
 * Value object for one entry of a "top / trending" list, as filled in by TopBaiduSpider.
 * Lombok's {@code @Data} generates the accessors; it already implies {@code @ToString},
 * so the explicit annotation is redundant but harmless.
 */
@Data
@ToString
public class TopVO {
// Keyword text of the entry (the link text of the keyword cell).
private String keywords;
// Link target (href) of the keyword.
private String url;
// Score text as displayed on the page (kept as a string, not parsed).
private String score;
}
package us.codecraft.webmagic.samples;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.model.samples.AnjuKeVO;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.DBUtils;
import javax.management.JMException;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Crawls community listings from anjuke.com and stores each row through {@link DBUtils}.
 *
 * <p>The crawl starts at the city index page, queues the first community listing page of
 * every city found there, and then follows the pagination links of each listing page.
 */
public class AnjukeSpider implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(AnjukeSpider.class);

    private Site site = Site.me().setRetryTimes(3).setSleepTime(3000).setCycleRetryTimes(3)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /** City base URLs collected from the city index page. */
    private List<String> cityUrlList = new CopyOnWriteArrayList<>();

    /**
     * Handles either the city index page or a community listing page.
     *
     * <p>Pages whose URL is already stored in the database are skipped. Missing or
     * unexpectedly formatted address/price elements no longer abort the whole page with a
     * NullPointerException; the affected fields are simply stored as {@code null}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getRequest().getUrl();
        // DBUtils.isSpider returns false when a row with this URL already exists.
        if (!DBUtils.isSpider(pageUrl)) {
            logger.warn("{}已经采集过", pageUrl);
            return;
        }
        if (StringUtils.equalsIgnoreCase("https://www.anjuke.com/sy-city.html", pageUrl)) {
            // City index page: queue page 1 of the community list of every city.
            cityUrlList.addAll(page.getHtml().xpath("/html/body/div[3]/div/div[2]").links().regex("(https://\\w+.anjuke.com)").all());
            for (String detailUrl : cityUrlList) {
                page.addTargetRequest(detailUrl + "/community/p1/");
            }
        } else {
            // Community listing page.
            String city = page.getHtml().xpath("//*[@id=\"list-content\"]/div[1]/span/em[1]/text()").get();
            List<Selectable> selectables = page.getHtml().xpath("*[@id=\"list-content\"]/div[@class='li-itemmod']").nodes();
            if (selectables.size() == 0) {
                logger.warn("没找到指定内容={}", pageUrl);
            }
            for (Selectable selectable : selectables) {
                DBUtils.add(toListing(selectable, city, pageUrl));
            }
            page.addTargetRequests(page.getHtml().$(".page-content").links().all());
        }
    }

    /** Builds one listing row from a listing item node, tolerating missing elements. */
    private static AnjuKeVO toListing(Selectable selectable, String city, String pageUrl) {
        String communityName = selectable.$("h3 > a", "text").get();
        // Address text is expected to look like "[area] street address".
        String areaAddress = selectable.$("address", "text").get();
        // StringUtils.trim is null-safe; substringBetween returns null when the
        // brackets are absent, which previously caused a NullPointerException.
        String area = StringUtils.trim(StringUtils.substringBetween(areaAddress, "[", "]"));
        String address = StringUtils.trim(StringUtils.substringAfter(areaAddress, "]"));
        String price = StringUtils.trim(selectable.$(".li-side > p > strong", "text").get());

        AnjuKeVO anjuKeVO = new AnjuKeVO();
        anjuKeVO.setAddress(address);
        anjuKeVO.setArea(area);
        anjuKeVO.setCity(city);
        anjuKeVO.setPrice(price);
        anjuKeVO.setCommunityName(communityName);
        anjuKeVO.setUrl(pageUrl);
        return anjuKeVO;
    }

    /** @return the crawl settings (retries, sleep time, user agent) for this spider */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from the city index page with two threads, using a file-backed
     * URL queue stored under the "spider" directory.
     *
     * @param args unused
     * @throws JMException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws JMException {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
//        httpClientDownloader.setProxyProvider(new SimpleProxyProvider(ProxyUtils.getAllProxy()));
        Spider anjuke = Spider.create(new AnjukeSpider()).setDownloader(httpClientDownloader)
                .setScheduler(new FileCacheQueueScheduler("spider").setDuplicateRemover(new HashSetDuplicateRemover()))
                .addUrl("https://www.anjuke.com/sy-city.html")
//                .addUrl("https://lvliang.anjuke.com/community/p1/")
                .thread(2);
        anjuke.start();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor that follows post links and extracts the title and main content of a page.
 *
 * @author code4crafter@gmail.com <br>
 */
public class QzoneBlogProcessor implements PageProcessor {

    /** Captures the href of every anchor that points to a 17dujingdian.com post. */
    private static final String POST_LINK_REGEX =
            "<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues every matching post link found in the page and stores the "title" and
     * "content" fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Example URLs noted by the original author:
        //http://progressdaily.diandian.com/post/2013-01-24/40046867275
        //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
        // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
        page.addTargetRequests(page.getHtml().regex(POST_LINK_REGEX).all());
        page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
        page.putField("content", page.getHtml().smartContent());
    }

    /** @return a fresh site configuration with the diandian.com domain and a desktop user agent */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("www.diandian.com")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.samples.TopVO;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Periodically crawls the Baidu hot-keyword list and logs each entry.
 * Target page: http://top.baidu.com/buzz?b=1&fr=topindex
 */
public class TopBaiduSpider implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(TopBaiduSpider.class);

    private Site site = Site.me().setRetryTimes(3).setSleepTime(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /**
     * Reads the rows of the ".list-table" element and logs one {@link TopVO} per row.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> rows = page.getHtml().$(".list-table").$("tr").nodes();
        // Row 0 is skipped — presumably the table header; confirm against the page markup.
        for (int index = 1; index < rows.size(); index++) {
            Selectable row = rows.get(index);
            Selectable firstCell = row.$(".first");
            // NOTE(review): this only filters rows if $() can return null — confirm;
            // if it never does, every row after the first is logged.
            if (firstCell == null) {
                continue;
            }
            TopVO entry = new TopVO();
            entry.setKeywords(row.$(".keyword > a", "text").get());
            entry.setUrl(row.$(".keyword > a", "href").get());
            entry.setScore(row.$(".last > span", "text").get());
            logger.warn("{}", entry);
        }
    }

    /** @return the crawl settings (retries, sleep time, user agent) for this spider */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Schedules a new single-threaded crawl every 600 seconds, the first one after 1 second.
     *
     * @param args unused
     * @throws JMException declared by the original signature
     */
    public static void main(String[] args) throws JMException {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        scheduler.scheduleAtFixedRate(() -> {
            Spider topBaidu = Spider.create(new TopBaiduSpider())
                    .setScheduler(new FileCacheQueueScheduler("spider").setDuplicateRemover(new HashSetDuplicateRemover()))
                    .addUrl("http://top.baidu.com/buzz?b=1&fr=topindex")
                    .thread(1);
            topBaidu.start();
        }, 1, 600, TimeUnit.SECONDS);
    }
}
......@@ -11,6 +11,7 @@ import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
import java.util.List;
import java.util.concurrent.*;
/**
* 爬取的页面 http://s.weibo.com/top/summary
......@@ -42,9 +43,21 @@ public class WeiboTopSpider implements PageProcessor {
}
/**
 * Entry point of the Weibo top-list crawl.
 * As listed here, it starts one spider immediately and then schedules a fresh
 * single-threaded spider at a fixed rate.
 * NOTE(review): this listing appears to come from a diff in which removed and added lines
 * are merged; the immediate start below may be the removed version — confirm against the
 * real file.
 */
public static void main(String[] args) throws JMException {
// Immediate one-off crawl of the summary page, rendered through SeleniumDownloader.
Spider weiboSpider = Spider.create(new WeiboTopSpider())
.addUrl("http://s.weibo.com/top/summary").setDownloader(new SeleniumDownloader())
.thread(1);
weiboSpider.start();
// Repeated crawl: first run after 1 second, then every 30 seconds.
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
executorService.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
// Each run builds a new Spider and a new SeleniumDownloader.
Spider weiboSpider = Spider.create(new WeiboTopSpider())
.addUrl("http://s.weibo.com/top/summary").setDownloader(new SeleniumDownloader())
.thread(1);
weiboSpider.start();
}
}, 1, 30, TimeUnit.SECONDS);
}
}
package us.codecraft.webmagic.utils;
import com.alibaba.druid.pool.DruidDataSource;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import us.codecraft.webmagic.model.samples.AnjuKeVO;
import us.codecraft.webmagic.model.samples.CollegeVO;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.List;
/**
 * Static JDBC helpers for the sample spiders, backed by a Druid connection pool that
 * points at the local "spider" MySQL database.
 */
public class DBUtils {

    private static final DruidDataSource dataSource = new DruidDataSource();

    static {
        String dbUrl = "jdbc:mysql://127.0.0.1:3306/spider?autoReconnect=true&useUnicode=true&characterEncoding=utf-8";
        String dbUser = "root";
        String dbPwd = "123456";
        String jdbcDriver = "com.mysql.jdbc.Driver";
        dataSource.setUrl(dbUrl);
        dataSource.setPassword(dbPwd);
        dataSource.setUsername(dbUser);
        dataSource.setDriverClassName(jdbcDriver);
        dataSource.setMaxActive(300);
        dataSource.setInitialSize(2);
        dataSource.setMaxWait(60000);
        dataSource.setMinIdle(3);
    }

    /**
     * Tells whether the given URL still needs to be crawled.
     *
     * @param url the page URL to look up in the {@code anjuke} table
     * @return {@code false} when at least one row with this URL exists; {@code true}
     *         when none exists or when the lookup fails
     */
    public static boolean isSpider(String url) {
        // A QueryRunner built on the data source obtains and closes its own connection.
        QueryRunner qr = new QueryRunner(dataSource);
        String selectSql = "select * from anjuke where url = ?";
        try {
            List<AnjuKeVO> anjuKeVOList = qr.query(selectSql, new BeanListHandler<>(AnjuKeVO.class), url);
            return CollectionUtils.isEmpty(anjuKeVOList);
        } catch (SQLException e) {
            e.printStackTrace();
            // Best effort: a failed lookup is treated as "not crawled yet".
            return true;
        }
    }

    /**
     * Inserts one community listing into the {@code anjuke} table. SQL errors are
     * printed and otherwise ignored.
     *
     * @param anjuKeVO the listing to store
     */
    public static void add(AnjuKeVO anjuKeVO) {
        QueryRunner qr = new QueryRunner();
        String sql = "insert into anjuke(city, community_name,area,address, price, url) values(?,?,?,?,?, ?)";
        Object[] params = {anjuKeVO.getCity(), anjuKeVO.getCommunityName(), anjuKeVO.getArea(),
                anjuKeVO.getAddress(), anjuKeVO.getPrice(), anjuKeVO.getUrl()};
        try (Connection connection = dataSource.getConnection()) {
            qr.update(connection, sql, params);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one college record into the {@code college} table. SQL errors are
     * printed and otherwise ignored.
     *
     * @param collegeVO the college to store
     */
    public static void add(CollegeVO collegeVO) {
        QueryRunner qr = new QueryRunner();
        String sql = "insert into college(name, city, belong_to,education,type, school_type, edu_type , is_985, is_211, is_yan) values(?,?,?,?,?,?,?,?,?,?)";
        // The fifth parameter feeds the "type" column, so it must come from getType()
        // (it was previously bound to getEduType(), duplicating edu_type).
        Object[] params = {collegeVO.getName(), collegeVO.getCity(), collegeVO.getBelongTo(),
                collegeVO.getEducation(), collegeVO.getType(), collegeVO.getSchoolType(),
                collegeVO.getEduType(), collegeVO.getIs_985(), collegeVO.getIs_211(), collegeVO.getIs_yan()};
        // update(Connection, ...) does not close the connection, so return it to the
        // pool explicitly; otherwise every insert leaks one pooled connection.
        try (Connection connection = dataSource.getConnection()) {
            qr.update(connection, sql, params);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
......@@ -17,13 +17,9 @@ import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
/**
* 使用Selenium调用浏览器进行渲染。目前仅支持chrome。<br>
* 需要下载Selenium driver支持。<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:37 <br>
* 使用Selenium渲染网页进行爬取
*/
public class SeleniumDownloader implements Downloader, Closeable {
......
......@@ -5,6 +5,7 @@ import org.apache.log4j.Logger;
import org.openqa.selenium.Capabilities;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.opera.OperaDriver;
......@@ -19,11 +20,8 @@ import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午1:41 <br>
* web Driver连接池
*/
public class WebDriverPool {
private Logger logger = Logger.getLogger(getClass());
......@@ -74,6 +72,10 @@ public class WebDriverPool {
InternetExplorerDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
mDriver = new InternetExplorerDriver();
break;
case Microsoft_Edge:
EdgeDriverManager.getInstance().useTaobaoMirror().forceCache().setup();
mDriver = new EdgeDriver();
break;
}
}
......@@ -86,7 +88,7 @@ public class WebDriverPool {
/**
* store webDrivers available
*/
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<>();
public WebDriverPool(int capacity, DriverType driverType) {
this.capacity = capacity;
......@@ -150,7 +152,6 @@ public class WebDriverPool {
for (WebDriver webDriver : webDriverList) {
logger.info("Quit webDriver" + webDriver);
webDriver.quit();
webDriver = null;
}
}
......
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample that fetches a JavaScript-rendered Google Play page through the Selenium downloader
 * and stores the whole HTML.<br>
 * (The original note mentions PhantomJS; the driver actually used is whatever
 * {@code new SeleniumDownloader()} defaults to.)
 *
 * @author bob.li.0718@gmail.com <br>
 * Date: 15-7-11 <br>
 */
public class GooglePlayProcessor implements PageProcessor {

    /** Created on first use by {@link #getSite()}. */
    private Site site;

    /**
     * Stores the full page HTML under the "whole-html" field.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String wholeHtml = page.getHtml().toString();
        page.putField("whole-html", wholeHtml);
    }

    /** @return the site settings for play.google.com, built on the first call and reused afterwards */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = Site.me().setDomain("play.google.com").setSleepTime(300);
        return site;
    }

    /**
     * Crawls one app detail page asynchronously with five threads and writes the result to disk.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String outputDir = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/";
        String startUrl = "https://play.google.com/store/apps/details?id=com.tencent.mm";
        Spider.create(new GooglePlayProcessor())
                .thread(5)
                .addPipeline(new FilePipeline(outputDir))
                .setDownloader(new SeleniumDownloader())
                .addUrl(startUrl)
                .runAsync();
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.downloader.selenium.WebDriverPool;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Extractor for huaban.com.<br>
 * Uses Selenium to render the pages dynamically.<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 4:08 PM <br>
 */
public class HuabanProcessor implements PageProcessor {

    /** Created on first use by {@link #getSite()}. */
    private Site site;

    /**
     * Queues every huaban.com link on the page; for URLs containing "pins" the image
     * source is stored under "img", all other pages are marked as skipped.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
        boolean isPinPage = page.getUrl().toString().contains("pins");
        if (!isPinPage) {
            page.getResultItems().setSkip(true);
            return;
        }
        page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
    }

    /** @return the site settings for huaban.com, built on the first call and reused afterwards */
    @Override
    public Site getSite() {
        if (site != null) {
            return site;
        }
        site = Site.me().setDomain("huaban.com").setSleepTime(0);
        return site;
    }

    /**
     * Crawls huaban.com asynchronously with five threads, rendering pages with PhantomJS
     * and writing results to disk.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String outputDir = "/data/webmagic/test/";
        String startUrl = "http://huaban.com/";
        Spider.create(new HuabanProcessor())
                .thread(5)
                .addPipeline(new FilePipeline(outputDir))
                .setDownloader(new SeleniumDownloader(WebDriverPool.DriverType.PhantomJS))
                .addUrl(startUrl)
                .runAsync();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment