Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
1e486d20
Commit
1e486d20
authored
Mar 13, 2018
by
shenjunlin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改代码
parent
6f611a1f
Changes
25
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
25 changed files
with
3750 additions
and
142 deletions
+3750
-142
pom.xml
pom.xml
+0
-1
ZhihuPageProcessor.java
...ecraft/webmagic/processor/example/ZhihuPageProcessor.java
+1
-1
pom.xml
webmagic-extension/pom.xml
+28
-2
SeleniumDownloader.java
.../us/codecraft/webmagic/downloader/SeleniumDownloader.java
+3
-4
WebDriverPool.java
.../java/us/codecraft/webmagic/downloader/WebDriverPool.java
+1
-1
SeleniumDownloaderTest.java
...codecraft/webmagic/downloader/SeleniumDownloaderTest.java
+1
-1
WebDriverPoolTest.java
...a/us/codecraft/webmagic/downloader/WebDriverPoolTest.java
+1
-1
pom.xml
webmagic-samples/pom.xml
+12
-0
MobileVO.java
...in/java/us/codecraft/webmagic/model/samples/MobileVO.java
+76
-0
News163.java
...ain/java/us/codecraft/webmagic/model/samples/News163.java
+1
-1
AnjukeSpider.java
...main/java/us/codecraft/webmagic/samples/AnjukeSpider.java
+0
-1
GithubRepo.java
...c/main/java/us/codecraft/webmagic/samples/GithubRepo.java
+9
-0
GithubRepoPageProcessor.java
...s/codecraft/webmagic/samples/GithubRepoPageProcessor.java
+1
-0
IteyeBlogProcessor.java
...ava/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+8
-2
Mobils.java
...s/src/main/java/us/codecraft/webmagic/samples/Mobils.java
+3342
-0
SinaBlogProcessor.java
...java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
+1
-0
WeiboTopSpider.java
...in/java/us/codecraft/webmagic/samples/WeiboTopSpider.java
+1
-1
ZolBrandSpider.java
...in/java/us/codecraft/webmagic/samples/ZolBrandSpider.java
+102
-0
ZolMobileSpider.java
...n/java/us/codecraft/webmagic/samples/ZolMobileSpider.java
+134
-0
DBUtils.java
...es/src/main/java/us/codecraft/webmagic/utils/DBUtils.java
+28
-0
README.md
webmagic-selenium/README.md
+0
-3
config.ini
webmagic-selenium/config.ini
+0
-12
pom.xml
webmagic-selenium/pom.xml
+0
-59
SeleniumTest.java
...t/java/us/codecraft/webmagic/downloader/SeleniumTest.java
+0
-41
config.ini
webmagic-selenium/src/test/resources/config.ini
+0
-11
No files found.
pom.xml
View file @
1e486d20
...
...
@@ -51,7 +51,6 @@
<module>
webmagic-core
</module>
<module>
webmagic-extension/
</module>
<module>
webmagic-scripts/
</module>
<module>
webmagic-selenium
</module>
<module>
webmagic-saxon
</module>
<module>
webmagic-samples
</module>
</modules>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
View file @
1e486d20
...
...
@@ -16,7 +16,7 @@ public class ZhihuPageProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*"
).
all
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//
h1[@class='QuestionHeader-title']
/text()"
).
toString
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//
*[@id=\"zh-recommend-list\"]/div/h2/a
/text()"
).
toString
());
page
.
putField
(
"question"
,
page
.
getHtml
().
xpath
(
"//div[@class='QuestionRichText']//tidyText()"
).
toString
());
page
.
putField
(
"answer"
,
page
.
getHtml
().
xpath
(
"//div[@class='QuestionAnswer-content']/tidyText()"
).
toString
());
if
(
page
.
getResultItems
().
get
(
"title"
)==
null
){
...
...
webmagic-extension/pom.xml
View file @
1e486d20
...
...
@@ -8,6 +8,9 @@
<modelVersion>
4.0.0
</modelVersion>
<artifactId>
webmagic-extension
</artifactId>
<properties>
<webdrivermanager.version>
2.1.0
</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
...
...
@@ -18,8 +21,6 @@
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
<version>
15.0
</version>
<optional>
true
</optional>
</dependency>
<dependency>
<groupId>
us.codecraft.duiba
</groupId>
...
...
@@ -30,6 +31,31 @@
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
<dependency>
<groupId>
io.github.bonigarcia
</groupId>
<artifactId>
webdrivermanager
</artifactId>
<version>
${webdrivermanager.version}
</version>
</dependency>
<dependency>
<groupId>
org.seleniumhq.selenium
</groupId>
<artifactId>
selenium-java
</artifactId>
<version>
3.8.1
</version>
</dependency>
<dependency>
<groupId>
com.codeborne
</groupId>
<artifactId>
phantomjsdriver
</artifactId>
<version>
1.4.3
</version>
<exclusions>
<exclusion>
<artifactId>
selenium-remote-driver
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
<exclusion>
<artifactId>
selenium-api
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
\ No newline at end of file
webmagic-
selenium/src/main/java/us/codecraft/webmagic/downloader/selenium
/SeleniumDownloader.java
→
webmagic-
extension/src/main/java/us/codecraft/webmagic/downloader
/SeleniumDownloader.java
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
downloader
.
selenium
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.openqa.selenium.By
;
import
org.openqa.selenium.Cookie
;
...
...
@@ -10,7 +10,6 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.PlainText
;
...
...
@@ -99,8 +98,8 @@ public class SeleniumDownloader implements Downloader, Closeable {
page
.
setHtml
(
new
Html
(
content
,
request
.
getUrl
()));
page
.
setUrl
(
new
PlainText
(
request
.
getUrl
()));
page
.
setRequest
(
request
);
//
webDriverPool.returnToPool(webDriver);
webDriverPool
.
closeAll
();
webDriverPool
.
returnToPool
(
webDriver
);
//
webDriverPool.closeAll();
return
page
;
}
...
...
webmagic-
selenium/src/main/java/us/codecraft/webmagic/downloader/selenium
/WebDriverPool.java
→
webmagic-
extension/src/main/java/us/codecraft/webmagic/downloader
/WebDriverPool.java
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
downloader
.
selenium
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
io.github.bonigarcia.wdm.*
;
import
org.openqa.selenium.WebDriver
;
...
...
webmagic-
selenium/src/test/java/us/codecraft/webmagic/downloader/selenium
/SeleniumDownloaderTest.java
→
webmagic-
extension/src/test/java/us/codecraft/webmagic/downloader
/SeleniumDownloaderTest.java
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
downloader
.
selenium
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.junit.Ignore
;
import
org.junit.Test
;
...
...
webmagic-
selenium/src/test/java/us/codecraft/webmagic/downloader/selenium
/WebDriverPoolTest.java
→
webmagic-
extension/src/test/java/us/codecraft/webmagic/downloader
/WebDriverPoolTest.java
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
downloader
.
selenium
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.junit.Ignore
;
import
org.junit.Test
;
...
...
webmagic-samples/pom.xml
View file @
1e486d20
...
...
@@ -44,6 +44,18 @@
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
<dependency>
<groupId>
com.belerweb
</groupId>
<artifactId>
pinyin4j
</artifactId>
<version>
2.5.0
</version>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>
mysql
</groupId>
<artifactId>
mysql-connector-java
</artifactId>
<version>
5.1.6
</version>
</dependency>
</dependencies>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/MobileVO.java
0 → 100644
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
/**
 * Mutable JavaBean holding the string attributes of one mobile phone entry:
 * the original model text, the brand name, the brand name in pinyin, the
 * phone name, the model and the price.
 *
 * <p>The class keeps its implicit no-argument constructor and a getter/setter
 * pair for every attribute. Every attribute is {@code null} until set.
 */
public class MobileVO {

    private String oldModel;
    private String brandName;
    private String brandPinyin;
    private String mobileName;
    private String model;
    private String price;

    /** @return the original model text, or {@code null} if never set */
    public String getOldModel() {
        return this.oldModel;
    }

    /** @param oldModel the original model text */
    public void setOldModel(String oldModel) {
        this.oldModel = oldModel;
    }

    /** @return the brand name, or {@code null} if never set */
    public String getBrandName() {
        return this.brandName;
    }

    /** @param brandName the brand name */
    public void setBrandName(String brandName) {
        this.brandName = brandName;
    }

    /** @return the brand name in pinyin, or {@code null} if never set */
    public String getBrandPinyin() {
        return this.brandPinyin;
    }

    /** @param brandPinyin the brand name in pinyin */
    public void setBrandPinyin(String brandPinyin) {
        this.brandPinyin = brandPinyin;
    }

    /** @return the phone name, or {@code null} if never set */
    public String getMobileName() {
        return this.mobileName;
    }

    /** @param mobileName the phone name */
    public void setMobileName(String mobileName) {
        this.mobileName = mobileName;
    }

    /** @return the model, or {@code null} if never set */
    public String getModel() {
        return this.model;
    }

    /** @param model the model */
    public void setModel(String model) {
        this.model = model;
    }

    /** @return the price text, or {@code null} if never set */
    public String getPrice() {
        return this.price;
    }

    /** @param price the price text */
    public void setPrice(String price) {
        this.price = price;
    }

    /**
     * Renders all six attributes as
     * {@code MobileVO{oldModel='..', brandName='..', ...}}; unset attributes
     * appear as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("MobileVO{");
        text.append("oldModel='").append(this.oldModel).append('\'');
        text.append(", brandName='").append(this.brandName).append('\'');
        text.append(", brandPinyin='").append(this.brandPinyin).append('\'');
        text.append(", mobileName='").append(this.mobileName).append('\'');
        text.append(", model='").append(this.model).append('\'');
        text.append(", price='").append(this.price).append('\'');
        text.append('}');
        return text.toString();
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
View file @
1e486d20
...
...
@@ -73,7 +73,7 @@ public class News163 implements MultiPageModel {
public
static
void
main
(
String
[]
args
)
{
OOSpider
.
create
(
Site
.
me
(),
News163
.
class
).
addUrl
(
"http://news.163.com/13/0802/05/958I1E330001124J_2.html"
)
.
scheduler
(
new
RedisScheduler
(
"localhost"
)).
addPipeline
(
new
MultiPagePipeline
()).
addPipeline
(
new
ConsolePipeline
()).
run
();
.
addPipeline
(
new
MultiPagePipeline
()).
addPipeline
(
new
ConsolePipeline
()).
run
();
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AnjukeSpider.java
View file @
1e486d20
...
...
@@ -73,7 +73,6 @@ public class AnjukeSpider implements PageProcessor {
public
static
void
main
(
String
[]
args
)
throws
JMException
{
HttpClientDownloader
httpClientDownloader
=
new
HttpClientDownloader
();
// httpClientDownloader.setProxyProvider(new SimpleProxyProvider(ProxyUtils.getAllProxy()));
Spider
anjuke
=
Spider
.
create
(
new
AnjukeSpider
()).
setDownloader
(
httpClientDownloader
)
.
setScheduler
(
new
FileCacheQueueScheduler
(
"spider"
).
setDuplicateRemover
(
new
HashSetDuplicateRemover
()))
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
View file @
1e486d20
...
...
@@ -34,4 +34,13 @@ public class GithubRepo {
public
void
setReadme
(
String
readme
)
{
this
.
readme
=
readme
;
}
/**
 * Renders the repository as {@code GithubRepo{name='..', author='..', readme='..'}}.
 * Each value is wrapped in single quotes; a {@code null} value appears as the
 * text {@code null}.
 */
@Override
public String toString() {
    String namePart = "name='" + name + '\'';
    String authorPart = "author='" + author + '\'';
    String readmePart = "readme='" + readme + '\'';
    return "GithubRepo{" + String.join(", ", namePart, authorPart, readmePart) + '}';
}
}
\ No newline at end of file
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
View file @
1e486d20
...
...
@@ -27,6 +27,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
}
else
{
page
.
putField
(
"repo"
,
githubRepo
);
}
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
samples
;
import
com.google.common.collect.Lists
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
...
...
@@ -16,7 +18,6 @@ public class IteyeBlogProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
".*yanghaoli\\.iteye\\.com/blog/\\d+"
).
all
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
).
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
().
toString
());
}
@Override
...
...
@@ -28,6 +29,11 @@ public class IteyeBlogProcessor implements PageProcessor {
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
addUrl
(
"http://yanghaoli.iteye.com/"
).
run
();
final
ResultItemsCollectorPipeline
resultItemsCollectorPipeline
=
new
ResultItemsCollectorPipeline
();
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
addUrl
(
"http://yanghaoli.iteye.com/"
).
addPipeline
(
resultItemsCollectorPipeline
).
run
();
// Spider.create(new IteyeBlogProcessor()).thread(5).startUrls(Lists.newArrayList("http://yanghaoli.iteye.com/")).addPipeline(resultItemsCollectorPipeline).run();
resultItemsCollectorPipeline
.
getCollected
().
stream
().
forEach
(
resultItems
->
{
System
.
out
.
println
(
resultItems
);
});
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/Mobils.java
0 → 100644
View file @
1e486d20
This diff is collapsed.
Click to expand it.
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
View file @
1e486d20
...
...
@@ -30,6 +30,7 @@ public class SinaBlogProcessor implements PageProcessor {
//文章页
}
else
{
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
System
.
out
.
println
(
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//span[@class='time SG_txtc']"
).
regex
(
"\\((.*)\\)"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/WeiboTopSpider.java
View file @
1e486d20
...
...
@@ -5,7 +5,7 @@ import org.slf4j.LoggerFactory;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.
selenium.
SeleniumDownloader
;
import
us.codecraft.webmagic.downloader.SeleniumDownloader
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.Selectable
;
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZolBrandSpider.java
0 → 100644
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
samples
;
import
com.google.common.collect.Lists
;
import
com.google.common.collect.Sets
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.Selectable
;
import
javax.management.JMException
;
import
java.util.List
;
import
java.util.Set
;
import
java.util.concurrent.CopyOnWriteArrayList
;
/**
 * Sample crawler that requests the ZOL search page once per entry of
 * {@code Mobils.models} and prints, for each result page, the search keyword
 * taken from the URL and the brand element found on the page.
 */
public class ZolBrandSpider implements PageProcessor {

    // Was created for AnjukeSpider.class, which attributed this spider's log lines to another class.
    private static final Logger logger = LoggerFactory.getLogger(ZolBrandSpider.class);

    /** Search URL to which the model keyword is appended. */
    private static final String SEARCH_URL_PREFIX =
            "http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=";

    /** Marker that precedes the keyword inside a search URL. */
    private static final String KEYWORD_MARKER = "&keyword=";

    /** CSS selector of the brand link. */
    private static final String BRAND_SELECTOR = "#pro-intro > ul > li.cate > a:nth-child(2)";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(3000).setCycleRetryTimes(3)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
            .addHeader("Referer", "http://www.zol.com.cn/");

    /**
     * Prints the keyword of the requested search URL followed by the brand text
     * found on the page.
     *
     * @param page the downloaded search result page
     */
    @Override
    public void process(Page page) {
        String oldBrand = StringUtils.substringAfter(page.getUrl().get(), KEYWORD_MARKER);
        System.out.println(oldBrand);
        // The full page dump is debugging output; keep it out of stdout unless debug logging is on.
        logger.debug("page html for keyword {}: {}", oldBrand, page.getHtml().get());

        // Phone list area.
        Selectable listItems = page.getHtml().xpath("//*[@class=\"list-item\"]");

        // BRAND_SELECTOR is a CSS selector, so it has to go through $() rather than xpath().
        // NOTE(review): the lookup stays scoped to the list-item nodes as before - confirm that
        // #pro-intro really lives inside them; get() yields null (printed as "null") otherwise.
        String brand = listItems.$(BRAND_SELECTOR, "text").get();
        System.out.println(brand);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Builds one search URL per entry of {@code Mobils.models} and starts a
     * single-threaded spider over them.
     *
     * @param args unused
     * @throws JMException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws JMException {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

        List<String> urls = Lists.newArrayList();
        for (String model : Mobils.models) {
            urls.add(SEARCH_URL_PREFIX + model);
        }
        String[] urlArr = urls.toArray(new String[0]);

        Spider zol = Spider.create(new ZolBrandSpider())
                .setDownloader(httpClientDownloader)
                .addUrl(urlArr)
                .thread(1);
        zol.start();
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZolMobileSpider.java
0 → 100644
View file @
1e486d20
package
us
.
codecraft
.
webmagic
.
samples
;
import
com.google.common.collect.Lists
;
import
com.google.common.collect.Sets
;
import
net.sourceforge.pinyin4j.PinyinHelper
;
import
net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat
;
import
net.sourceforge.pinyin4j.format.HanyuPinyinToneType
;
import
net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.model.samples.AnjuKeVO
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.FileCacheQueueScheduler
;
import
us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.DBUtils
;
import
javax.management.JMException
;
import
java.util.List
;
import
java.util.Set
;
import
java.util.concurrent.CopyOnWriteArrayList
;
/**
 * Sample crawler for the ZOL mobile phone list page. For every list entry it
 * splits the displayed name into a known brand and the remaining model text,
 * converts the brand to pinyin and prints
 * {@code pinyin--brand--model--price} to standard output.
 */
public class ZolMobileSpider implements PageProcessor {

    // Was created for AnjukeSpider.class, which attributed this spider's log lines to another class.
    private static final Logger logger = LoggerFactory.getLogger(ZolMobileSpider.class);

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(3000).setCycleRetryTimes(3)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    /** Brand names searched for inside each phone name. */
    private final Set<String> mobileBrandNames = Sets.newHashSet(
            "vivo", "OPPO", "华为", "三星", "苹果", "荣耀", "金立", "魅族", "中兴", "Moto",
            "努比亚", "一加", "锤子科技", "360", "国美手机", "小米", "夏普", "华硕", "美图", "诺基亚",
            "HTC", "8848", "SUGAR", "黑莓", "海信", "AGM", "索尼移动", "酷派", "LG", "联想",
            "联想ZUK", "谷歌", "飞利浦", "朵唯", "大神", "酷比", "天语", "微软", "小辣椒", "TCL",
            "长虹", "康佳", "中国移动", "YotaPhone", "雷蛇", "MANN", "纽曼", "邦华", "海尔", "VEB",
            "惠普", "乐目", "格力", "云创通", "COMIO", "小格雷", "sonim", "神舟", "先锋", "BDV",
            "imoo", "innos", "蓝魔", "汇威", "柯达", "富可视", "Acer宏碁", "PPTV", "松下", "manta",
            "TP-LINK", "索野", "同洲", "达闼", "奇酷", "乐视", "明基", "UT斯达康", "大可乐", "ivvi",
            "青橙", "守护宝", "21克", "克里特", "保千里", "新石器", "GEMRY", "云狐", "阿尔卡特", "朗界",
            "卡布奇诺", "青葱", "彩石", "首云", "领虎", "传奇", "独影天幕", "米蓝", "青想", "华度",
            "超多维", "优豊", "百合", "铂爵", "易百年", "全普", "泛泰", "意龙", "阔密", "Ant one",
            "途为", "VAIO", "小宇宙", "图灵", "VANO", "美猴王", "垦鑫达", "读书郎", "IUNI", "波导",
            "红鸟", "BROR", "言信", "雅马亚虎", "卓普", "宝丽来", "nibiru", "美莱仕", "直角", "百事",
            "欧恩", "亿通", "Gigaset金阶");

    /**
     * Walks the phone list of the page and prints one
     * {@code pinyin--brand--model--price} line per entry whose name contains a
     * known brand. Entries without a name or without a known brand are logged
     * and skipped.
     *
     * @param page the downloaded list page
     */
    @Override
    public void process(Page page) {
        // Phone list area.
        Selectable mobileSelectable = page.getHtml().xpath("/html/body/div[5]/div[1]/div[4]");
        List<Selectable> mobils = mobileSelectable.xpath("//*[@id=\"J_PicMode\"]/li").nodes();

        for (Selectable selectable : mobils) {
            // The part in front of "(" is the display name; the xpath may yield null.
            String rawName = selectable.xpath("/li/h3/a/text()").get();
            String name = StringUtils.trimToNull(StringUtils.substringBefore(rawName, "("));
            if (name == null) {
                logger.warn("list entry without a phone name on {}", page.getRequest().getUrl());
                continue;
            }

            // Resolved per entry so that an unmatched entry never inherits the previous entry's brand.
            String brandName = findBrand(name);
            if (brandName == null) {
                logger.warn("no known brand found in phone name '{}'", name);
                continue;
            }

            String model = StringUtils.replace(name, brandName, "");
            String price = selectable.xpath("/li/div/span[2]/b[2]/text()").get();
            String pinyin = changeToPinYin(brandName);
            System.out.println(pinyin + "--" + brandName + "--" + model + "--" + price);
        }
        // TODO (2018/2/5): enqueue the follow-up list pages, e.g.
        // page.addTargetRequests(page.getHtml().$(".page-content").links().all());
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Picks the brand contained in {@code name}. When several brands match, the
     * one starting earliest in the name wins and, at the same position, the
     * longest one (so "联想ZUK" is preferred over "联想"). This keeps the result
     * independent of the hash set's iteration order.
     *
     * @param name non-null phone name
     * @return the matching brand, or {@code null} when no brand is contained
     */
    private String findBrand(String name) {
        String best = null;
        int bestIndex = Integer.MAX_VALUE;
        for (String brand : mobileBrandNames) {
            int index = name.indexOf(brand);
            if (index < 0) {
                continue;
            }
            boolean earlier = index < bestIndex;
            boolean longerAtSameSpot = index == bestIndex && brand.length() > best.length();
            if (earlier || longerAtSameSpot) {
                best = brand;
                bestIndex = index;
            }
        }
        return best;
    }

    /**
     * Converts a string to toneless pinyin, using the first reading of each
     * character. As soon as a character has no pinyin reading the original
     * string is returned unchanged.
     *
     * @param str text to convert, may be {@code null}
     * @return the pinyin text, the unchanged input, or {@code null} for {@code null} input
     */
    private String changeToPinYin(String str) {
        if (str == null) {
            return null;
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        StringBuilder fullPrint = new StringBuilder();
        for (char ch : str.toCharArray()) {
            try {
                String[] readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
                if (readings == null || readings.length == 0) {
                    // No pinyin reading for this character: give the input back untouched.
                    return str;
                }
                fullPrint.append(readings[0]);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                // The character is left out of the result, as before; the failure is now logged.
                logger.warn("cannot convert '{}' to pinyin", ch, e);
            }
        }
        return fullPrint.toString();
    }

    /**
     * Starts a two-threaded spider on the first page of the ZOL phone list.
     *
     * @param args unused
     * @throws JMException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws JMException {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        Spider zol = Spider.create(new ZolMobileSpider())
                .setDownloader(httpClientDownloader)
                .addUrl("http://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html")
                .thread(2);
        zol.start();
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/utils/DBUtils.java
View file @
1e486d20
...
...
@@ -6,6 +6,7 @@ import org.apache.commons.dbutils.QueryRunner;
import
org.apache.commons.dbutils.handlers.BeanListHandler
;
import
us.codecraft.webmagic.model.samples.AnjuKeVO
;
import
us.codecraft.webmagic.model.samples.CollegeVO
;
import
us.codecraft.webmagic.model.samples.MobileVO
;
import
java.sql.Connection
;
import
java.sql.SQLException
;
...
...
@@ -69,4 +70,31 @@ public class DBUtils {
}
}
/**
 * Inserts the six attributes of {@code mobileVO} as one row of the
 * {@code mobile} table, using a connection taken from {@code dataSource}
 * that is closed again afterwards.
 *
 * <p>An {@link SQLException} is not propagated; its stack trace is printed.
 *
 * @param mobileVO the entry to store
 */
public static void add(MobileVO mobileVO) {
    final String insertSql = "insert into mobile(old_model, brand_name, brand_pinyin, mobile_name,model, price) values(?,?,?,?,?,?)";
    final Object[] values = new Object[] {
            mobileVO.getOldModel(),
            mobileVO.getBrandName(),
            mobileVO.getBrandPinyin(),
            mobileVO.getMobileName(),
            mobileVO.getModel(),
            mobileVO.getPrice()
    };
    try (Connection conn = dataSource.getConnection()) {
        new QueryRunner().update(conn, insertSql, values);
    } catch (SQLException sqlError) {
        sqlError.printStackTrace();
    }
}
/**
 * Tells whether no row with the given {@code old_model} exists in the
 * {@code mobile} table yet.
 *
 * @param oldModel value compared against the {@code old_model} column
 * @return {@code false} when at least one matching row exists; {@code true}
 *         when none exists or when the lookup fails with an
 *         {@link SQLException} (whose stack trace is printed)
 */
public static boolean isSpiderMobile(String oldModel) {
    final String lookupSql = "select * from mobile where old_model = ?";
    QueryRunner runner = new QueryRunner(dataSource);
    List<MobileVO> matches;
    try {
        String[] lookupArgs = new String[] {oldModel};
        matches = runner.query(lookupSql, new BeanListHandler<>(MobileVO.class), lookupArgs);
    } catch (SQLException sqlError) {
        sqlError.printStackTrace();
        return true;
    }
    return !CollectionUtils.isNotEmpty(matches);
}
}
webmagic-selenium/README.md
deleted
100644 → 0
View file @
6f611a1f
webmagic-extension
-------
webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。
\ No newline at end of file
webmagic-selenium/config.ini
deleted
100644 → 0
View file @
6f611a1f
# What WebDriver to use for the tests
driver
=
phantomjs
#driver=firefox
#driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
phantomjs_exec_path
=
/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
phantomjs_driver_loglevel
=
DEBUG
\ No newline at end of file
webmagic-selenium/pom.xml
deleted
100644 → 0
View file @
6f611a1f
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<parent>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.7.6-SNAPSHOT
</version>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<artifactId>
webmagic-selenium
</artifactId>
<properties>
<webdrivermanager.version>
2.1.0
</webdrivermanager.version>
</properties>
<dependencies>
<dependency>
<groupId>
org.seleniumhq.selenium
</groupId>
<artifactId>
selenium-java
</artifactId>
<version>
3.8.1
</version>
</dependency>
<dependency>
<groupId>
us.codecraft.duiba
</groupId>
<artifactId>
webmagic-core
</artifactId>
<version>
${project.version}
</version>
</dependency>
<!--<dependency>-->
<!--<groupId>com.github.detro</groupId>-->
<!--<artifactId>phantomjsdriver</artifactId>-->
<!--<version>1.2.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>
io.github.bonigarcia
</groupId>
<artifactId>
webdrivermanager
</artifactId>
<version>
${webdrivermanager.version}
</version>
</dependency>
<dependency>
<groupId>
com.codeborne
</groupId>
<artifactId>
phantomjsdriver
</artifactId>
<version>
1.4.3
</version>
<exclusions>
<exclusion>
<artifactId>
selenium-remote-driver
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
<exclusion>
<artifactId>
selenium-api
</artifactId>
<groupId>
org.seleniumhq.selenium
</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
</dependencies>
</project>
webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java
deleted
100644 → 0
View file @
6f611a1f
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.openqa.selenium.By
;
import
org.openqa.selenium.WebDriver
;
import
org.openqa.selenium.WebElement
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.remote.DesiredCapabilities
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.Map
;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 下午12:27 <br>
*/
public
class
SeleniumTest
{
@Ignore
(
"need chrome driver"
)
@Test
public
void
testSelenium
()
{
System
.
getProperties
().
setProperty
(
"webdriver.chrome.driver"
,
"/Users/yihua/Downloads/chromedriver"
);
Map
<
String
,
Object
>
contentSettings
=
new
HashMap
<
String
,
Object
>();
contentSettings
.
put
(
"images"
,
2
);
Map
<
String
,
Object
>
preferences
=
new
HashMap
<
String
,
Object
>();
preferences
.
put
(
"profile.default_content_settings"
,
contentSettings
);
DesiredCapabilities
caps
=
DesiredCapabilities
.
chrome
();
caps
.
setCapability
(
"chrome.prefs"
,
preferences
);
caps
.
setCapability
(
"chrome.switches"
,
Arrays
.
asList
(
"--user-data-dir=/Users/yihua/temp/chrome"
));
WebDriver
webDriver
=
new
ChromeDriver
(
caps
);
webDriver
.
get
(
"http://huaban.com/"
);
WebElement
webElement
=
webDriver
.
findElement
(
By
.
xpath
(
"/html"
));
System
.
out
.
println
(
webElement
.
getAttribute
(
"outerHTML"
));
webDriver
.
close
();
}
}
webmagic-selenium/src/test/resources/config.ini
deleted
100644 → 0
View file @
6f611a1f
#driver=phantomjs
#driver=firefox
driver
=
chrome
#driver=http://localhost:8910
driver
=
http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
phantomjs_exec_path
=
/Users/detro/bin/phantomjs-upstream
phantomjs_driver_path
=
../../src/main.js
phantomjs_driver_loglevel
=
DEBUG
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment