Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
986ae0be
Commit
986ae0be
authored
Jun 19, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update Select api: remove x() s() etc.
parent
586d23ef
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
75 additions
and
127 deletions
+75
-127
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+1
-1
SimplePageProcessor.java
.../us/codecraft/webmagic/processor/SimplePageProcessor.java
+3
-3
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+3
-15
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+5
-21
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+5
-29
HtmlTest.java
...ic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+1
-1
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+1
-1
DiandianBlogProcessor.java
.../us/codecraft/webmagic/samples/DiandianBlogProcessor.java
+8
-8
DianpingProcessor.java
...java/us/codecraft/webmagic/samples/DianpingProcessor.java
+4
-4
DiaoyuwengProcessor.java
...va/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+6
-6
F58PageProcesser.java
.../java/us/codecraft/webmagic/samples/F58PageProcesser.java
+3
-3
HuxiuProcessor.java
...in/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+3
-3
KaichibaProcessor.java
...java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+3
-3
MeicanProcessor.java
...n/java/us/codecraft/webmagic/samples/MeicanProcessor.java
+4
-4
NjuBBSProcessor.java
...n/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
+3
-3
OschinaBlogPageProcesser.java
.../codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+4
-4
OschinaPageProcesser.java
...a/us/codecraft/webmagic/samples/OschinaPageProcesser.java
+3
-3
QzoneBlogProcessor.java
...ava/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
+3
-3
SinaBlogProcesser.java
...java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+6
-6
TianyaPageProcesser.java
...va/us/codecraft/webmagic/samples/TianyaPageProcesser.java
+3
-3
SpiderTest.java
...mples/src/test/java/us/codecraft/webmagic/SpiderTest.java
+3
-3
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
986ae0be
...
...
@@ -33,7 +33,7 @@ public class HttpClientDownloader implements Downloader {
if
(
site
.
getAcceptStatCode
().
contains
(
statusCode
))
{
if
(
site
.
getEncoding
()
==
null
){
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
site
.
setEncoding
(
new
PlainText
(
value
).
r
(
"charset=([^\\s]+)"
).
toString
());
site
.
setEncoding
(
new
PlainText
(
value
).
r
egex
(
"charset=([^\\s]+)"
).
toString
());
}
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
(),
site
.
getEncoding
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
View file @
986ae0be
...
...
@@ -28,13 +28,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
as
().
rs
(
urlPattern
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
urlPattern
).
toStrings
();
//调用page.addTargetRequests()方法添加待抓取链接
page
.
addTargetRequests
(
requests
);
//xpath方式抽取
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//title"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//title"
));
//sc表示使用Readability技术抽取正文
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
}
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
986ae0be
...
...
@@ -18,12 +18,6 @@ public class Html extends PlainText {
super
(
text
);
}
@Override
public
Selectable
x
(
String
xpath
)
{
XpathSelector
xpathSelector
=
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
xpath
);
return
select
(
xpathSelector
,
strings
);
}
@Override
protected
Selectable
select
(
Selector
selector
,
List
<
String
>
strings
)
{
List
<
String
>
results
=
new
ArrayList
<
String
>();
...
...
@@ -47,25 +41,19 @@ public class Html extends PlainText {
}
@Override
public
Selectable
s
c
()
{
public
Selectable
s
martContent
()
{
SmartContentSelector
smartContentSelector
=
SelectorFactory
.
getInstatnce
().
newSmartContentSelector
();
return
select
(
smartContentSelector
,
strings
);
}
@Override
public
Selectable
a
()
{
XpathSelector
xpathSelector
=
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
"//a/@href"
);
return
select
(
xpathSelector
,
strings
);
}
@Override
public
Selectable
as
()
{
public
Selectable
links
()
{
XpathSelector
xpathSelector
=
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
}
@Override
public
Selectable
x
s
(
String
xpath
)
{
public
Selectable
x
path
(
String
xpath
)
{
XpathSelector
xpathSelector
=
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
986ae0be
...
...
@@ -25,38 +25,22 @@ public class PlainText implements Selectable {
}
@Override
public
Selectable
x
(
String
xpath
)
{
public
Selectable
x
path
(
String
xpath
)
{
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
xs
(
String
xpath
)
{
public
Selectable
smartContent
(
)
{
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
sc
()
{
public
Selectable
links
()
{
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
a
()
{
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
as
()
{
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
r
(
String
regex
)
{
RegexSelector
regexSelector
=
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
regex
);
return
select
(
regexSelector
,
strings
);
}
@Override
public
Selectable
rs
(
String
regex
)
{
public
Selectable
regex
(
String
regex
)
{
RegexSelector
regexSelector
=
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
regex
);
return
selectList
(
regexSelector
,
strings
);
}
...
...
@@ -82,7 +66,7 @@ public class PlainText implements Selectable {
}
@Override
public
Selectable
r
p
(
String
regex
,
String
replacement
)
{
public
Selectable
r
eplace
(
String
regex
,
String
replacement
)
{
ReplaceSelector
replaceSelector
=
SelectorFactory
.
getInstatnce
().
newReplaceSelector
(
regex
,
replacement
);
return
select
(
replaceSelector
,
strings
);
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
986ae0be
...
...
@@ -9,51 +9,27 @@ import java.util.List;
*/
public
interface
Selectable
{
/**
* select with xpath
*
* @param xpath
* @return new Selectable after extract
*/
public
Selectable
x
(
String
xpath
);
/**
* select list with xpath
*
* @param xpath
* @return new Selectable after extract
*/
public
Selectable
x
s
(
String
xpath
);
public
Selectable
x
path
(
String
xpath
);
/**
* select smart content with ReadAbility algorithm
*
* @return content
*/
public
Selectable
sc
();
/**
* select a link
*
* @return first link
*/
public
Selectable
a
();
public
Selectable
smartContent
();
/**
* select all links
*
* @return all links
*/
public
Selectable
as
();
/**
* select with regex
*
* @param regex
* @return new Selectable after extract
*/
public
Selectable
r
(
String
regex
);
public
Selectable
links
();
/**
* select list with regex
...
...
@@ -61,7 +37,7 @@ public interface Selectable {
* @param regex
* @return new Selectable after extract
*/
public
Selectable
r
s
(
String
regex
);
public
Selectable
r
egex
(
String
regex
);
/**
* replace with regex
...
...
@@ -70,7 +46,7 @@ public interface Selectable {
* @param replacement
* @return new Selectable after extract
*/
public
Selectable
r
p
(
String
regex
,
String
replacement
);
public
Selectable
r
eplace
(
String
regex
,
String
replacement
);
/**
* single string result
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
View file @
986ae0be
...
...
@@ -14,7 +14,7 @@ public class HtmlTest {
@Test
public
void
testRegexSelector
()
{
Html
selectable
=
new
Html
(
"aaaaaaab"
);
Assert
.
assertEquals
(
"abbabbab"
,
(
selectable
.
r
(
"(.*)"
).
rp
(
"aa(a)"
,
"$1bb"
).
toString
()));
Assert
.
assertEquals
(
"abbabbab"
,
(
selectable
.
r
egex
(
"(.*)"
).
replace
(
"aa(a)"
,
"$1bb"
).
toString
()));
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
986ae0be
...
...
@@ -1350,7 +1350,7 @@ public class XpathSelectorTest {
@Test
public
void
testOschina
()
{
Html
html1
=
new
Html
(
html
);
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
x
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
x
path
(
".//*[@class='QTitle']/h1/a"
).
toString
());
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
View file @
986ae0be
...
...
@@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
//a()表示提取链接,
a
s()表示提取所有链接
//a()表示提取链接,
link
s()表示提取所有链接
//getHtml()返回Html对象,支持链式调用
//r()表示用正则表达式提取一条内容,r
s
()表示提取多条内容
//r()表示用正则表达式提取一条内容,r
egex
()表示提取多条内容
//toString()表示取单条结果,toStrings()表示取多条
List
<
String
>
requests
=
page
.
getHtml
().
as
().
rs
(
"(.*/post/.*)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(.*/post/.*)"
).
toStrings
();
//使用page.addTargetRequests()方法将待抓取的链接加入队列
page
.
addTargetRequests
(
requests
);
//page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//title"
).
r
(
"(.*?)\\|"
));
//s
c
()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"date"
,
page
.
getUrl
().
r
(
"post/(\\d+-\\d+-\\d+)/"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
r
(
"post/\\d+-\\d+-\\d+/(\\d+)"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//title"
).
regex
(
"(.*?)\\|"
));
//s
martContent
()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
page
.
putField
(
"date"
,
page
.
getUrl
().
r
egex
(
"post/(\\d+-\\d+-\\d+)/"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
r
egex
(
"post/\\d+-\\d+-\\d+/(\\d+)"
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
View file @
986ae0be
...
...
@@ -15,13 +15,13 @@ import java.util.List;
public
class
DianpingProcessor
implements
PageProcessor
{
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
as
().
rs
(
".*shop.*"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
".*shop.*"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
requests
=
page
.
getHtml
().
r
s
(
".*search/category/.*"
).
toStrings
();
requests
=
page
.
getHtml
().
r
egex
(
".*search/category/.*"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"shop"
))
{
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//h1[@class='shop-title']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//h1[@class='shop-title']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
View file @
986ae0be
...
...
@@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
as
().
rs
(
"(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
requests
=
page
.
getHtml
().
as
().
rs
(
"(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)"
).
toStrings
();
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
(
"//div[@class='pcb']//tbody"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
r
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
r
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
path
(
"//div[@class='pcb']//tbody"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
r
egex
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
r
egex
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
View file @
986ae0be
...
...
@@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
r
s
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
r
egex
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}"
).
toStrings
();
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
r
(
"<title>(.*)</title>"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
x
(
"//dd[@class='w133']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
r
egex
(
"<title>(.*)</title>"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
x
path
(
"//dd[@class='w133']"
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
View file @
986ae0be
...
...
@@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
r
s
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
r
egex
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@class='neirong']//h1[@class='ph xs5']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@class='neirong']//h1[@class='ph xs5']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
View file @
986ae0be
...
...
@@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int
i
=
Integer
.
valueOf
(
page
.
getUrl
().
r
(
"shop/(\\d+)"
).
toString
())
+
1
;
int
i
=
Integer
.
valueOf
(
page
.
getUrl
().
r
egex
(
"shop/(\\d+)"
).
toString
())
+
1
;
page
.
addTargetRequest
(
"http://kaichiba.com/shop/"
+
i
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//Title"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
x
s
(
"//li[@class=\"foodTitle\"]"
).
rp
(
"^\\s+"
,
""
).
rp
(
"\\s+$"
,
""
).
rp
(
"<span>.*?</span>"
,
""
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//Title"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
x
path
(
"//li[@class=\"foodTitle\"]"
).
replace
(
"^\\s+"
,
""
).
replace
(
"\\s+$"
,
""
).
replace
(
"<span>.*?</span>"
,
""
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
View file @
986ae0be
...
...
@@ -15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
x
s
(
"//a[@class=\"area_link flat_btn\"]/@href"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
x
path
(
"//a[@class=\"area_link flat_btn\"]/@href"
).
toStrings
();
if
(
requests
.
size
()
>
2
)
{
requests
=
requests
.
subList
(
0
,
2
);
}
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
page
.
getHtml
().
as
().
rs
(
"(.*/restaurant/[^#]+)"
).
toStrings
());
page
.
putField
(
"items"
,
page
.
getHtml
().
x
s
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
x
s
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
toStrings
());
page
.
putField
(
"items"
,
page
.
getHtml
().
x
path
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
x
path
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
View file @
986ae0be
...
...
@@ -14,10 +14,10 @@ import java.util.List;
public
class
NjuBBSProcessor
implements
PageProcessor
{
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
r
s
(
"<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
r
egex
(
"<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
View file @
986ae0be
...
...
@@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
as
().
r
(
"(http://my\\.oschina\\.net)"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
links
().
regex
(
"(http://my\\.oschina\\.net)"
).
toStrings
();
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
s
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"author"
,
page
.
getUrl
().
r
(
"my\\.oschina\\.net/(\\w+)/blog/\\d+"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
page
.
putField
(
"author"
,
page
.
getUrl
().
r
egex
(
"my\\.oschina\\.net/(\\w+)/blog/\\d+"
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
View file @
986ae0be
...
...
@@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
r
s
(
"<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
r
egex
(
"<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}"
).
toStrings
();
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@class='QTitle']/h1/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
s
(
"//div[@class='Question']//div[@class='Content']/div[@class='detail']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@class='QTitle']/h1/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
path
(
"//div[@class='Question']//div[@class='Content']/div[@class='detail']"
));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
View file @
986ae0be
...
...
@@ -18,10 +18,10 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List
<
String
>
requests
=
page
.
getHtml
().
r
s
(
"<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
r
egex
(
"<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
s
martContent
());
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
View file @
986ae0be
...
...
@@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
page
.
addTargetRequests
(
page
.
getHtml
().
as
().
rs
(
"(http://blog\\.sina\\.com\\.cn/s/blog_.*)"
).
toStrings
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
r
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
x
(
"//div[@id='articlebody']//span[@class='time SG_txtc']"
).
r
(
"\\((.*)\\)"
));
// page.putField("tags",page.getHtml().x
s
("//td[@class='blog_tag']/h3/a"));
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(http://blog\\.sina\\.com\\.cn/s/blog_.*)"
).
toStrings
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
path
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
r
egex
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
x
path
(
"//div[@id='articlebody']//span[@class='time SG_txtc']"
).
regex
(
"\\((.*)\\)"
));
// page.putField("tags",page.getHtml().x
path
("//td[@class='blog_tag']/h3/a"));
}
@Override
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
View file @
986ae0be
...
...
@@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
r
s
(
"<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
r
egex
(
"<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}"
).
toStrings
();
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//div[@id='post_head']//span[@class='s_title']//b"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
s
c
());
page
.
putField
(
"title"
,
page
.
getHtml
().
x
path
(
"//div[@id='post_head']//span[@class='s_title']//b"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
s
martContent
());
}
@Override
...
...
webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
View file @
986ae0be
...
...
@@ -51,7 +51,7 @@ public class SpiderTest {
/**
*
* _hrefs = r
s
("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
* _hrefs = r
egex
("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
* title = r(""<title>(.*)</title>"")
* body = x("//dd[@class='w133']")
*
...
...
@@ -72,7 +72,7 @@ public class SpiderTest {
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
*
* body=body[r(_currentUrl).g(1)]
* tags[%] = (tags[%] + x
s
('')) . r('')
* tags[%] = (tags[%] + x
path
('')) . r('')
*
* _targetUrls.add('' + x('').r(''))
* _sourceUrls.add()
...
...
@@ -114,7 +114,7 @@ public class SpiderTest {
* content = t(_html) > c()
* title = x(_html, 'asd@asd') > r('',1)
* body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
* tags[%] = tags + x
s
('') > r('')
* tags[%] = tags + x
path
('') > r('')
* model.setTargetUrl();
*
* _targetUrl = '' + x('') & r('')
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment