Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
c7330460
Commit
c7330460
authored
Jul 19, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
+sina blog
parent
900172c9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
3 deletions
+11
-3
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+1
-0
SinaBlogProcesser.java
...java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+8
-3
DiaoyuwengProcessorTest.java
...codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
+2
-0
No files found.
webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
c7330460
...
@@ -1351,6 +1351,7 @@ public class XpathSelectorTest {
...
@@ -1351,6 +1351,7 @@ public class XpathSelectorTest {
public
void
testOschina
()
{
public
void
testOschina
()
{
Html
html1
=
new
Html
(
html
);
Html
html1
=
new
Html
(
html
);
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
System
.
out
.
println
(
html1
.
regex
(
"(<body>.*?</body>)"
).
links
().
toStrings
());
}
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
View file @
c7330460
package
us
.
codecraft
.
webmagic
.
samples
;
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
/**
...
@@ -15,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
...
@@ -15,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(http://blog\\.sina\\.com\\.cn/s/blog_.*)"
).
toStrings
());
page
.
addTargetRequests
(
page
.
getHtml
().
xpath
(
"//div[@class='articalfrontback SG_j_linedot1 clearfix']"
).
links
(
).
toStrings
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
regex
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
regex
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
...
@@ -26,9 +27,13 @@ public class SinaBlogProcesser implements PageProcessor {
...
@@ -26,9 +27,13 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
@Override
public
Site
getSite
()
{
public
Site
getSite
()
{
if
(
site
==
null
){
if
(
site
==
null
){
site
=
Site
.
me
().
setDomain
(
"blog.sina.com.cn"
).
addStartUrl
(
"http://blog.sina.com.cn/
flashsword20
"
).
setSleepTime
(
3000
).
site
=
Site
.
me
().
setDomain
(
"blog.sina.com.cn"
).
addStartUrl
(
"http://blog.sina.com.cn/
s/blog_4701280b0102egl0.html
"
).
setSleepTime
(
3000
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
return
site
;
return
site
;
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
SinaBlogProcesser
()).
run
();
}
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
View file @
c7330460
package
us
.
codecraft
.
webmagic
.
processor
;
package
us
.
codecraft
.
webmagic
.
processor
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
...
@@ -16,6 +17,7 @@ import java.io.IOException;
...
@@ -16,6 +17,7 @@ import java.io.IOException;
*/
*/
public
class
DiaoyuwengProcessorTest
{
public
class
DiaoyuwengProcessorTest
{
@Ignore
@Test
@Test
public
void
test
()
throws
IOException
{
public
void
test
()
throws
IOException
{
DiaoyuwengProcessor
diaoyuwengProcessor
=
new
DiaoyuwengProcessor
();
DiaoyuwengProcessor
diaoyuwengProcessor
=
new
DiaoyuwengProcessor
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment