Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
ecb61d13
Commit
ecb61d13
authored
Jun 09, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update pipeline
parent
755b9aa8
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
208 additions
and
17 deletions
+208
-17
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+11
-3
FilePipeline.java
...ain/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+4
-6
RegexSelector.java
...in/java/us/codecraft/webmagic/selector/RegexSelector.java
+3
-0
pom.xml
webmagic-plugin/pom.xml
+5
-0
FreemarkerPipeline.java
...va/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
+57
-0
wordpress.ftl
webmagic-plugin/src/main/resources/ftl/wordpress.ftl
+23
-0
FreemarkerPipelineTest.java
...st/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
+19
-0
wordpress.ftl
webmagic-plugin/src/test/resources/ftl/wordpress.ftl
+23
-0
pom.xml
webmagic-samples/pom.xml
+5
-0
DiaoyuwengProcessor.java
...va/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+10
-8
DiaoyuwengProcessorTest.java
...codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
+26
-0
wordpress.ftl
webmagic-samples/src/test/resources/ftl/wordpress.ftl
+22
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
ecb61d13
...
...
@@ -10,6 +10,9 @@ import us.codecraft.webmagic.processor.PageProcessor;
import
us.codecraft.webmagic.schedular.QueueSchedular
;
import
us.codecraft.webmagic.schedular.Schedular
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
* User: cairne
* Date: 13-4-21
...
...
@@ -19,7 +22,7 @@ public class Spider implements Runnable {
private
Downloader
downloader
=
new
HttpClientDownloader
();
private
Pipeline
pipeline
=
new
ConsolePipeline
();
private
List
<
Pipeline
>
pipelines
=
new
ArrayList
<
Pipeline
>
();
private
PageProcessor
pageProcessor
;
...
...
@@ -47,7 +50,7 @@ public class Spider implements Runnable {
}
public
Spider
pipeline
(
Pipeline
pipeline
)
{
this
.
pipeline
=
pipeline
;
this
.
pipeline
s
.
add
(
pipeline
)
;
return
this
;
}
...
...
@@ -56,6 +59,9 @@ public class Spider implements Runnable {
public
void
run
()
{
Site
site
=
pageProcessor
.
getSite
();
Request
request
=
schedular
.
poll
(
site
);
if
(
pipelines
.
isEmpty
()){
pipelines
.
add
(
new
ConsolePipeline
());
}
while
(
request
!=
null
)
{
Page
page
=
downloader
.
download
(
request
,
site
);
if
(
page
==
null
)
{
...
...
@@ -64,7 +70,9 @@ public class Spider implements Runnable {
}
pageProcessor
.
process
(
page
);
addRequest
(
page
);
for
(
Pipeline
pipeline
:
pipelines
)
{
pipeline
.
process
(
page
,
site
);
}
sleep
(
site
.
getSleepTime
());
request
=
schedular
.
poll
(
site
);
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
View file @
ecb61d13
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.io.File
;
import
java.io.FileWriter
;
...
...
@@ -21,7 +21,7 @@ public class FilePipeline implements Pipeline {
private
String
path
=
"/data/temp/webmagic/"
;
public
FilePipeline
(){
public
FilePipeline
()
{
}
...
...
@@ -30,11 +30,9 @@ public class FilePipeline implements Pipeline {
}
@Override
public
void
process
(
Page
page
,
Site
site
)
{
public
void
process
(
Page
page
,
Site
site
)
{
String
domain
=
site
.
getDomain
();
domain
=
StringUtils
.
removeStart
(
domain
,
"http://"
);
domain
=
StringUtils
.
removeStart
(
domain
,
"https://"
);
domain
=
StringUtils
.
replace
(
domain
,
"/"
,
""
);
domain
=
UrlUtils
.
getDomain
(
domain
);
String
path
=
this
.
path
+
""
+
domain
+
"/"
;
File
file
=
new
File
(
path
);
if
(!
file
.
exists
())
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
View file @
ecb61d13
...
...
@@ -23,6 +23,9 @@ public class RegexSelector implements Selector {
if
(
StringUtils
.
isBlank
(
regexStr
)){
throw
new
IllegalArgumentException
(
"regex must not be empty"
);
}
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)&&!
StringUtils
.
contains
(
regexStr
,
")"
)){
regexStr
=
"("
+
regexStr
+
")"
;
}
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)||!
StringUtils
.
contains
(
regexStr
,
")"
)){
throw
new
IllegalArgumentException
(
"regex must have capture group 1"
);
}
...
...
webmagic-plugin/pom.xml
View file @
ecb61d13
...
...
@@ -20,6 +20,11 @@
<version>
4.7
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.freemarker
</groupId>
<artifactId>
freemarker
</artifactId>
<version>
2.3.19
</version>
</dependency>
</dependencies>
<build>
...
...
webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
0 → 100644
View file @
ecb61d13
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
freemarker.template.Configuration
;
import
freemarker.template.Template
;
import
freemarker.template.TemplateException
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.io.*
;
/**
 * Pipeline that renders the fields extracted from each page through a FreeMarker template and
 * writes the result to {@code <path><domain>/<md5 of the page url>.html}.
 *
 * <p>Templates are looked up in the {@code ftl/} directory on the classpath.
 *
 * User: cairne
 * Date: 13-6-8
 * Time: 9:00 PM
 */
public class FreemarkerPipeline implements Pipeline {

    /** Output root used when the caller does not supply one. */
    private static final String DEFAULT_PATH = "/data/temp/webmagic/ftl/";

    private Configuration configuration;

    private Template template;

    private String path = DEFAULT_PATH;

    /**
     * Creates a pipeline that renders with the given template into the given output root.
     *
     * @param template file name of the template inside the classpath directory {@code ftl/}
     * @param path     output root; it is concatenated directly with the domain, so it is
     *                 expected to end with a path separator
     * @throws IOException if the {@code ftl/} directory is not on the classpath or the template
     *                     cannot be loaded
     */
    public FreemarkerPipeline(String template, String path) throws IOException {
        // Fail with a descriptive IOException instead of a bare NullPointerException when the
        // template directory is missing from the classpath.
        java.net.URL templateDirectory = this.getClass().getClassLoader().getResource("ftl/");
        if (templateDirectory == null) {
            throw new FileNotFoundException("template directory 'ftl/' not found on the classpath");
        }
        configuration = new Configuration();
        configuration.setDirectoryForTemplateLoading(new File(templateDirectory.getFile()));
        this.template = configuration.getTemplate(template);
        this.path = path;
        File file = new File(path);
        if (!file.exists()) {
            // mkdirs, not mkdir: the default root is several levels deep and its parents may
            // not exist yet.
            file.mkdirs();
        }
    }

    /**
     * Creates a pipeline that renders with the given template into the default output root.
     *
     * @param template file name of the template inside the classpath directory {@code ftl/}
     * @throws IOException if the {@code ftl/} directory is not on the classpath or the template
     *                     cannot be loaded
     */
    public FreemarkerPipeline(String template) throws IOException {
        this(template, DEFAULT_PATH);
    }

    /**
     * Renders the page's fields with the configured template and stores the output in a file
     * named after the MD5 hex digest of the page URL, inside a per-domain directory.
     *
     * <p>Template and I/O failures are reported on standard error and do not propagate, so a
     * single bad page does not abort the caller.
     *
     * @param page page whose fields form the template's data model
     * @param site site whose domain selects the output sub-directory
     */
    @Override
    public void process(Page page, Site site) {
        String domain = site.getDomain();
        domain = UrlUtils.getDomain(domain);
        String path = this.path + "" + domain + "/";
        // The per-domain directory has to exist before FileWriter can create a file in it.
        File directory = new File(path);
        if (!directory.exists()) {
            directory.mkdirs();
        }
        PrintWriter printWriter = null;
        try {
            printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
            template.process(page.getFields(), printWriter);
        } catch (TemplateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed render does not leak the file handle.
            if (printWriter != null) {
                printWriter.close();
            }
        }
    }
}
webmagic-plugin/src/main/resources/ftl/wordpress.ftl
0 → 100644
View file @
ecb61d13
<#-- WordPress export item. The data model must provide title, id, date and text; tags is optional. -->
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<description></description>
<content:encoded><![CDATA[${text}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<#-- Renders nothing when the model has no "tags" entry. -->
${tags!}
</item>
\ No newline at end of file
webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
0 → 100644
View file @
ecb61d13
package
us
.
codecraft
.
webmagic
;
import
org.junit.Test
;
import
us.codecraft.webmagic.pipeline.FreemarkerPipeline
;
import
java.io.IOException
;
/**
 * Smoke test for {@link FreemarkerPipeline}: constructing it for "wordpress.ftl" must not throw.
 *
 * User: cairne
 * Date: 13-6-9
 * Time: 7:14 AM
 */
public class FreemarkerPipelineTest {

    @Test
    public void test() throws IOException {
        // Only construction is exercised; the resulting instance is not used any further.
        new FreemarkerPipeline("wordpress.ftl");
    }
}
webmagic-plugin/src/test/resources/ftl/wordpress.ftl
0 → 100644
View file @
ecb61d13
<#-- WordPress export item. The data model must provide title, id, date and text; tags is optional. -->
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<description></description>
<content:encoded><![CDATA[${text}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<#-- Renders nothing when the model has no "tags" entry. -->
${tags!}
</item>
\ No newline at end of file
webmagic-samples/pom.xml
View file @
ecb61d13
...
...
@@ -15,6 +15,11 @@
<artifactId>
webmagic-core
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-plugin
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
View file @
ecb61d13
...
...
@@ -3,6 +3,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.PlainText
;
import
java.util.List
;
...
...
@@ -14,20 +15,21 @@ import java.util.List;
public
class
DiaoyuwengProcessor
implements
PageProcessor
{
@Override
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
rs
(
"<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
as
().
rs
(
"(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
requests
=
page
.
getHtml
().
rs
(
"<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}
"
).
toStrings
();
requests
=
page
.
getHtml
().
as
().
rs
(
"(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)
"
).
toStrings
();
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"shop"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//h1[@class='shop-title']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
sc
());
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
x
(
"//div[@class='pcb']//tbody"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
r
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
r
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
}
}
@Override
public
Site
getSite
()
{
return
Site
.
me
().
setDomain
(
"www.dia
nping.com"
).
setStartUrl
(
"http://www.dianping.com/
"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
return
Site
.
me
().
setDomain
(
"www.dia
oyuweng.com"
).
setStartUrl
(
"http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space
"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
)
.
setEncoding
(
"GBK"
).
setSleepTime
(
500
)
;
}
}
webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
0 → 100644
View file @
ecb61d13
package
us
.
codecraft
.
webmagic
.
processor
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.pipeline.FreemarkerPipeline
;
import
us.codecraft.webmagic.samples.DiaoyuwengProcessor
;
import
us.codecraft.webmagic.schedular.FileCacheQueueSchedular
;
import
java.io.IOException
;
/**
 * Runs a spider with {@link DiaoyuwengProcessor}, registering both a {@link FilePipeline} and a
 * {@link FreemarkerPipeline} and using a file-cache-backed schedular.
 *
 * User: cairne
 * Date: 13-6-9
 * Time: 8:02 AM
 */
public class DiaoyuwengProcessorTest {

    @Test
    public void test() throws IOException {
        DiaoyuwengProcessor processor = new DiaoyuwengProcessor();
        FreemarkerPipeline templatePipeline = new FreemarkerPipeline("wordpress.ftl");

        // Register the pipelines first, in the same order as before: file output, then template.
        Spider spider = Spider.me().pipeline(new FilePipeline()).pipeline(templatePipeline);

        spider.schedular(new FileCacheQueueSchedular(processor.getSite(), "/data/temp/webmagic/cache/"))
                .processor(processor)
                .run();
    }
}
webmagic-samples/src/test/resources/ftl/wordpress.ftl
0 → 100644
View file @
ecb61d13
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<description></description>
<content:encoded><![CDATA[${content}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<#--<wp:post_id>$it.Id</wp:post_id>-->
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
</item>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment