Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
65dc3721
Commit
65dc3721
authored
Jul 25, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update pipeline api
parent
55d80129
Changes
26
Show whitespace changes
Inline
Side-by-side
Showing
26 changed files
with
119 additions
and
95 deletions
+119
-95
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+6
-49
ResultItems.java
...core/src/main/java/us/codecraft/webmagic/ResultItems.java
+64
-0
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+1
-1
ConsolePipeline.java
.../java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+5
-9
FilePipeline.java
...ain/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+8
-5
Pipeline.java
...rc/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
+2
-2
SimplePageProcessor.java
.../us/codecraft/webmagic/processor/SimplePageProcessor.java
+2
-1
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+3
-3
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+1
-1
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+1
-1
FreemarkerPipeline.java
...va/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
+5
-5
RedisScheduler.java
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+2
-0
RedisSchedulerTest.java
...a/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
+2
-1
DiandianBlogProcessor.java
.../us/codecraft/webmagic/samples/DiandianBlogProcessor.java
+3
-3
DianpingProcessor.java
...java/us/codecraft/webmagic/samples/DianpingProcessor.java
+1
-1
DiaoyuwengProcessor.java
...va/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+2
-2
F58PageProcesser.java
.../java/us/codecraft/webmagic/samples/F58PageProcesser.java
+1
-1
GlobalProcessor.java
...n/java/us/codecraft/webmagic/samples/GlobalProcessor.java
+1
-1
HuxiuProcessor.java
...in/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+1
-1
MeicanProcessor.java
...n/java/us/codecraft/webmagic/samples/MeicanProcessor.java
+2
-2
NjuBBSProcessor.java
...n/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
+1
-1
OschinaBlogPageProcesser.java
.../codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+1
-1
OschinaPageProcesser.java
...a/us/codecraft/webmagic/samples/OschinaPageProcesser.java
+1
-1
QzoneBlogProcessor.java
...ava/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
+1
-1
SinaBlogProcesser.java
...java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+1
-1
TianyaPageProcesser.java
...va/us/codecraft/webmagic/samples/TianyaPageProcesser.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
65dc3721
...
@@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
...
@@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
/**
* <pre>
* <pre>
...
@@ -27,7 +25,7 @@ public class Page {
...
@@ -27,7 +25,7 @@ public class Page {
private
Request
request
;
private
Request
request
;
private
Map
<
String
,
Selectable
>
fields
=
new
ConcurrentHashMap
<
String
,
Selectable
>
();
private
ResultItems
resultItems
=
new
ResultItems
();
private
Selectable
html
;
private
Selectable
html
;
...
@@ -35,44 +33,16 @@ public class Page {
...
@@ -35,44 +33,16 @@ public class Page {
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
private
boolean
skip
;
private
Object
extra
;
/**
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
* @return 是否忽略 true 忽略
*/
public
boolean
isSkip
()
{
return
skip
;
}
/**
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
* @param skip 是否忽略 true 忽略
*/
public
void
setSkip
(
boolean
skip
)
{
this
.
skip
=
skip
;
}
public
Page
()
{
public
Page
()
{
}
}
/**
* 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
* @return fields 抽取的结果
*/
public
Map
<
String
,
Selectable
>
getFields
()
{
return
fields
;
}
/**
/**
* 保存抽取的结果
* 保存抽取的结果
* @param key 结果的key
* @param key 结果的key
* @param field 结果的value
* @param field 结果的value
*/
*/
public
void
putField
(
String
key
,
Selectable
field
)
{
public
void
putField
(
String
key
,
Object
field
)
{
field
s
.
put
(
key
,
field
);
resultItem
s
.
put
(
key
,
field
);
}
}
/**
/**
...
@@ -157,23 +127,10 @@ public class Page {
...
@@ -157,23 +127,10 @@ public class Page {
public
void
setRequest
(
Request
request
)
{
public
void
setRequest
(
Request
request
)
{
this
.
request
=
request
;
this
.
request
=
request
;
this
.
resultItems
.
setRequest
(
request
);
}
}
/**
public
ResultItems
getResultItems
()
{
* 获取附加对象
return
resultItems
;
* @param <T> 对象类型
* @return 对象内容
*/
public
<
T
>
T
getExtra
()
{
return
(
T
)
extra
;
}
/**
* 设置附加对象
* @param extra 对象内容
* @param <T> 对象类型
*/
public
<
T
>
void
setExtra
(
T
extra
)
{
this
.
extra
=
extra
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
0 → 100644
View file @
65dc3721
package
us
.
codecraft
.
webmagic
;
import
java.util.HashMap
;
import
java.util.Map
;
/**
 * Holds the results extracted from a page. It is produced by a PageProcessor and handed to
 * {@link us.codecraft.webmagic.pipeline.Pipeline} for persistence.<br>
 * Date: 13-7-25 <br>
 * Time: 12:20 PM <br>
 *
 * @author yihua.huang@dianping.com
 */
public class ResultItems {

    /** Extracted values keyed by field name. */
    private final Map<String, Object> fields = new HashMap<String, Object>();

    /** The request these results belong to. */
    private Request request;

    /** Whether pipelines should ignore this result. */
    private boolean skip;

    /**
     * Returns the value stored under {@code key}, cast to the type the caller expects.
     * The cast is unchecked, so a mismatch between the stored value and {@code T} only
     * surfaces as a {@link ClassCastException} where the caller uses the result.
     *
     * @param key name of the field
     * @param <T> type the caller expects the value to have
     * @return the stored value, or {@code null} when the map holds no value (or a null value)
     *         for {@code key}
     */
    @SuppressWarnings("unchecked")
    public <T> T get(String key) {
        // One lookup is enough: casting null is legal and simply yields null.
        return (T) fields.get(key);
    }

    /**
     * Returns every extracted field.
     *
     * @return the backing map itself (not a copy), keyed by field name
     */
    public Map<String, Object> getAll() {
        return fields;
    }

    /**
     * Stores an extracted value, replacing any value previously stored under the same key.
     *
     * @param key   name of the field
     * @param value extracted value
     * @param <T>   type of the value
     * @return this, so calls can be chained
     */
    public <T> ResultItems put(String key, T value) {
        fields.put(key, value);
        return this;
    }

    /**
     * Returns the request associated with these results.
     *
     * @return the request, or {@code null} if none has been set
     */
    public Request getRequest() {
        return request;
    }

    /**
     * Associates a request with these results.
     *
     * @param request the request to associate
     * @return this, so calls can be chained
     */
    public ResultItems setRequest(Request request) {
        this.request = request;
        return this;
    }

    /**
     * Tells whether this page should be skipped; pipelines use it to decide whether to
     * process the page.
     *
     * @return {@code true} if the page should be skipped
     */
    public boolean isSkip() {
        return skip;
    }

    /**
     * Sets whether this page should be skipped; pipelines use it to decide whether to
     * process the page.
     *
     * @param skip {@code true} to skip the page
     * @return this, so calls can be chained
     */
    public ResultItems setSkip(boolean skip) {
        this.skip = skip;
        return this;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
65dc3721
...
@@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
...
@@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
pageProcessor
.
process
(
page
);
pageProcessor
.
process
(
page
);
addRequest
(
page
);
addRequest
(
page
);
for
(
Pipeline
pipeline
:
pipelines
)
{
for
(
Pipeline
pipeline
:
pipelines
)
{
pipeline
.
process
(
page
,
this
);
pipeline
.
process
(
page
.
getResultItems
()
,
this
);
}
}
sleep
(
site
.
getSleepTime
());
sleep
(
site
.
getSleepTime
());
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
View file @
65dc3721
package
us
.
codecraft
.
webmagic
.
pipeline
;
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
us.codecraft.webmagic.
Page
;
import
us.codecraft.webmagic.
ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.selector.Selectable
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -15,13 +14,10 @@ import java.util.Map;
...
@@ -15,13 +14,10 @@ import java.util.Map;
public
class
ConsolePipeline
implements
Pipeline
{
public
class
ConsolePipeline
implements
Pipeline
{
@Override
@Override
public
void
process
(
Page
page
,
Task
task
)
{
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
System
.
out
.
println
(
"get page: "
+
page
.
getUrl
());
System
.
out
.
println
(
"get page: "
+
resultItems
.
getRequest
().
getUrl
());
for
(
Map
.
Entry
<
String
,
Selectable
>
entry
:
page
.
getFields
().
entrySet
())
{
for
(
Map
.
Entry
<
String
,
Object
>
entry
:
resultItems
.
getAll
().
entrySet
())
{
System
.
out
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
().
toStrings
());
System
.
out
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
());
}
if
(
page
.
getExtra
()!=
null
){
System
.
out
.
println
(
page
.
getExtra
());
}
}
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
View file @
65dc3721
...
@@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
...
@@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.
Page
;
import
us.codecraft.webmagic.
ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
java.io.File
;
import
java.io.File
;
import
java.io.FileWriter
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.PrintWriter
;
import
java.io.PrintWriter
;
import
java.util.Map
;
/**
/**
* 持久化到文件的接口。
* 持久化到文件的接口。
...
@@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
...
@@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
}
}
@Override
@Override
public
void
process
(
Page
page
,
Task
task
)
{
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
String
path
=
this
.
path
+
"/"
+
task
.
getUUID
()
+
"/"
;
String
path
=
this
.
path
+
"/"
+
task
.
getUUID
()
+
"/"
;
File
file
=
new
File
(
path
);
File
file
=
new
File
(
path
);
if
(!
file
.
exists
())
{
if
(!
file
.
exists
())
{
file
.
mkdirs
();
file
.
mkdirs
();
}
}
try
{
try
{
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
page
.
getUrl
().
toString
())));
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
resultItems
.
getRequest
().
getUrl
())));
printWriter
.
println
(
"url:\t"
+
page
.
getUrl
());
printWriter
.
println
(
"url:\t"
+
resultItems
.
getRequest
().
getUrl
());
printWriter
.
println
(
"html:\t"
+
page
.
getHtml
());
for
(
Map
.
Entry
<
String
,
Object
>
entry
:
resultItems
.
getAll
().
entrySet
())
{
printWriter
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
());
}
printWriter
.
close
();
printWriter
.
close
();
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
logger
.
warn
(
"write file error"
,
e
);
logger
.
warn
(
"write file error"
,
e
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
View file @
65dc3721
package
us
.
codecraft
.
webmagic
.
pipeline
;
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
us.codecraft.webmagic.
Page
;
import
us.codecraft.webmagic.
ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
...
@@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
...
@@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
*/
*/
public
interface
Pipeline
{
public
interface
Pipeline
{
public
void
process
(
Page
page
,
Task
task
);
public
void
process
(
ResultItems
resultItems
,
Task
task
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
View file @
65dc3721
...
@@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
...
@@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
urlPattern
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
urlPattern
).
all
();
//调用page.addTargetRequests()方法添加待抓取链接
//调用page.addTargetRequests()方法添加待抓取链接
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
//xpath方式抽取
//xpath方式抽取
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
));
//sc表示使用Readability技术抽取正文
//sc表示使用Readability技术抽取正文
page
.
putField
(
"html"
,
page
.
getHtml
().
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
65dc3721
...
@@ -82,14 +82,14 @@ public class PlainText implements Selectable {
...
@@ -82,14 +82,14 @@ public class PlainText implements Selectable {
}
}
@Override
@Override
public
List
<
String
>
toStrings
()
{
public
List
<
String
>
all
()
{
return
strings
;
return
strings
;
}
}
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
if
(
CollectionUtils
.
isNotEmpty
(
toStrings
()))
{
if
(
CollectionUtils
.
isNotEmpty
(
all
()))
{
return
toStrings
().
get
(
0
);
return
all
().
get
(
0
);
}
else
{
}
else
{
return
null
;
return
null
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
65dc3721
...
@@ -69,5 +69,5 @@ public interface Selectable {
...
@@ -69,5 +69,5 @@ public interface Selectable {
*
*
* @return multi string result
* @return multi string result
*/
*/
public
List
<
String
>
toStrings
();
public
List
<
String
>
all
();
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
65dc3721
...
@@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
...
@@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
public
void
testOschina
()
{
public
void
testOschina
()
{
Html
html1
=
new
Html
(
html
);
Html
html1
=
new
Html
(
html
);
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
toStrings
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
all
());
}
}
}
}
webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
View file @
65dc3721
...
@@ -4,7 +4,7 @@ import freemarker.template.Configuration;
...
@@ -4,7 +4,7 @@ import freemarker.template.Configuration;
import
freemarker.template.Template
;
import
freemarker.template.Template
;
import
freemarker.template.TemplateException
;
import
freemarker.template.TemplateException
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
us.codecraft.webmagic.
Page
;
import
us.codecraft.webmagic.
ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
java.io.File
;
import
java.io.File
;
...
@@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
...
@@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
@Override
@Override
public
void
process
(
Page
page
,
Task
task
)
{
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
if
(
page
.
isSkip
())
{
if
(
resultItems
.
isSkip
())
{
return
;
return
;
}
}
String
path
=
this
.
path
+
""
+
task
.
getUUID
()
+
"/"
;
String
path
=
this
.
path
+
""
+
task
.
getUUID
()
+
"/"
;
...
@@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
...
@@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
file
.
mkdirs
();
file
.
mkdirs
();
}
}
try
{
try
{
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
page
.
getUrl
().
toString
())
+
".html"
));
PrintWriter
printWriter
=
new
PrintWriter
(
new
FileWriter
(
path
+
DigestUtils
.
md5Hex
(
resultItems
.
getRequest
().
getUrl
())
+
".html"
));
template
.
process
(
page
.
getFields
(),
printWriter
);
template
.
process
(
resultItems
.
getAll
(),
printWriter
);
printWriter
.
close
();
printWriter
.
close
();
}
catch
(
TemplateException
e
)
{
}
catch
(
TemplateException
e
)
{
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
...
...
webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
View file @
65dc3721
...
@@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
...
@@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
@Override
@Override
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
Jedis
jedis
=
pool
.
getResource
();
//使用SortedSet进行url去重
if
(
jedis
.
zrank
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
())==
null
){
if
(
jedis
.
zrank
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
())==
null
){
//使用List保存队列
jedis
.
rpush
(
QUEUE_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
jedis
.
rpush
(
QUEUE_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
jedis
.
zadd
(
SET_PREFIX
+
task
.
getUUID
(),
System
.
currentTimeMillis
(),
request
.
getUrl
());
jedis
.
zadd
(
SET_PREFIX
+
task
.
getUUID
(),
System
.
currentTimeMillis
(),
request
.
getUrl
());
}
}
...
...
webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
View file @
65dc3721
package
us
.
codecraft
.
webmagic
.
scheduler
;
package
us
.
codecraft
.
webmagic
.
scheduler
;
import
org.junit.Before
;
import
org.junit.Before
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
...
@@ -20,6 +21,7 @@ public class RedisSchedulerTest {
...
@@ -20,6 +21,7 @@ public class RedisSchedulerTest {
redisScheduler
=
new
RedisScheduler
(
"localhost"
);
redisScheduler
=
new
RedisScheduler
(
"localhost"
);
}
}
@Ignore
(
"environment depended"
)
@Test
@Test
public
void
test
()
{
public
void
test
()
{
Task
task
=
new
Task
()
{
Task
task
=
new
Task
()
{
...
@@ -35,7 +37,6 @@ public class RedisSchedulerTest {
...
@@ -35,7 +37,6 @@ public class RedisSchedulerTest {
};
};
redisScheduler
.
push
(
new
Request
(
"http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"
),
task
);
redisScheduler
.
push
(
new
Request
(
"http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"
),
task
);
Request
poll
=
redisScheduler
.
poll
(
task
);
Request
poll
=
redisScheduler
.
poll
(
task
);
System
.
out
.
println
(
poll
.
getUrl
());
}
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
View file @
65dc3721
...
@@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
...
@@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
//a()表示提取链接,links()表示提取所有链接
//a()表示提取链接,links()表示提取所有链接
//getHtml()返回Html对象,支持链式调用
//getHtml()返回Html对象,支持链式调用
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容
//toString()表示取单条结果,
toStrings
()表示取多条
//toString()表示取单条结果,
all
()表示取多条
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(.*/post/.*)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(.*/post/.*)"
).
all
();
//使用page.addTargetRequests()方法将待抓取的链接加入队列
//使用page.addTargetRequests()方法将待抓取的链接加入队列
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
//page.putField(key,value)将抽取的内容加入结果Map
//page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取
//x()和xs()使用xpath进行抽取
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
).
regex
(
"(.*?)\\|"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
).
regex
(
"(.*?)\\|"
)
.
toString
()
);
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"date"
,
page
.
getUrl
().
regex
(
"post/(\\d+-\\d+-\\d+)/"
));
page
.
putField
(
"date"
,
page
.
getUrl
().
regex
(
"post/(\\d+-\\d+-\\d+)/"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
View file @
65dc3721
...
@@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
...
@@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"http://info-search-web121361\\.alpha\\.dp:8080/search/.*"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"http://info-search-web121361\\.alpha\\.dp:8080/search/.*"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
View file @
65dc3721
...
@@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
...
@@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)"
).
toStrings
();
requests
=
page
.
getHtml
().
links
().
regex
(
"(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
View file @
65dc3721
...
@@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
...
@@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd[@class='w133']"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd[@class='w133']"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
View file @
65dc3721
...
@@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
...
@@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
final
List
<
String
>
requests
=
page
.
getHtml
().
links
().
toStrings
();
final
List
<
String
>
requests
=
page
.
getHtml
().
links
().
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
View file @
65dc3721
...
@@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
...
@@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='neirong']//h1[@class='ph xs5']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='neirong']//h1[@class='ph xs5']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
View file @
65dc3721
...
@@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
...
@@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
xpath
(
"//a[@class=\"area_link flat_btn\"]/@href"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
xpath
(
"//a[@class=\"area_link flat_btn\"]/@href"
).
all
();
if
(
requests
.
size
()
>
2
)
{
if
(
requests
.
size
()
>
2
)
{
requests
=
requests
.
subList
(
0
,
2
);
requests
=
requests
.
subList
(
0
,
2
);
}
}
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
toStrings
());
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
all
());
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
View file @
65dc3721
...
@@ -14,7 +14,7 @@ import java.util.List;
...
@@ -14,7 +14,7 @@ import java.util.List;
public
class
NjuBBSProcessor
implements
PageProcessor
{
public
class
NjuBBSProcessor
implements
PageProcessor
{
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
View file @
65dc3721
...
@@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
links
().
regex
(
"(http://my\\.oschina\\.net)"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
links
().
regex
(
"(http://my\\.oschina\\.net)"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
View file @
65dc3721
...
@@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
...
@@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='QTitle']/h1/a"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='QTitle']/h1/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='Question']//div[@class='Content']/div[@class='detail']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='Question']//div[@class='Content']/div[@class='detail']"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
View file @
65dc3721
...
@@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
...
@@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}"
).
toStrings
();
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='content']//h2/a"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
View file @
65dc3721
...
@@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
...
@@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
page
.
addTargetRequests
(
page
.
getHtml
().
xpath
(
"//div[@class='articalfrontback SG_j_linedot1 clearfix']"
).
links
().
toStrings
());
page
.
addTargetRequests
(
page
.
getHtml
().
xpath
(
"//div[@class='articalfrontback SG_j_linedot1 clearfix']"
).
links
().
all
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='articalTitle']/h2"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@id='articlebody']//div[@class='articalContent']"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
regex
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
page
.
putField
(
"id"
,
page
.
getUrl
().
regex
(
"http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"
));
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
View file @
65dc3721
...
@@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
...
@@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}"
).
toStrings
();
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='post_head']//span[@class='s_title']//b"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@id='post_head']//span[@class='s_title']//b"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"body"
,
page
.
getHtml
().
smartContent
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment