Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
b393e383
Commit
b393e383
authored
Aug 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add multi entity extract
parent
bfadac75
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
86 additions
and
15 deletions
+86
-15
ObjectPageProcessor.java
...in/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
+1
-1
ObjectPipeline.java
...rc/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java
+12
-1
PageModelExtractor.java
...ain/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+42
-13
OschinaAnswer.java
.../java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java
+31
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
View file @
b393e383
...
...
@@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
Object
process
=
pageModelExtractor
.
process
(
page
);
if
(
process
==
null
)
{
if
(
process
==
null
||
(
process
instanceof
List
&&
((
List
)
process
).
size
()
==
0
)
)
{
page
.
getResultItems
().
setSkip
(
true
);
}
postProcessPageModel
(
pageModelExtractor
.
getClazz
(),
process
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java
View file @
b393e383
...
...
@@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
java.lang.annotation.Annotation
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
...
...
@@ -32,7 +34,16 @@ public class ObjectPipeline implements Pipeline {
for
(
Map
.
Entry
<
Class
,
PageModelPipeline
>
classPageModelPipelineEntry
:
pageModelPipelines
.
entrySet
())
{
Object
o
=
resultItems
.
get
(
classPageModelPipelineEntry
.
getKey
().
getCanonicalName
());
if
(
o
!=
null
)
{
classPageModelPipelineEntry
.
getValue
().
process
(
o
,
task
);
Annotation
annotation
=
classPageModelPipelineEntry
.
getKey
().
getAnnotation
(
ExtractBy
.
class
);
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
if
(
extractBy
.
multi
())
{
List
<
Object
>
list
=
(
List
<
Object
>)
o
;
for
(
Object
o1
:
list
)
{
classPageModelPipelineEntry
.
getValue
().
process
(
o1
,
task
);
}
}
else
{
classPageModelPipelineEntry
.
getValue
().
process
(
o
,
task
);
}
}
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
View file @
b393e383
...
...
@@ -31,6 +31,8 @@ class PageModelExtractor {
private
List
<
FieldExtractor
>
fieldExtractors
;
private
Extractor
extractor
;
public
static
PageModelExtractor
create
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
pageModelExtractor
.
init
(
clazz
);
...
...
@@ -39,7 +41,7 @@ class PageModelExtractor {
private
void
init
(
Class
clazz
)
{
this
.
clazz
=
clazz
;
init
TargetUrlPattern
s
();
init
ClassExtractor
s
();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
...
...
@@ -107,7 +109,7 @@ class PageModelExtractor {
}
}
private
void
init
TargetUrlPattern
s
()
{
private
void
init
ClassExtractor
s
()
{
Annotation
annotation
=
clazz
.
getAnnotation
(
TargetUrl
.
class
);
if
(
annotation
==
null
)
{
targetUrlPatterns
.
add
(
Pattern
.
compile
(
".*"
));
...
...
@@ -115,9 +117,9 @@ class PageModelExtractor {
TargetUrl
targetUrl
=
(
TargetUrl
)
annotation
;
String
[]
value
=
targetUrl
.
value
();
for
(
String
s
:
value
)
{
targetUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
));
targetUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)
+
")"
));
}
if
(!
targetUrl
.
sourceRegion
().
equals
(
""
)){
if
(!
targetUrl
.
sourceRegion
().
equals
(
""
))
{
targetUrlRegionSelector
=
new
Xpath2Selector
(
targetUrl
.
sourceRegion
());
}
}
...
...
@@ -126,12 +128,17 @@ class PageModelExtractor {
HelpUrl
helpUrl
=
(
HelpUrl
)
annotation
;
String
[]
value
=
helpUrl
.
value
();
for
(
String
s
:
value
)
{
helpUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
));
helpUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)
+
")"
));
}
if
(!
helpUrl
.
sourceRegion
().
equals
(
""
)){
if
(!
helpUrl
.
sourceRegion
().
equals
(
""
))
{
helpUrlRegionSelector
=
new
Xpath2Selector
(
helpUrl
.
sourceRegion
());
}
}
annotation
=
clazz
.
getAnnotation
(
ExtractBy
.
class
);
if
(
annotation
!=
null
)
{
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
extractor
=
new
Extractor
(
new
Xpath2Selector
(
extractBy
.
value
()),
Extractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
}
}
public
Object
process
(
Page
page
)
{
...
...
@@ -144,6 +151,28 @@ class PageModelExtractor {
if
(!
matched
)
{
return
null
;
}
if
(
extractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
());
}
else
{
if
(
extractor
.
multi
){
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
extractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
if
(
o
!=
null
){
os
.
add
(
o
);
}
}
return
os
;
}
else
{
String
select
=
extractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
return
o
;
}
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
Object
o
=
null
;
try
{
o
=
clazz
.
newInstance
();
...
...
@@ -152,38 +181,38 @@ class PageModelExtractor {
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
()
);
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
()
);
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
}
if
((
value
==
null
||
value
.
size
()
==
0
)
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
)
;
return
null
;
}
setField
(
o
,
fieldExtractor
,
value
);
}
else
{
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
()
);
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
()
);
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
}
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
)
;
return
null
;
}
setField
(
o
,
fieldExtractor
,
value
);
}
}
if
(
AfterExtractor
.
class
.
isAssignableFrom
(
clazz
))
{
((
AfterExtractor
)
o
).
afterProcess
(
page
);
((
AfterExtractor
)
o
).
afterProcess
(
page
);
}
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java
0 → 100644
View file @
b393e383
package
us
.
codecraft
.
webmagic
.
oo
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.oo.*
;
/**
 * Sample page model describing one answer on an oschina.net question page.
 *
 * <p>The class-level {@code @ExtractBy} selects every {@code li} with class
 * {@code Answer} inside the {@code ul} with class {@code list}; because
 * {@code multi = true} is set, the region is meant to match several entries
 * per page rather than a single one. The field-level XPath expressions below
 * describe what is pulled out for each entry.
 *
 * <p>{@code @TargetUrl} names the question-detail URLs this model applies to,
 * and {@code @HelpUrl} names the URLs under the question section that are
 * declared as help pages.
 *
 * @author yihua.huang@dianping.com <br>
 * date: 13-8-3 <br>
 * Time: 8:25 PM <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor {

    /**
     * Value of the {@code title} attribute of an {@code img} element
     * (presumably the answering user's name -- verify against the page markup).
     */
    @ExtractBy("//img/@title")
    private String user;

    /** Content of the {@code div} with class {@code detail}; explicitly declared optional via {@code notNull = false}. */
    @ExtractBy(value = "//div[@class='detail']", notNull = false)
    private String content;

    /**
     * Callback required by {@link AfterExtractor}; this sample performs no
     * additional work here.
     *
     * @param page the page being processed
     */
    @Override
    public void afterProcess(Page page) {
        // Intentionally left empty: the sample needs no post-processing.
    }

    /**
     * Starts a crawl from a single oschina.net question URL using this class
     * as the page model.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String startUrl = "http://www.oschina.net/question/567527_120597";
        Site site = Site.me().addStartUrl(startUrl);
        OOSpider.create(site, OschinaAnswer.class).run();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment