Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
14562855
Commit
14562855
authored
Aug 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update afterextract api
parent
aca165b1
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
52 additions
and
27 deletions
+52
-27
AfterExtractor.java
...rc/main/java/us/codecraft/webmagic/oo/AfterExtractor.java
+2
-2
ObjectPageProcessor.java
...in/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
+21
-7
PageModelExtractor.java
...ain/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+26
-15
OschinaBlog.java
...e/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
+3
-3
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java
View file @
14562855
...
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
...
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
* @date: 13-8-3 <br>
* @date: 13-8-3 <br>
* Time: 上午9:42 <br>
* Time: 上午9:42 <br>
*/
*/
public
interface
AfterExtractor
<
T
>
{
public
interface
AfterExtractor
{
public
void
afterProcess
(
Page
page
,
T
t
);
public
void
afterProcess
(
Page
page
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
View file @
14562855
...
@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
...
@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.Selector
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashSet
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Set
;
import
java.util.Set
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
/**
/**
...
@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
...
@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
}
}
public
ObjectPageProcessor
addPageModel
(
Class
clazz
){
public
ObjectPageProcessor
addPageModel
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
PageModelExtractor
.
create
(
clazz
);
PageModelExtractor
pageModelExtractor
=
PageModelExtractor
.
create
(
clazz
);
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getTargetUrlPatterns
());
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getTargetUrlPatterns
());
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getHelpUrlPatterns
());
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getHelpUrlPatterns
());
...
@@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
...
@@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
Object
process
=
pageModelExtractor
.
process
(
page
);
Object
process
=
pageModelExtractor
.
process
(
page
);
if
(
process
==
null
)
{
if
(
process
==
null
)
{
page
.
getResultItems
().
setSkip
(
true
);
page
.
getResultItems
().
setSkip
(
true
);
}
}
postProcessPageModel
(
pageModelExtractor
.
getClazz
(),
process
);
postProcessPageModel
(
pageModelExtractor
.
getClazz
(),
process
);
page
.
putField
(
pageModelExtractor
.
getClazz
().
getCanonicalName
(),
process
);
page
.
putField
(
pageModelExtractor
.
getClazz
().
getCanonicalName
(),
process
);
extractLinks
(
page
,
pageModelExtractor
.
getHelpUrlRegionSelector
(),
pageModelExtractor
.
getHelpUrlPatterns
());
extractLinks
(
page
,
pageModelExtractor
.
getTargetUrlRegionSelector
(),
pageModelExtractor
.
getTargetUrlPatterns
());
}
}
for
(
String
link
:
page
.
getHtml
().
links
().
all
())
{
}
for
(
Pattern
targetUrlPattern
:
targetUrlPatterns
)
{
if
(
targetUrlPattern
.
matcher
(
link
).
matches
()){
private
void
extractLinks
(
Page
page
,
Selector
urlRegionSelector
,
List
<
Pattern
>
urlPatterns
)
{
page
.
addTargetRequest
(
new
Request
(
link
));
List
<
String
>
links
;
if
(
urlRegionSelector
==
null
)
{
links
=
page
.
getHtml
().
links
().
all
();
}
else
{
links
=
urlRegionSelector
.
selectList
(
page
.
getHtml
().
toString
());
}
for
(
String
link
:
links
)
{
for
(
Pattern
targetUrlPattern
:
urlPatterns
)
{
Matcher
matcher
=
targetUrlPattern
.
matcher
(
link
);
if
(
matcher
.
find
())
{
page
.
addTargetRequest
(
new
Request
(
matcher
.
group
(
1
)));
}
}
}
}
}
}
}
}
protected
void
postProcessPageModel
(
Class
clazz
,
Object
object
){
protected
void
postProcessPageModel
(
Class
clazz
,
Object
object
)
{
}
}
@Override
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
View file @
14562855
...
@@ -21,14 +21,16 @@ class PageModelExtractor {
...
@@ -21,14 +21,16 @@ class PageModelExtractor {
private
List
<
Pattern
>
targetUrlPatterns
=
new
ArrayList
<
Pattern
>();
private
List
<
Pattern
>
targetUrlPatterns
=
new
ArrayList
<
Pattern
>();
private
Selector
targetUrlRegionSelector
;
private
List
<
Pattern
>
helpUrlPatterns
=
new
ArrayList
<
Pattern
>();
private
List
<
Pattern
>
helpUrlPatterns
=
new
ArrayList
<
Pattern
>();
private
Selector
helpUrlRegionSelector
;
private
Class
clazz
;
private
Class
clazz
;
private
List
<
FieldExtractor
>
fieldExtractors
;
private
List
<
FieldExtractor
>
fieldExtractors
;
private
AfterExtractor
afterExtractor
;
public
static
PageModelExtractor
create
(
Class
clazz
)
{
public
static
PageModelExtractor
create
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
pageModelExtractor
.
init
(
clazz
);
pageModelExtractor
.
init
(
clazz
);
...
@@ -39,13 +41,6 @@ class PageModelExtractor {
...
@@ -39,13 +41,6 @@ class PageModelExtractor {
this
.
clazz
=
clazz
;
this
.
clazz
=
clazz
;
initTargetUrlPatterns
();
initTargetUrlPatterns
();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
if
(
AfterExtractor
.
class
.
isAssignableFrom
(
clazz
))
{
try
{
afterExtractor
=
(
AfterExtractor
)
clazz
.
newInstance
();
}
catch
(
Exception
e
)
{
throw
new
IllegalArgumentException
(
e
);
}
}
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
field
.
setAccessible
(
true
);
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
...
@@ -117,16 +112,24 @@ class PageModelExtractor {
...
@@ -117,16 +112,24 @@ class PageModelExtractor {
if
(
annotation
==
null
)
{
if
(
annotation
==
null
)
{
targetUrlPatterns
.
add
(
Pattern
.
compile
(
".*"
));
targetUrlPatterns
.
add
(
Pattern
.
compile
(
".*"
));
}
else
{
}
else
{
String
[]
value
=
((
TargetUrl
)
annotation
).
value
();
TargetUrl
targetUrl
=
(
TargetUrl
)
annotation
;
String
[]
value
=
targetUrl
.
value
();
for
(
String
s
:
value
)
{
for
(
String
s
:
value
)
{
targetUrlPatterns
.
add
(
Pattern
.
compile
(
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)));
targetUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
));
}
if
(!
targetUrl
.
sourceRegion
().
equals
(
""
)){
targetUrlRegionSelector
=
new
Xpath2Selector
(
targetUrl
.
sourceRegion
());
}
}
}
}
annotation
=
clazz
.
getAnnotation
(
HelpUrl
.
class
);
annotation
=
clazz
.
getAnnotation
(
HelpUrl
.
class
);
if
(
annotation
!=
null
)
{
if
(
annotation
!=
null
)
{
String
[]
value
=
((
HelpUrl
)
annotation
).
value
();
HelpUrl
helpUrl
=
(
HelpUrl
)
annotation
;
String
[]
value
=
helpUrl
.
value
();
for
(
String
s
:
value
)
{
for
(
String
s
:
value
)
{
helpUrlPatterns
.
add
(
Pattern
.
compile
(
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)));
helpUrlPatterns
.
add
(
Pattern
.
compile
(
"("
+
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
));
}
if
(!
helpUrl
.
sourceRegion
().
equals
(
""
)){
helpUrlRegionSelector
=
new
Xpath2Selector
(
helpUrl
.
sourceRegion
());
}
}
}
}
}
}
...
@@ -179,8 +182,8 @@ class PageModelExtractor {
...
@@ -179,8 +182,8 @@ class PageModelExtractor {
setField
(
o
,
fieldExtractor
,
value
);
setField
(
o
,
fieldExtractor
,
value
);
}
}
}
}
if
(
afterExtractor
!=
null
)
{
if
(
AfterExtractor
.
class
.
isAssignableFrom
(
clazz
)
)
{
afterExtractor
.
afterProcess
(
page
,
o
);
((
AfterExtractor
)
o
).
afterProcess
(
page
);
}
}
}
catch
(
InstantiationException
e
)
{
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
@@ -210,4 +213,12 @@ class PageModelExtractor {
...
@@ -210,4 +213,12 @@ class PageModelExtractor {
List
<
Pattern
>
getHelpUrlPatterns
()
{
List
<
Pattern
>
getHelpUrlPatterns
()
{
return
helpUrlPatterns
;
return
helpUrlPatterns
;
}
}
Selector
getTargetUrlRegionSelector
()
{
return
targetUrlRegionSelector
;
}
Selector
getHelpUrlRegionSelector
()
{
return
helpUrlRegionSelector
;
}
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
View file @
14562855
...
@@ -9,8 +9,8 @@ import java.util.List;
...
@@ -9,8 +9,8 @@ import java.util.List;
* @date: 13-8-1 <br>
* @date: 13-8-1 <br>
* Time: 下午10:18 <br>
* Time: 下午10:18 <br>
*/
*/
@TargetUrl
(
"http://my.oschina.net/flashsword/blog/*
"
)
@TargetUrl
(
value
=
"http://my.oschina.net/flashsword/blog/*"
,
sourceRegion
=
"//div[@class='BlogLinks']
"
)
public
class
OschinaBlog
implements
AfterExtractor
<
OschinaBlog
>
{
public
class
OschinaBlog
implements
AfterExtractor
{
@ExtractBy
(
"//title"
)
@ExtractBy
(
"//title"
)
private
String
title
;
private
String
title
;
...
@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
...
@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
private
List
<
String
>
tags
;
private
List
<
String
>
tags
;
@Override
@Override
public
void
afterProcess
(
Page
page
,
OschinaBlog
oschinaBlog
)
{
public
void
afterProcess
(
Page
page
)
{
content
=
null
;
content
=
null
;
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment