Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
f08ffc34
Commit
f08ffc34
authored
Aug 01, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
rename
parent
c5cf0564
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
30 additions
and
30 deletions
+30
-30
ExtractBy.java
...main/java/us/codecraft/webmagic/annotation/ExtractBy.java
+1
-1
FieldExtractor.java
...java/us/codecraft/webmagic/annotation/FieldExtractor.java
+2
-2
ObjectPageProcessor.java
...us/codecraft/webmagic/annotation/ObjectPageProcessor.java
+12
-12
PageModelExtractor.java
.../us/codecraft/webmagic/annotation/PageModelExtractor.java
+13
-13
Blog.java
.../src/test/java/us/codecraft/webmagic/annotation/Blog.java
+2
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/
Fetcher
.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/
ExtractBy
.java
View file @
f08ffc34
...
...
@@ -11,7 +11,7 @@ import java.lang.annotation.Target;
*/
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Target
({
ElementType
.
FIELD
})
public
@interface
Fetcher
{
public
@interface
ExtractBy
{
String
value
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Field
Fetche
r.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Field
Extracto
r.java
View file @
f08ffc34
...
...
@@ -9,13 +9,13 @@ import java.lang.reflect.Field;
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
*/
class
Field
Fetche
r
{
class
Field
Extracto
r
{
private
final
Field
field
;
private
final
Selector
selector
;
Field
Fetche
r
(
Field
field
,
Selector
selector
)
{
Field
Extracto
r
(
Field
field
,
Selector
selector
)
{
this
.
field
=
field
;
this
.
selector
=
selector
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
View file @
f08ffc34
...
...
@@ -18,36 +18,36 @@ import java.util.regex.Pattern;
*/
public
class
ObjectPageProcessor
implements
PageProcessor
{
private
List
<
PageModel
Fetcher
>
pageModelFetche
rList
;
private
List
<
PageModel
Extractor
>
pageModelExtracto
rList
;
private
Site
site
;
private
Set
<
Pattern
>
targetUrlPatterns
;
public
static
ObjectPageProcessor
create
(
Site
site
,
Class
...
clazzs
)
{
List
<
PageModel
Fetcher
>
pageModelFetcherList
=
new
ArrayList
<
PageModelFetche
r
>();
List
<
PageModel
Extractor
>
pageModelExtractorList
=
new
ArrayList
<
PageModelExtracto
r
>();
for
(
Class
clazz
:
clazzs
)
{
PageModel
Fetcher
pageModelFetcher
=
PageModelFetche
r
.
create
(
clazz
);
pageModel
FetcherList
.
add
(
pageModelFetche
r
);
PageModel
Extractor
pageModelExtractor
=
PageModelExtracto
r
.
create
(
clazz
);
pageModel
ExtractorList
.
add
(
pageModelExtracto
r
);
}
ObjectPageProcessor
objectPageProcessor
=
new
ObjectPageProcessor
(
site
,
pageModel
Fetche
rList
);
ObjectPageProcessor
objectPageProcessor
=
new
ObjectPageProcessor
(
site
,
pageModel
Extracto
rList
);
return
objectPageProcessor
;
}
private
ObjectPageProcessor
(
Site
site
,
List
<
PageModel
Fetcher
>
pageModelFetche
rList
)
{
private
ObjectPageProcessor
(
Site
site
,
List
<
PageModel
Extractor
>
pageModelExtracto
rList
)
{
this
.
site
=
site
;
this
.
pageModel
FetcherList
=
pageModelFetche
rList
;
this
.
pageModel
ExtractorList
=
pageModelExtracto
rList
;
targetUrlPatterns
=
new
HashSet
<
Pattern
>();
for
(
PageModel
Fetcher
pageModelFetcher
:
pageModelFetche
rList
)
{
targetUrlPatterns
.
addAll
(
pageModel
Fetche
r
.
getTargetUrlPatterns
());
for
(
PageModel
Extractor
pageModelExtractor
:
pageModelExtracto
rList
)
{
targetUrlPatterns
.
addAll
(
pageModel
Extracto
r
.
getTargetUrlPatterns
());
}
}
@Override
public
void
process
(
Page
page
)
{
for
(
PageModel
Fetcher
pageModelFetcher
:
pageModelFetche
rList
)
{
Object
process
=
pageModel
Fetche
r
.
process
(
page
);
page
.
putField
(
pageModel
Fetche
r
.
getClazz
().
getCanonicalName
(),
process
);
for
(
PageModel
Extractor
pageModelExtractor
:
pageModelExtracto
rList
)
{
Object
process
=
pageModel
Extracto
r
.
process
(
page
);
page
.
putField
(
pageModel
Extracto
r
.
getClazz
().
getCanonicalName
(),
process
);
}
for
(
String
link
:
page
.
getHtml
().
links
().
all
())
{
for
(
Pattern
targetUrlPattern
:
targetUrlPatterns
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModel
Fetche
r.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModel
Extracto
r.java
View file @
f08ffc34
...
...
@@ -17,30 +17,30 @@ import java.util.regex.Pattern;
* @date: 13-8-1 <br>
* Time: 下午9:33 <br>
*/
class
PageModel
Fetche
r
{
class
PageModel
Extracto
r
{
private
List
<
Pattern
>
targetUrlPatterns
;
private
Class
clazz
;
private
List
<
Field
Fetcher
>
fieldFetche
rs
;
private
List
<
Field
Extractor
>
fieldExtracto
rs
;
public
static
PageModel
Fetche
r
create
(
Class
clazz
)
{
PageModel
Fetcher
pageModelFetcher
=
new
PageModelFetche
r
();
pageModel
Fetche
r
.
init
(
clazz
);
return
pageModel
Fetche
r
;
public
static
PageModel
Extracto
r
create
(
Class
clazz
)
{
PageModel
Extractor
pageModelExtractor
=
new
PageModelExtracto
r
();
pageModel
Extracto
r
.
init
(
clazz
);
return
pageModel
Extracto
r
;
}
private
void
init
(
Class
clazz
)
{
this
.
clazz
=
clazz
;
initTargetUrlPatterns
();
field
Fetchers
=
new
ArrayList
<
FieldFetche
r
>();
field
Extractors
=
new
ArrayList
<
FieldExtracto
r
>();
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
Fetcher
fetcher
=
field
.
getAnnotation
(
Fetcher
.
class
);
String
value
=
fetcher
.
value
();
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
String
value
=
extractBy
.
value
();
Selector
selector
;
switch
(
fetcher
.
type
())
{
switch
(
extractBy
.
type
())
{
case
Css:
selector
=
new
CssSelector
(
value
);
break
;
...
...
@@ -53,7 +53,7 @@ class PageModelFetcher {
default
:
selector
=
new
XpathSelector
(
value
);
}
field
Fetchers
.
add
(
new
FieldFetche
r
(
field
,
selector
));
field
Extractors
.
add
(
new
FieldExtracto
r
(
field
,
selector
));
}
}
...
...
@@ -83,8 +83,8 @@ class PageModelFetcher {
Object
o
=
null
;
try
{
o
=
clazz
.
newInstance
();
for
(
Field
Fetcher
fieldFetcher
:
fieldFetche
rs
)
{
field
Fetcher
.
getField
().
set
(
o
,
fieldFetche
r
.
getSelector
().
select
(
page
.
getHtml
().
toString
()));
for
(
Field
Extractor
fieldExtractor
:
fieldExtracto
rs
)
{
field
Extractor
.
getField
().
set
(
o
,
fieldExtracto
r
.
getSelector
().
select
(
page
.
getHtml
().
toString
()));
}
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
View file @
f08ffc34
...
...
@@ -8,10 +8,10 @@ package us.codecraft.webmagic.annotation;
@TargetUrl
(
"http://my.oschina.net/flashsword/blog/*"
)
public
class
Blog
{
@
Fetcher
(
"//title"
)
@
ExtractBy
(
"//title"
)
private
String
title
;
@
Fetcher
(
value
=
"div.BlogContent"
,
type
=
Fetcher
.
Type
.
Css
)
@
ExtractBy
(
value
=
"div.BlogContent"
,
type
=
ExtractBy
.
Type
.
Css
)
private
String
content
;
@Override
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment