Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
629f8ac2
Commit
629f8ac2
authored
Aug 05, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add extractors chain
parent
27ce3fc1
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
148 additions
and
34 deletions
+148
-34
ExtractBy2.java
...src/main/java/us/codecraft/webmagic/model/ExtractBy2.java
+23
-0
ExtractBy3.java
...src/main/java/us/codecraft/webmagic/model/ExtractBy3.java
+23
-0
Extractor.java
.../src/main/java/us/codecraft/webmagic/model/Extractor.java
+9
-1
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+93
-33
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java
0 → 100644
View file @
629f8ac2
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Defines an extraction rule for a field.<br>
 * Retained at runtime so it can be read reflectively from the annotated field.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface ExtractBy2 {

    /**
     * Kinds of expression that {@link #value()} may hold.
     */
    enum Type {
        XPath2,
        XPath,
        Regex,
        Css
    }

    /**
     * The extraction expression, interpreted according to {@link #type()}.
     */
    String value();

    /**
     * How {@link #value()} is interpreted; {@link Type#XPath2} when not specified.
     */
    Type type() default Type.XPath2;
}
webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java
0 → 100644
View file @
629f8ac2
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Defines an extraction rule for a field.<br>
 * Retained at runtime so it can be read reflectively from the annotated field.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface ExtractBy3 {

    /**
     * Kinds of expression that {@link #value()} may hold.
     */
    enum Type {
        XPath2,
        XPath,
        Regex,
        Css
    }

    /**
     * The extraction expression, interpreted according to {@link #type()}.
     */
    String value();

    /**
     * How {@link #value()} is interpreted; {@link Type#XPath2} when not specified.
     */
    Type type() default Type.XPath2;
}
webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
View file @
629f8ac2
...
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.selector.Selector;
...
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.selector.Selector;
*/
*/
class
Extractor
{
class
Extractor
{
protected
final
Selector
selector
;
protected
Selector
selector
;
protected
final
Source
source
;
protected
final
Source
source
;
...
@@ -37,4 +37,12 @@ class Extractor {
...
@@ -37,4 +37,12 @@ class Extractor {
boolean
isNotNull
()
{
boolean
isNotNull
()
{
return
notNull
;
return
notNull
;
}
}
boolean
isMulti
()
{
return
multi
;
}
void
setSelector
(
Selector
selector
)
{
this
.
selector
=
selector
;
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
629f8ac2
...
@@ -14,6 +14,7 @@ import java.util.regex.Pattern;
...
@@ -14,6 +14,7 @@ import java.util.regex.Pattern;
/**
/**
* Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br>
* Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* @date: 13-8-1 <br>
* Time: 下午9:33 <br>
* Time: 下午9:33 <br>
...
@@ -46,41 +47,54 @@ class PageModelExtractor {
...
@@ -46,41 +47,54 @@ class PageModelExtractor {
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
field
.
setAccessible
(
true
);
getAnnotationExtractBy
(
clazz
,
field
);
FieldExtractor
fieldExtractor
=
getAnnotationExtractBy
(
clazz
,
field
);
getAnnotationExtractByRaw
(
clazz
,
field
);
FieldExtractor
fieldExtractorTmp
=
getAnnotationExtractByRaw
(
clazz
,
field
);
getAnnotationExtractByUrl
(
clazz
,
field
);
if
(
fieldExtractor
!=
null
&&
fieldExtractorTmp
!=
null
)
{
throw
new
IllegalStateException
(
"Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"
);
}
else
if
(
fieldExtractor
==
null
&&
fieldExtractorTmp
!=
null
)
{
fieldExtractor
=
fieldExtractorTmp
;
}
// ExtractBy2 & ExtractBy3
addAnnotationExtractBy2
(
clazz
,
fieldExtractor
);
addAnnotationExtractBy3
(
clazz
,
fieldExtractor
);
fieldExtractorTmp
=
getAnnotationExtractByUrl
(
clazz
,
field
);
if
(
fieldExtractor
!=
null
&&
fieldExtractorTmp
!=
null
)
{
throw
new
IllegalStateException
(
"Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"
);
}
else
if
(
fieldExtractor
==
null
&&
fieldExtractorTmp
!=
null
)
{
fieldExtractor
=
fieldExtractorTmp
;
}
if
(
fieldExtractor
!=
null
)
{
if
(!
fieldExtractor
.
isMulti
()
&&
!
String
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
fieldExtractor
.
isMulti
()
&&
!
List
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
}
}
}
}
}
private
void
getAnnotationExtractByUrl
(
Class
clazz
,
Field
field
)
{
private
FieldExtractor
getAnnotationExtractByUrl
(
Class
clazz
,
Field
field
)
{
FieldExtractor
fieldExtractor
=
null
;
ExtractByUrl
extractByUrl
=
field
.
getAnnotation
(
ExtractByUrl
.
class
);
ExtractByUrl
extractByUrl
=
field
.
getAnnotation
(
ExtractByUrl
.
class
);
if
(
extractByUrl
!=
null
)
{
if
(
extractByUrl
!=
null
)
{
if
(!
extractByUrl
.
multi
()
&&
!
String
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractByUrl
.
multi
()
&&
!
List
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
regexPattern
=
extractByUrl
.
value
();
String
regexPattern
=
extractByUrl
.
value
();
if
(
regexPattern
.
trim
().
equals
(
""
))
{
if
(
regexPattern
.
trim
().
equals
(
""
))
{
regexPattern
=
".*"
;
regexPattern
=
".*"
;
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
(),
extractByUrl
.
multi
());
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
(),
extractByUrl
.
multi
());
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
}
}
fieldExtractors
.
add
(
fieldExtractor
);
}
}
return
fieldExtractor
;
}
}
private
void
getAnnotationExtractBy
(
Class
clazz
,
Field
field
)
{
private
FieldExtractor
getAnnotationExtractBy
(
Class
clazz
,
Field
field
)
{
FieldExtractor
fieldExtractor
=
null
;
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
if
(
extractBy
!=
null
)
{
if
(
extractBy
!=
null
)
{
if
(!
extractBy
.
multi
()
&&
!
String
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractBy
.
multi
()
&&
!
List
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
value
=
extractBy
.
value
();
String
value
=
extractBy
.
value
();
Selector
selector
;
Selector
selector
;
switch
(
extractBy
.
type
())
{
switch
(
extractBy
.
type
())
{
...
@@ -99,23 +113,69 @@ class PageModelExtractor {
...
@@ -99,23 +113,69 @@ class PageModelExtractor {
default
:
default
:
selector
=
new
Xpath2Selector
(
value
);
selector
=
new
Xpath2Selector
(
value
);
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
}
}
fieldExtractors
.
add
(
fieldExtractor
);
}
}
return
fieldExtractor
;
}
}
private
void
getAnnotationExtractByRaw
(
Class
clazz
,
Field
field
)
{
/**
 * Chains the selector declared by an {@link ExtractBy2} annotation onto an
 * existing field extractor.
 * <p>
 * If the extractor's field carries {@link ExtractBy2}, a selector is built from
 * the annotation's value and type, and the extractor's current selector is
 * replaced by an {@code AndSelector} of the current selector and the new one.
 * Does nothing when {@code fieldExtractor} is {@code null} or the field is not
 * annotated with {@link ExtractBy2}.
 *
 * @param clazz          the model class being analysed (not used by this method)
 * @param fieldExtractor the extractor to extend; may be {@code null}
 */
private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
    // The caller passes null when the field has no primary extraction annotation;
    // without this guard getField() would throw a NullPointerException.
    if (fieldExtractor == null) {
        return;
    }
    ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
    if (extractBy == null) {
        return;
    }
    String value = extractBy.value();
    Selector selector;
    switch (extractBy.type()) {
        case Css:
            selector = new CssSelector(value);
            break;
        case Regex:
            selector = new RegexSelector(value);
            break;
        case XPath:
            selector = new XpathSelector(value);
            break;
        case XPath2:
        default:
            // XPath2 is also the fallback for any unrecognised type.
            selector = new Xpath2Selector(value);
            break;
    }
    fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
/**
 * Chains the selector declared by an {@link ExtractBy3} annotation onto an
 * existing field extractor.
 * <p>
 * If the extractor's field carries {@link ExtractBy3}, a selector is built from
 * the annotation's value and type, and the extractor's current selector is
 * replaced by an {@code AndSelector} of the current selector and the new one.
 * Does nothing when {@code fieldExtractor} is {@code null} or the field is not
 * annotated with {@link ExtractBy3}.
 *
 * @param clazz          the model class being analysed (not used by this method)
 * @param fieldExtractor the extractor to extend; may be {@code null}
 */
private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
    // The caller passes null when the field has no primary extraction annotation;
    // without this guard getField() would throw a NullPointerException.
    if (fieldExtractor == null) {
        return;
    }
    ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
    if (extractBy == null) {
        return;
    }
    String value = extractBy.value();
    Selector selector;
    switch (extractBy.type()) {
        case Css:
            selector = new CssSelector(value);
            break;
        case Regex:
            selector = new RegexSelector(value);
            break;
        case XPath:
            selector = new XpathSelector(value);
            break;
        case XPath2:
        default:
            // XPath2 is also the fallback for any unrecognised type.
            selector = new Xpath2Selector(value);
            break;
    }
    fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
private
FieldExtractor
getAnnotationExtractByRaw
(
Class
clazz
,
Field
field
)
{
FieldExtractor
fieldExtractor
=
null
;
ExtractByRaw
extractByRaw
=
field
.
getAnnotation
(
ExtractByRaw
.
class
);
ExtractByRaw
extractByRaw
=
field
.
getAnnotation
(
ExtractByRaw
.
class
);
if
(
extractByRaw
!=
null
)
{
if
(
extractByRaw
!=
null
)
{
if
(!
extractByRaw
.
multi
()
&&
!
String
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractByRaw
.
multi
()
&&
!
List
.
class
.
isAssignableFrom
(
field
.
getType
()))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
value
=
extractByRaw
.
value
();
String
value
=
extractByRaw
.
value
();
Selector
selector
;
Selector
selector
;
switch
(
extractByRaw
.
type
())
{
switch
(
extractByRaw
.
type
())
{
...
@@ -134,13 +194,13 @@ class PageModelExtractor {
...
@@ -134,13 +194,13 @@ class PageModelExtractor {
default
:
default
:
selector
=
new
Xpath2Selector
(
value
);
selector
=
new
Xpath2Selector
(
value
);
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
RawHtml
,
extractByRaw
.
notNull
(),
extractByRaw
.
multi
());
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
RawHtml
,
extractByRaw
.
notNull
(),
extractByRaw
.
multi
());
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
}
}
fieldExtractors
.
add
(
fieldExtractor
);
}
}
return
fieldExtractor
;
}
}
public
static
Method
getSetterMethod
(
Class
clazz
,
Field
field
)
{
public
static
Method
getSetterMethod
(
Class
clazz
,
Field
field
)
{
...
@@ -197,19 +257,19 @@ class PageModelExtractor {
...
@@ -197,19 +257,19 @@ class PageModelExtractor {
return
null
;
return
null
;
}
}
if
(
extractor
==
null
)
{
if
(
extractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
());
return
processSingle
(
page
,
page
.
getHtml
().
toString
());
}
else
{
}
else
{
if
(
extractor
.
multi
){
if
(
extractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
extractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
List
<
String
>
list
=
extractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
Object
o
=
processSingle
(
page
,
s
);
if
(
o
!=
null
)
{
if
(
o
!=
null
)
{
os
.
add
(
o
);
os
.
add
(
o
);
}
}
}
}
return
os
;
return
os
;
}
else
{
}
else
{
String
select
=
extractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
String
select
=
extractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
Object
o
=
processSingle
(
page
,
select
);
return
o
;
return
o
;
...
@@ -217,12 +277,12 @@ class PageModelExtractor {
...
@@ -217,12 +277,12 @@ class PageModelExtractor {
}
}
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
private
Object
processSingle
(
Page
page
,
String
html
)
{
Object
o
=
null
;
Object
o
=
null
;
try
{
try
{
o
=
clazz
.
newInstance
();
o
=
clazz
.
newInstance
();
for
(
FieldExtractor
fieldExtractor
:
fieldExtractors
)
{
for
(
FieldExtractor
fieldExtractor
:
fieldExtractors
)
{
if
(
fieldExtractor
.
multi
)
{
if
(
fieldExtractor
.
isMulti
()
)
{
List
<
String
>
value
;
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment