Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
b1821624
Commit
b1821624
authored
Sep 22, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add type convert
parent
fba33087
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
368 additions
and
16 deletions
+368
-16
FieldExtractor.java
...main/java/us/codecraft/webmagic/model/FieldExtractor.java
+11
-0
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+90
-10
Formatter.java
...ava/us/codecraft/webmagic/model/annotation/Formatter.java
+41
-0
BasicTypeFormatter.java
...odecraft/webmagic/model/formatter/BasicTypeFormatter.java
+150
-0
DateFormatter.java
.../us/codecraft/webmagic/model/formatter/DateFormatter.java
+29
-0
ObjectFormatter.java
...s/codecraft/webmagic/model/formatter/ObjectFormatter.java
+14
-0
ObjectFormatters.java
.../codecraft/webmagic/model/formatter/ObjectFormatters.java
+27
-0
GithubRepo.java
...src/test/java/us/codecraft/webmagic/model/GithubRepo.java
+6
-6
No files found.
webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.model.formatter.ObjectFormatter
;
import
us.codecraft.webmagic.selector.Selector
;
import
java.lang.reflect.Field
;
...
...
@@ -16,6 +17,8 @@ class FieldExtractor extends Extractor {
private
Method
setterMethod
;
/** Converter handed in through {@code setObjectFormatter}; stays {@code null} until one is set. */
private
ObjectFormatter
objectFormatter
;
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
,
boolean
multi
)
{
super
(
selector
,
source
,
notNull
,
multi
);
this
.
field
=
field
;
...
...
@@ -44,4 +47,12 @@ class FieldExtractor extends Extractor {
/**
 * Reports the {@code notNull} flag of this extractor.
 *
 * @return the current value of {@code notNull}
 */
boolean isNotNull() {
    return notNull;
}
/**
 * Returns the formatter attached to this field extractor.
 *
 * @return the formatter, or {@code null} when none has been set
 */
ObjectFormatter getObjectFormatter() {
    return this.objectFormatter;
}
/**
 * Attaches a formatter to this field extractor, replacing any previous one.
 *
 * @param objectFormatter the formatter to store
 */
void setObjectFormatter(ObjectFormatter objectFormatter) {
    this.objectFormatter = objectFormatter;
}
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.model.annotation.*
;
import
us.codecraft.webmagic.model.formatter.BasicTypeFormatter
;
import
us.codecraft.webmagic.model.formatter.ObjectFormatter
;
import
us.codecraft.webmagic.model.formatter.ObjectFormatters
;
import
us.codecraft.webmagic.selector.*
;
import
us.codecraft.webmagic.utils.ExtractorUtils
;
...
...
@@ -36,6 +40,8 @@ class PageModelExtractor {
private
Extractor
objectExtractor
;
/** Logger obtained for the runtime class of this instance. */
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
public
static
PageModelExtractor
create
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
pageModelExtractor
.
init
(
clazz
);
...
...
@@ -62,14 +68,58 @@ class PageModelExtractor {
fieldExtractor
=
fieldExtractorTmp
;
}
if
(
fieldExtractor
!=
null
)
{
checkFormat
(
field
,
fieldExtractor
);
fieldExtractors
.
add
(
fieldExtractor
);
}
}
}
/**
 * Validates the declared type of an annotated field and, when the extracted text has to
 * be converted, attaches the matching {@link ObjectFormatter} to the extractor.
 * <ul>
 * <li>A single-valued {@code String} field needs no conversion and is left untouched.</li>
 * <li>Any other single-valued field must have a formatter for its type.</li>
 * <li>A multi-valued field must be declared as a {@link List}; its elements are converted
 * only when {@link Formatter#subClazz()} names an element type.</li>
 * </ul>
 * This method only validates and configures {@code fieldExtractor}; adding it to
 * {@code fieldExtractors} is done by the caller, so it must not be added here as well.
 *
 * @param field          the model field being mapped
 * @param fieldExtractor the extractor built for {@code field}
 * @throws IllegalStateException if a multi-valued field is not a {@code List}, or if no
 *                               formatter can be found for the type to convert to
 */
private void checkFormat(Field field, FieldExtractor fieldExtractor) {
    Class<?> fieldType = field.getType();

    if (!fieldExtractor.isMulti()) {
        if (!String.class.isAssignableFrom(fieldType)) {
            // Primitive types are looked up through their wrapper class.
            Class<?> fieldClazz = BasicTypeFormatter.detectBasicClass(fieldType);
            fieldExtractor.setObjectFormatter(findFormatterOrFail(field, fieldClazz));
        }
        return;
    }

    if (!List.class.isAssignableFrom(fieldType)) {
        throw new IllegalStateException("Field " + field.getName() + " must be list");
    }
    Formatter formatter = field.getAnnotation(Formatter.class);
    // Void.class is the annotation's "not specified" marker: elements then stay strings.
    if (formatter != null && !formatter.subClazz().equals(Void.class)) {
        fieldExtractor.setObjectFormatter(findFormatterOrFail(field, formatter.subClazz()));
    }
}

/**
 * Looks up the formatter for {@code targetClazz} and fails fast when there is none.
 *
 * @param field       the field the formatter is needed for (used for lookup and messages)
 * @param targetClazz the type the extracted text must be converted to
 * @return the formatter, never {@code null}
 * @throws IllegalStateException if no formatter is available
 */
private ObjectFormatter findFormatterOrFail(Field field, Class<?> targetClazz) {
    ObjectFormatter objectFormatter = getObjectFormatter(field, targetClazz);
    if (objectFormatter == null) {
        throw new IllegalStateException(
                "Can't find formatter for field " + field.getName() + " of type " + targetClazz);
    }
    return objectFormatter;
}
/**
 * Chooses the formatter for a field.
 * <p>
 * When the field carries a {@link Formatter} annotation whose {@code formatter()} is not the
 * {@code ObjectFormatter.class} placeholder, that implementation is instantiated through
 * {@code initFormatter}. Otherwise the formatter registered in {@link ObjectFormatters} for
 * {@code fieldClazz} is used.
 *
 * @param field      the annotated model field
 * @param fieldClazz the type the extracted text should be converted to
 * @return the selected formatter; may be {@code null}
 */
private ObjectFormatter getObjectFormatter(Field field, Class<?> fieldClazz) {
    Formatter annotation = field.getAnnotation(Formatter.class);
    boolean useRegistered =
            annotation == null || annotation.formatter().equals(ObjectFormatter.class);
    if (useRegistered) {
        return ObjectFormatters.get(fieldClazz);
    }
    return initFormatter(annotation);
}
/**
 * Instantiates the formatter implementation named by {@link Formatter#formatter()} and,
 * when the annotation carries parameters, hands them to the new instance through
 * {@link ObjectFormatter#initParam(String[])}.
 * <p>
 * An empty parameter array is not passed on, so the formatter keeps its own defaults.
 *
 * @param formatter the annotation describing which formatter to create
 * @return the new formatter, or {@code null} when the class could not be instantiated
 *         (the failure is logged)
 */
private ObjectFormatter initFormatter(Formatter formatter) {
    try {
        ObjectFormatter objectFormatter = formatter.formatter().newInstance();
        String[] params = formatter.value();
        if (params != null && params.length > 0) {
            objectFormatter.initParam(params);
        }
        return objectFormatter;
    } catch (InstantiationException e) {
        logger.error("init ObjectFormatter fail", e);
    } catch (IllegalAccessException e) {
        logger.error("init ObjectFormatter fail", e);
    }
    return null;
}
private
FieldExtractor
getAnnotationExtractByUrl
(
Class
clazz
,
Field
field
)
{
...
...
@@ -231,7 +281,12 @@ class PageModelExtractor {
if
((
value
==
null
||
value
.
size
()
==
0
)
&&
fieldExtractor
.
isNotNull
())
{
return
null
;
}
if
(
fieldExtractor
.
getObjectFormatter
()
!=
null
)
{
List
<
Object
>
converted
=
convert
(
value
,
fieldExtractor
.
getObjectFormatter
());
setField
(
o
,
fieldExtractor
,
converted
);
}
else
{
setField
(
o
,
fieldExtractor
,
value
);
}
}
else
{
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
...
...
@@ -254,22 +309,47 @@ class PageModelExtractor {
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
())
{
return
null
;
}
if
(
fieldExtractor
.
getObjectFormatter
()
!=
null
)
{
Object
converted
=
convert
(
value
,
fieldExtractor
.
getObjectFormatter
());
setField
(
o
,
fieldExtractor
,
converted
);
}
else
{
setField
(
o
,
fieldExtractor
,
value
);
}
}
}
if
(
AfterExtractor
.
class
.
isAssignableFrom
(
clazz
))
{
((
AfterExtractor
)
o
).
afterProcess
(
page
);
}
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
(
);
logger
.
error
(
"extract fail"
,
e
);
}
catch
(
IllegalAccessException
e
)
{
e
.
printStackTrace
(
);
logger
.
error
(
"extract fail"
,
e
);
}
catch
(
InvocationTargetException
e
)
{
e
.
printStackTrace
(
);
logger
.
error
(
"extract fail"
,
e
);
}
return
o
;
}
/**
 * Runs a single extracted string through the formatter.
 * <p>
 * Any exception raised by the formatter is logged and swallowed.
 *
 * @param value           the extracted text
 * @param objectFormatter the formatter to apply
 * @return the formatted object, or {@code null} when formatting failed
 */
private Object convert(String value, ObjectFormatter objectFormatter) {
    Object result = null;
    try {
        result = objectFormatter.format(value);
    } catch (Exception e) {
        logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
    }
    return result;
}
/**
 * Converts every extracted string with the given formatter.
 * <p>
 * Elements whose conversion yields {@code null} are dropped, so the result may be shorter
 * than the input.
 *
 * @param values          the extracted strings; may be {@code null}
 * @param objectFormatter the formatter applied to each element
 * @return the converted elements in input order, or {@code null} when {@code values} is
 *         {@code null} (mirrors the unformatted path, which passes the list through as is)
 */
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
    if (values == null) {
        // Nothing was extracted; iterating would throw a NullPointerException.
        return null;
    }
    List<Object> objects = new ArrayList<Object>(values.size());
    for (String value : values) {
        Object converted = convert(value, objectFormatter);
        if (converted != null) {
            objects.add(converted);
        }
    }
    return objects;
}
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
Object
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java
0 → 100644
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
.
annotation
;
import
us.codecraft.webmagic.model.formatter.ObjectFormatter
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Defines how the extracted string is converted to an object for a field.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface Formatter {

    /**
     * Parameters for the formatter. Optional; defaults to no parameters.
     *
     * @return formatter parameters
     */
    String[] value() default {};

    /**
     * The class of the elements when the field is a collection.
     * <p>
     * For a plain field this does not need to be set, because the class can be detected
     * from the field itself. The default {@code Void.class} means "not specified".
     *
     * @return the element class, or {@code Void.class} when not specified
     */
    Class<?> subClazz() default Void.class;

    /**
     * The formatter implementation to use, for cases where more than one formatter exists
     * for a class. The default {@code ObjectFormatter.class} means "not specified".
     *
     * @return the formatter implementation
     */
    Class<? extends ObjectFormatter> formatter() default ObjectFormatter.class;
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
0 → 100644
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
.
formatter
;
import
java.util.Arrays
;
import
java.util.List
;
/**
 * Base class of the formatters for the boxed basic types. It trims the raw text and
 * leaves the actual parsing to {@link #formatTrimmed(String)}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {

    /** Instances of all formatters declared in this class. */
    public static final List<ObjectFormatter> basicTypeFormatters = Arrays.<ObjectFormatter>asList(
            new IntegerFormatter(),
            new LongFormatter(),
            new DoubleFormatter(),
            new FloatFormatter(),
            new ShortFormatter(),
            new CharactorFormatter(),
            new ByteFormatter(),
            new BooleanFormatter());

    /**
     * Maps a primitive type to its wrapper class.
     *
     * @param type the type to inspect
     * @return the wrapper class for the eight value primitives, otherwise {@code type} itself
     */
    public static Class<?> detectBasicClass(Class<?> type) {
        if (!type.isPrimitive()) {
            // Wrapper classes and every other reference type map to themselves.
            return type;
        }
        if (type == Integer.TYPE) {
            return Integer.class;
        }
        if (type == Long.TYPE) {
            return Long.class;
        }
        if (type == Double.TYPE) {
            return Double.class;
        }
        if (type == Float.TYPE) {
            return Float.class;
        }
        if (type == Short.TYPE) {
            return Short.class;
        }
        if (type == Character.TYPE) {
            return Character.class;
        }
        if (type == Byte.TYPE) {
            return Byte.class;
        }
        if (type == Boolean.TYPE) {
            return Boolean.class;
        }
        return type;
    }

    /** Does nothing: the parameters are ignored. */
    @Override
    public void initParam(String[] extra) {
    }

    /**
     * Trims {@code raw} and parses it.
     *
     * @param raw the text to convert; may be {@code null}
     * @return the parsed value, or {@code null} when {@code raw} is {@code null}
     * @throws Exception if the trimmed text cannot be parsed
     */
    @Override
    public T format(String raw) throws Exception {
        return raw == null ? null : formatTrimmed(raw.trim());
    }

    /**
     * Parses text that has already been trimmed.
     *
     * @param raw the trimmed, non-null text
     * @return the parsed value
     * @throws Exception if the text cannot be parsed
     */
    protected abstract T formatTrimmed(String raw) throws Exception;

    /** Parses decimal {@code int} values. */
    public static class IntegerFormatter extends BasicTypeFormatter<Integer> {

        @Override
        public Integer formatTrimmed(String raw) throws Exception {
            return Integer.valueOf(raw);
        }

        @Override
        public Class<Integer> clazz() {
            return Integer.class;
        }
    }

    /** Parses decimal {@code long} values. */
    public static class LongFormatter extends BasicTypeFormatter<Long> {

        @Override
        public Long formatTrimmed(String raw) throws Exception {
            return Long.valueOf(raw);
        }

        @Override
        public Class<Long> clazz() {
            return Long.class;
        }
    }

    /** Parses {@code double} values. */
    public static class DoubleFormatter extends BasicTypeFormatter<Double> {

        @Override
        public Double formatTrimmed(String raw) throws Exception {
            return Double.valueOf(raw);
        }

        @Override
        public Class<Double> clazz() {
            return Double.class;
        }
    }

    /** Parses {@code float} values. */
    public static class FloatFormatter extends BasicTypeFormatter<Float> {

        @Override
        public Float formatTrimmed(String raw) throws Exception {
            return Float.valueOf(raw);
        }

        @Override
        public Class<Float> clazz() {
            return Float.class;
        }
    }

    /** Parses decimal {@code short} values. */
    public static class ShortFormatter extends BasicTypeFormatter<Short> {

        @Override
        public Short formatTrimmed(String raw) throws Exception {
            return Short.valueOf(raw);
        }

        @Override
        public Class<Short> clazz() {
            return Short.class;
        }
    }

    /** Takes the first character of the trimmed text. */
    public static class CharactorFormatter extends BasicTypeFormatter<Character> {

        @Override
        public Character formatTrimmed(String raw) throws Exception {
            return Character.valueOf(raw.charAt(0));
        }

        @Override
        public Class<Character> clazz() {
            return Character.class;
        }
    }

    /** Parses decimal {@code byte} values. */
    public static class ByteFormatter extends BasicTypeFormatter<Byte> {

        @Override
        public Byte formatTrimmed(String raw) throws Exception {
            return Byte.valueOf(raw);
        }

        @Override
        public Class<Byte> clazz() {
            return Byte.class;
        }
    }

    /** Parses {@code boolean} values using the rules of {@link Boolean#parseBoolean(String)}. */
    public static class BooleanFormatter extends BasicTypeFormatter<Boolean> {

        @Override
        public Boolean formatTrimmed(String raw) throws Exception {
            return Boolean.valueOf(raw);
        }

        @Override
        public Class<Boolean> clazz() {
            return Boolean.class;
        }
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java
0 → 100644
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
.
formatter
;
import
org.apache.commons.lang3.time.DateUtils
;
import
java.util.Date
;
/**
 * Converts text to a {@link Date} using one or more date patterns.
 * <p>
 * The patterns default to {@code yyyy-MM-dd HH:mm} and can be replaced through
 * {@link #initParam(String[])}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class DateFormatter implements ObjectFormatter<Date> {

    // Lower-case "yyyy" is the calendar year; upper-case "YYYY" would be the week year.
    private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"};

    /**
     * Parses {@code raw} against the configured patterns.
     *
     * @param raw the text to parse; surrounding whitespace is ignored
     * @return the parsed date, or {@code null} when {@code raw} is {@code null}
     * @throws Exception if the text matches none of the patterns
     */
    @Override
    public Date format(String raw) throws Exception {
        if (raw == null) {
            // Same contract as BasicTypeFormatter: no input, no value.
            return null;
        }
        return DateUtils.parseDate(raw.trim(), datePatterns);
    }

    @Override
    public Class<Date> clazz() {
        return Date.class;
    }

    /**
     * Replaces the date patterns.
     *
     * @param extra the patterns to use; {@code null} or an empty array keeps the current ones
     */
    @Override
    public void initParam(String[] extra) {
        if (extra != null && extra.length > 0) {
            // Copy so later changes to the caller's array cannot alter this formatter.
            datePatterns = extra.clone();
        }
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java
0 → 100644
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
.
formatter
;
/**
 * Converts an extracted string into an object of type {@code T}.
 *
 * @param <T> the type produced by this formatter
 * @author code4crafter@gmail.com
 */
public interface ObjectFormatter<T> {

    /**
     * Converts the raw text.
     *
     * @param raw the text to convert
     * @return the converted object
     * @throws Exception if the text cannot be converted
     */
    T format(String raw) throws Exception;

    /**
     * @return the class of the objects this formatter produces
     */
    Class<T> clazz();

    /**
     * Supplies extra parameters to the formatter.
     *
     * @param extra the parameters
     */
    void initParam(String[] extra);
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
0 → 100644
View file @
b1821624
package
us
.
codecraft
.
webmagic
.
model
.
formatter
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Registry of {@link ObjectFormatter} instances, keyed by the class each one produces.
 * The formatters listed in {@link BasicTypeFormatter#basicTypeFormatters} are registered
 * when this class is loaded.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class ObjectFormatters {

    // The reference never changes; only the map contents do.
    private static final Map<Class, ObjectFormatter> formatterMap =
            new ConcurrentHashMap<Class, ObjectFormatter>();

    static {
        for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
            put(basicTypeFormatter);
        }
    }

    /**
     * Registers a formatter under the class returned by its {@code clazz()} method,
     * replacing any formatter registered for that class before.
     *
     * @param objectFormatter the formatter to register
     */
    public static void put(ObjectFormatter objectFormatter) {
        formatterMap.put(objectFormatter.clazz(), objectFormatter);
    }

    /**
     * Looks up the formatter registered for a class.
     *
     * @param clazz the class to convert to
     * @param <T>   the type produced by the formatter
     * @return the registered formatter, or {@code null} when there is none
     */
    @SuppressWarnings("unchecked") // entries are stored under the class reported by clazz()
    public static <T> ObjectFormatter<T> get(Class<T> clazz) {
        return (ObjectFormatter<T>) formatterMap.get(clazz);
    }
}
webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
View file @
b1821624
...
...
@@ -32,10 +32,10 @@ public class GithubRepo implements HasKey {
private
List
<
String
>
language
;
@ExtractBy
(
"//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()"
)
private
String
star
;
private
int
star
;
@ExtractBy
(
"//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()"
)
private
String
fork
;
private
int
fork
;
@ExtractByUrl
private
String
url
;
...
...
@@ -46,8 +46,8 @@ public class GithubRepo implements HasKey {
,
new
PageModelPipeline
<
GithubRepo
>()
{
@Override
public
void
process
(
GithubRepo
o
,
Task
task
)
{
Assert
.
assertEquals
(
"78"
,
o
.
getStar
().
trim
());
Assert
.
assertEquals
(
"65"
,
o
.
getFork
().
trim
());
Assert
.
assertEquals
(
78
,
o
.
getStar
());
Assert
.
assertEquals
(
65
,
o
.
getFork
());
}
},
GithubRepo
.
class
).
setDownloader
(
new
MockDownloader
()).
test
(
"https://github.com/code4craft/webmagic"
);
}
...
...
@@ -77,11 +77,11 @@ public class GithubRepo implements HasKey {
return
url
;
}
public
String
getStar
()
{
public
int
getStar
()
{
return
star
;
}
public
String
getFork
()
{
public
int
getFork
()
{
return
fork
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment