Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
250cc5e6
Commit
250cc5e6
authored
Sep 23, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
change formatter to class
parent
b1821624
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
54 additions
and
39 deletions
+54
-39
GithubRepo.java
...c/main/java/us/codecraft/webmagic/example/GithubRepo.java
+4
-20
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+4
-4
BasicTypeFormatter.java
...odecraft/webmagic/model/formatter/BasicTypeFormatter.java
+3
-3
ObjectFormatters.java
.../codecraft/webmagic/model/formatter/ObjectFormatters.java
+12
-5
GithubRepoTest.java
...test/java/us/codecraft/webmagic/model/GithubRepoTest.java
+26
-0
OschinaBlog.java
...java/us/codecraft/webmagic/model/samples/OschinaBlog.java
+5
-7
No files found.
webmagic-extension/src/
test/java/us/codecraft/webmagic/model
/GithubRepo.java
→
webmagic-extension/src/
main/java/us/codecraft/webmagic/example
/GithubRepo.java
View file @
250cc5e6
package
us
.
codecraft
.
webmagic
.
model
;
package
us
.
codecraft
.
webmagic
.
example
;
import
junit.framework.Assert
;
import
org.junit.Test
;
import
us.codecraft.webmagic.MockDownloader
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.model.HasKey
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.model.annotation.ExtractByUrl
;
import
us.codecraft.webmagic.model.annotation.HelpUrl
;
...
...
@@ -25,10 +21,10 @@ public class GithubRepo implements HasKey {
@ExtractByUrl
(
"https://github\\.com/(\\w+)/.*"
)
private
String
author
;
@ExtractBy
(
"//div[@id='readme']"
)
@ExtractBy
(
"//div[@id='readme']
/tidyText()
"
)
private
String
readme
;
@ExtractBy
(
value
=
"//div[@class='repository-lang-stats']//li//span[@class='lang']"
,
multi
=
true
)
@ExtractBy
(
value
=
"//div[@class='repository-lang-stats']//li//span[@class='lang']
/text()
"
,
multi
=
true
)
private
List
<
String
>
language
;
@ExtractBy
(
"//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()"
)
...
...
@@ -40,18 +36,6 @@ public class GithubRepo implements HasKey {
@ExtractByUrl
private
String
url
;
@Test
public
void
test
()
{
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"https://github.com/code4craft/webmagic"
).
setSleepTime
(
0
)
,
new
PageModelPipeline
<
GithubRepo
>()
{
@Override
public
void
process
(
GithubRepo
o
,
Task
task
)
{
Assert
.
assertEquals
(
78
,
o
.
getStar
());
Assert
.
assertEquals
(
65
,
o
.
getFork
());
}
},
GithubRepo
.
class
).
setDownloader
(
new
MockDownloader
()).
test
(
"https://github.com/code4craft/webmagic"
);
}
@Override
public
String
key
()
{
return
author
+
":"
+
name
;
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
250cc5e6
...
...
@@ -105,15 +105,15 @@ class PageModelExtractor {
Formatter
formatter
=
field
.
getAnnotation
(
Formatter
.
class
);
if
(
formatter
!=
null
)
{
if
(!
formatter
.
formatter
().
equals
(
ObjectFormatter
.
class
))
{
return
initFormatter
(
formatter
);
return
initFormatter
(
formatter
.
formatter
()
);
}
}
return
ObjectFormatters
.
get
(
fieldClazz
);
return
initFormatter
(
ObjectFormatters
.
get
(
fieldClazz
)
);
}
private
ObjectFormatter
initFormatter
(
Formatter
formatter
)
{
private
ObjectFormatter
initFormatter
(
Class
<?
extends
ObjectFormatter
>
formatterClazz
)
{
try
{
return
formatter
.
formatter
()
.
newInstance
();
return
formatter
Clazz
.
newInstance
();
}
catch
(
InstantiationException
e
)
{
logger
.
error
(
"init ObjectFormatter fail"
,
e
);
}
catch
(
IllegalAccessException
e
)
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
View file @
250cc5e6
...
...
@@ -25,9 +25,9 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
protected
abstract
T
formatTrimmed
(
String
raw
)
throws
Exception
;
public
static
final
List
<
ObjectFormatter
>
basicTypeFormatters
=
Arrays
.<
ObjectFormatter
>
asList
(
new
IntegerFormatter
()
,
new
LongFormatter
(),
new
DoubleFormatter
(),
new
FloatFormatter
(),
new
ShortFormatter
()
,
new
CharactorFormatter
(),
new
ByteFormatter
(),
new
BooleanFormatter
()
);
public
static
final
List
<
Class
<?
extends
ObjectFormatter
>>
basicTypeFormatters
=
Arrays
.<
Class
<?
extends
ObjectFormatter
>>
asList
(
IntegerFormatter
.
class
,
LongFormatter
.
class
,
DoubleFormatter
.
class
,
FloatFormatter
.
class
,
ShortFormatter
.
class
,
CharactorFormatter
.
class
,
ByteFormatter
.
class
,
BooleanFormatter
.
class
);
public
static
Class
<?>
detectBasicClass
(
Class
<?>
type
)
{
if
(
type
.
equals
(
Integer
.
TYPE
)
||
type
.
equals
(
Integer
.
class
))
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
View file @
250cc5e6
...
...
@@ -9,19 +9,26 @@ import java.util.concurrent.ConcurrentHashMap;
*/
public
class
ObjectFormatters
{
private
static
Map
<
Class
,
ObjectFormatter
>
formatterMap
=
new
ConcurrentHashMap
<
Class
,
ObjectFormatter
>();
private
static
Map
<
Class
,
Class
<?
extends
ObjectFormatter
>>
formatterMap
=
new
ConcurrentHashMap
<
Class
,
Class
<?
extends
ObjectFormatter
>
>();
static
{
for
(
ObjectFormatter
basicTypeFormatter
:
BasicTypeFormatter
.
basicTypeFormatters
)
{
for
(
Class
<?
extends
ObjectFormatter
>
basicTypeFormatter
:
BasicTypeFormatter
.
basicTypeFormatters
)
{
put
(
basicTypeFormatter
);
}
put
(
DateFormatter
.
class
);
}
public
static
void
put
(
ObjectFormatter
objectFormatter
)
{
formatterMap
.
put
(
objectFormatter
.
clazz
(),
objectFormatter
);
/**
 * Registers a formatter class in {@code formatterMap}.
 *
 * <p>The class is instantiated once through its no-argument constructor solely to ask the
 * instance for {@code clazz()}; that value becomes the map key and the formatter class itself
 * (not the instance) becomes the value.
 *
 * <p>If instantiation fails, the stack trace is printed and nothing is registered.
 *
 * @param objectFormatter formatter class to register
 */
public static void put(Class<? extends ObjectFormatter> objectFormatter) {
    try {
        // A throwaway instance is needed because the key is only exposed by an instance method.
        ObjectFormatter probe = objectFormatter.newInstance();
        formatterMap.put(probe.clazz(), objectFormatter);
    } catch (InstantiationException e) {
        e.printStackTrace();
    } catch (IllegalAccessException e) {
        e.printStackTrace();
    }
}
public
static
<
T
>
ObjectFormatter
<
T
>
get
(
Class
<
T
>
clazz
){
public
static
Class
<?
extends
ObjectFormatter
>
get
(
Class
<?
>
clazz
){
return
formatterMap
.
get
(
clazz
);
}
}
webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
0 → 100644
View file @
250cc5e6
package
us
.
codecraft
.
webmagic
.
model
;
import
junit.framework.Assert
;
import
org.junit.Test
;
import
us.codecraft.webmagic.MockDownloader
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.example.GithubRepo
;
/**
 * Drives the {@link GithubRepo} page model through {@link OOSpider} with a
 * {@link MockDownloader} installed, and checks the extracted star and fork counts.
 *
 * @author code4crafter@gmail.com <br>
 */
public class GithubRepoTest {

    @Test
    public void test() {
        final String repoUrl = "https://github.com/code4craft/webmagic";
        Site site = Site.me().addStartUrl(repoUrl).setSleepTime(0);

        // The assertions live inside the pipeline callback, so they only run
        // when the spider actually hands a GithubRepo model to the pipeline.
        PageModelPipeline<GithubRepo> countCheck = new PageModelPipeline<GithubRepo>() {
            @Override
            public void process(GithubRepo o, Task task) {
                Assert.assertEquals(78, o.getStar());
                Assert.assertEquals(65, o.getFork());
            }
        };

        OOSpider.create(site, countCheck, GithubRepo.class)
                .setDownloader(new MockDownloader())
                .test(repoUrl);
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
View file @
250cc5e6
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.model.HasKey
;
import
us.codecraft.webmagic.model.OOSpider
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.model.annotation.TargetUrl
;
import
us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline
;
import
java.util.Date
;
import
java.util.List
;
/**
* @author code4crafter@gmail.com <br>
*/
@TargetUrl
(
"http://my.oschina.net/flashsword/blog/\\d+"
)
public
class
OschinaBlog
implements
HasKey
{
public
class
OschinaBlog
{
@ExtractBy
(
"//title"
)
private
String
title
;
...
...
@@ -24,16 +24,14 @@ public class OschinaBlog implements HasKey{
@ExtractBy
(
value
=
"//div[@class='BlogTags']/a/text()"
,
multi
=
true
)
private
List
<
String
>
tags
;
// Date taken from the element whose class attribute is 'BlogStat', narrowed by the regex to a
// "yyyy-M-d H:m" shaped substring. The attribute test must be written "@class": a bare "class"
// in an XPath predicate refers to a child element named class, not the attribute, so the
// original expression could not select the intended div.
@ExtractBy("//div[@class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')")
private Date date;
/**
 * Builds an {@link OOSpider} for {@link OschinaBlog} with a single start URL and a
 * {@link JsonFilePageModelPipeline}, then runs it.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    Site startSite = Site.me().addStartUrl("http://my.oschina.net/flashsword/blog");
    JsonFilePageModelPipeline jsonPipeline = new JsonFilePageModelPipeline();
    OOSpider.create(startSite, jsonPipeline, OschinaBlog.class).run();
}
@Override
public
String
key
()
{
return
title
;
}
/**
 * @return the value of the {@code title} field
 */
public String getTitle() {
    return this.title;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment