Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
7a4dbb1f
Commit
7a4dbb1f
authored
Aug 02, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
invite notnull
parent
06a39af0
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
95 additions
and
13 deletions
+95
-13
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+5
-0
ExtractBy.java
...main/java/us/codecraft/webmagic/annotation/ExtractBy.java
+2
-0
ExtractByUrl.java
...n/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
+3
-1
FieldExtractor.java
...java/us/codecraft/webmagic/annotation/FieldExtractor.java
+8
-7
HelpUrl.java
...c/main/java/us/codecraft/webmagic/annotation/HelpUrl.java
+17
-0
ObjectPageProcessor.java
...us/codecraft/webmagic/annotation/ObjectPageProcessor.java
+4
-0
PageModelExtractor.java
.../us/codecraft/webmagic/annotation/PageModelExtractor.java
+22
-5
IteyeBlog.java
...a/us/codecraft/webmagic/annotation/samples/IteyeBlog.java
+34
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
7a4dbb1f
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.annotation.ObjectPageProcessor
;
import
us.codecraft.webmagic.downloader.Destroyable
;
import
us.codecraft.webmagic.downloader.Destroyable
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
...
@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
...
@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
return
new
Spider
(
pageProcessor
);
return
new
Spider
(
pageProcessor
);
}
}
/**
 * Creates a spider driven by annotated page model classes.
 * <p>
 * The site and the model classes are handed to {@code ObjectPageProcessor.create}, and the
 * processor it returns is wrapped in a new {@code Spider}.
 *
 * @param site       the site configuration passed on to {@code ObjectPageProcessor.create}
 * @param pageModels the page model classes passed on to {@code ObjectPageProcessor.create}
 * @return a new spider backed by the created processor
 */
public static Spider create(Site site, Class... pageModels) {
    return new Spider(
            ObjectPageProcessor.create(site, pageModels));
}
/**
/**
* 重新设置startUrls,会覆盖Site本身的startUrls。
* 重新设置startUrls,会覆盖Site本身的startUrls。
*
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java
View file @
7a4dbb1f
...
@@ -18,4 +18,6 @@ public @interface ExtractBy {
...
@@ -18,4 +18,6 @@ public @interface ExtractBy {
public
enum
Type
{
XPath
,
Regex
,
Css
};
public
enum
Type
{
XPath
,
Regex
,
Css
};
Type
type
()
default
Type
.
XPath
;
Type
type
()
default
Type
.
XPath
;
/**
 * Whether the annotated field is required to have an extracted value.
 * <p>
 * {@code PageModelExtractor} copies this flag into the field's {@code FieldExtractor}; when the
 * flag is set and the extracted value is {@code null}, the page's result items are marked as
 * skipped.
 *
 * @return {@code true} (the default) if a {@code null} extraction result should skip the page
 */
boolean notNull() default true;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
View file @
7a4dbb1f
...
@@ -11,8 +11,10 @@ import java.lang.annotation.Target;
...
@@ -11,8 +11,10 @@ import java.lang.annotation.Target;
*/
*/
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Target
({
ElementType
.
FIELD
})
@Target
({
ElementType
.
FIELD
})
public
@interface
ExtractByUrl
{
public
@interface
ExtractByUrl
{
String
value
()
default
""
;
String
value
()
default
""
;
/**
 * Whether the annotated field is required to have a value extracted from the URL.
 * <p>
 * {@code PageModelExtractor} copies this flag into the field's {@code FieldExtractor}; when the
 * flag is set and the extracted value is {@code null}, the page's result items are marked as
 * skipped.
 *
 * @return {@code true} (the default) if a {@code null} extraction result should skip the page
 */
boolean notNull() default true;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
View file @
7a4dbb1f
...
@@ -20,18 +20,15 @@ class FieldExtractor {
...
@@ -20,18 +20,15 @@ class FieldExtractor {
private
Method
setterMethod
;
private
Method
setterMethod
;
static
enum
Source
{
Html
,
Url
}
private
final
boolean
notNull
;
public
FieldExtractor
(
Field
field
,
Selector
selector
)
{
static
enum
Source
{
Html
,
Url
}
this
.
field
=
field
;
this
.
selector
=
selector
;
this
.
source
=
Source
.
Html
;
}
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
)
{
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
)
{
this
.
field
=
field
;
this
.
field
=
field
;
this
.
selector
=
selector
;
this
.
selector
=
selector
;
this
.
source
=
source
;
this
.
source
=
source
;
this
.
notNull
=
notNull
;
}
}
Field
getField
()
{
Field
getField
()
{
...
@@ -53,4 +50,8 @@ class FieldExtractor {
...
@@ -53,4 +50,8 @@ class FieldExtractor {
Method
getSetterMethod
()
{
Method
getSetterMethod
()
{
return
setterMethod
;
return
setterMethod
;
}
}
/**
 * Reports the {@code notNull} flag this extractor was constructed with.
 *
 * @return {@code true} if the field was declared as requiring a non-null extracted value
 */
boolean isNotNull() {
    return this.notNull;
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java
0 → 100644
View file @
7a4dbb1f
package
us
.
codecraft
.
webmagic
.
annotation
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Declares extra URL patterns on a page model class.
 * <p>
 * {@code PageModelExtractor} reads this annotation from the model class and compiles each entry
 * into a regular expression after escaping {@code .} and expanding {@code *} to
 * {@code [^"'#]*}. {@code ObjectPageProcessor} then adds the compiled patterns to the same set
 * that holds the model's target URL patterns.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface HelpUrl {

    /**
     * The URL patterns; {@code *} acts as a wildcard and {@code .} is matched literally.
     *
     * @return the declared patterns
     */
    String[] value();
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
View file @
7a4dbb1f
...
@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
...
@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
targetUrlPatterns
=
new
HashSet
<
Pattern
>();
targetUrlPatterns
=
new
HashSet
<
Pattern
>();
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getTargetUrlPatterns
());
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getTargetUrlPatterns
());
targetUrlPatterns
.
addAll
(
pageModelExtractor
.
getHelpUrlPatterns
());
}
}
}
}
...
@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
...
@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
for
(
PageModelExtractor
pageModelExtractor
:
pageModelExtractorList
)
{
Object
process
=
pageModelExtractor
.
process
(
page
);
Object
process
=
pageModelExtractor
.
process
(
page
);
if
(
process
==
null
){
page
.
getResultItems
().
setSkip
(
true
);
}
postProcessPageModel
(
pageModelExtractor
.
getClazz
(),
process
);
postProcessPageModel
(
pageModelExtractor
.
getClazz
(),
process
);
page
.
putField
(
pageModelExtractor
.
getClazz
().
getCanonicalName
(),
process
);
page
.
putField
(
pageModelExtractor
.
getClazz
().
getCanonicalName
(),
process
);
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
View file @
7a4dbb1f
...
@@ -24,6 +24,8 @@ class PageModelExtractor {
...
@@ -24,6 +24,8 @@ class PageModelExtractor {
private
List
<
Pattern
>
targetUrlPatterns
;
private
List
<
Pattern
>
targetUrlPatterns
;
private
List
<
Pattern
>
helpUrlPatterns
;
private
Class
clazz
;
private
Class
clazz
;
private
List
<
FieldExtractor
>
fieldExtractors
;
private
List
<
FieldExtractor
>
fieldExtractors
;
...
@@ -57,7 +59,7 @@ class PageModelExtractor {
...
@@ -57,7 +59,7 @@ class PageModelExtractor {
default
:
default
:
selector
=
new
XpathSelector
(
value
);
selector
=
new
XpathSelector
(
value
);
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
);
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
@@ -70,7 +72,7 @@ class PageModelExtractor {
...
@@ -70,7 +72,7 @@ class PageModelExtractor {
if
(
regexPattern
.
trim
().
equals
(
""
))
{
if
(
regexPattern
.
trim
().
equals
(
""
))
{
regexPattern
=
".*"
;
regexPattern
=
".*"
;
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
);
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
@@ -102,6 +104,14 @@ class PageModelExtractor {
...
@@ -102,6 +104,14 @@ class PageModelExtractor {
targetUrlPatterns
.
add
(
Pattern
.
compile
(
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)));
targetUrlPatterns
.
add
(
Pattern
.
compile
(
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)));
}
}
}
}
helpUrlPatterns
=
new
ArrayList
<
Pattern
>();
annotation
=
clazz
.
getAnnotation
(
HelpUrl
.
class
);
if
(
annotation
!=
null
)
{
String
[]
value
=
((
HelpUrl
)
annotation
).
value
();
for
(
String
s
:
value
)
{
helpUrlPatterns
.
add
(
Pattern
.
compile
(
s
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)));
}
}
}
}
public
Object
process
(
Page
page
)
{
public
Object
process
(
Page
page
)
{
...
@@ -129,7 +139,10 @@ class PageModelExtractor {
...
@@ -129,7 +139,10 @@ class PageModelExtractor {
default
:
default
:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
}
}
setField
(
o
,
fieldExtractor
,
value
);
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
()){
page
.
getResultItems
().
setSkip
(
true
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
}
}
catch
(
InstantiationException
e
)
{
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
@@ -142,8 +155,8 @@ class PageModelExtractor {
...
@@ -142,8 +155,8 @@ class PageModelExtractor {
}
}
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
String
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
String
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
}
}
fieldExtractor
.
getField
().
set
(
o
,
value
);
fieldExtractor
.
getField
().
set
(
o
,
value
);
}
}
...
@@ -155,4 +168,8 @@ class PageModelExtractor {
...
@@ -155,4 +168,8 @@ class PageModelExtractor {
List
<
Pattern
>
getTargetUrlPatterns
()
{
List
<
Pattern
>
getTargetUrlPatterns
()
{
return
targetUrlPatterns
;
return
targetUrlPatterns
;
}
}
/**
 * Returns the patterns compiled from the model class's {@code HelpUrl} annotation.
 * <p>
 * The list is empty when the class carries no {@code HelpUrl} annotation. The internal list
 * itself is returned, not a copy.
 *
 * @return the help URL patterns of this page model
 */
List<Pattern> getHelpUrlPatterns() {
    return this.helpUrlPatterns;
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java
0 → 100644
View file @
7a4dbb1f
package
us
.
codecraft
.
webmagic
.
annotation
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.annotation.ExtractBy
;
import
us.codecraft.webmagic.annotation.TargetUrl
;
/**
 * Sample page model for blog posts under {@code http://dengminhui.iteye.com/blog/}.
 * <p>
 * The title is extracted with the XPath {@code //title} and the content with the CSS selector
 * {@code div#blog_content}.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-2 <br>
 * Time: 7:52 AM <br>
 */
@TargetUrl("http://dengminhui.iteye.com/blog/*")
public class IteyeBlog {

    /** Page title, selected with the default extraction type (XPath). */
    @ExtractBy("//title")
    private String title;

    /** Post body, selected with a CSS selector. */
    @ExtractBy(value = "div#blog_content", type = ExtractBy.Type.Css)
    private String content;

    /**
     * Renders both extracted fields for debugging output.
     *
     * @return a string of the form {@code IteyeBlog{title='...', content='...'}}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("IteyeBlog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }

    /**
     * Runs a spider over the blog, starting from its index page, using this class as the
     * page model.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Site blogSite = Site.me().addStartUrl("http://dengminhui.iteye.com/blog");
        Spider blogSpider = Spider.create(blogSite, IteyeBlog.class);
        blogSpider.run();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment