Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
65518f76
Commit
65518f76
authored
Aug 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add list support
parent
d4de60a5
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
112 additions
and
55 deletions
+112
-55
ExtractBy.java
...ore/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
+6
-5
ExtractByUrl.java
.../src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
+2
-0
Extractor.java
...ore/src/main/java/us/codecraft/webmagic/oo/Extractor.java
+40
-0
FieldExtractor.java
...rc/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
+3
-13
OOSpider.java
...core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
+0
-5
PageModelExtractor.java
...ain/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+54
-29
OschinaBlog.java
...e/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
+6
-1
TestFetcher.java
...e/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
+1
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
View file @
65518f76
...
...
@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
* Time: 下午8:40 <br>
*/
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Target
({
ElementType
.
FIELD
})
@Target
({
ElementType
.
FIELD
,
ElementType
.
TYPE
})
public
@interface
ExtractBy
{
//TODO: add list support
String
value
();
public
enum
Type
{
XPath
,
Regex
,
Css
};
public
enum
Type
{
XPath
2
,
XPath
,
Regex
,
Css
}
Type
type
()
default
Type
.
XPath
;
Type
type
()
default
Type
.
XPath
2
;
boolean
notNull
()
default
true
;
boolean
multi
()
default
false
;
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
View file @
65518f76
...
...
@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
boolean
notNull
()
default
true
;
boolean
multi
()
default
false
;
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java
0 → 100644
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
import
us.codecraft.webmagic.selector.Selector
;
/**
 * Base holder for the settings of a single extraction rule: the selector to apply,
 * the source it is applied to, whether a result is required, and whether the rule
 * yields a list of values instead of a single value.
 * <p>
 * All fields are {@code final} and assigned once in the constructor.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class Extractor {

    /** Selector used to pull the value(s) out of the chosen source. */
    protected final Selector selector;

    /** Which source the selector is applied to. */
    protected final Source source;

    /** Whether a missing result is considered a failure by the caller. */
    protected final boolean notNull;

    /** Whether the rule produces a list of values rather than a single value. */
    protected final boolean multi;

    /**
     * The source a selector is applied to.
     * A nested enum is implicitly static, so no explicit modifier is needed.
     */
    enum Source {
        Html,
        Url
    }

    /**
     * Creates an extractor description.
     *
     * @param selector selector to apply to the source
     * @param source   source the selector is applied to
     * @param notNull  {@code true} if a result is required
     * @param multi    {@code true} if the rule extracts a list of values
     */
    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
        this.selector = selector;
        this.source = source;
        this.notNull = notNull;
        this.multi = multi;
    }

    /**
     * @return the selector given at construction time
     */
    Selector getSelector() {
        return selector;
    }

    /**
     * @return the source given at construction time
     */
    Source getSource() {
        return source;
    }

    /**
     * @return {@code true} if a result is required
     */
    boolean isNotNull() {
        return notNull;
    }

    /**
     * Accessor for the {@code multi} flag, matching the accessors of the other
     * fields so callers do not have to read the field directly.
     *
     * @return {@code true} if the rule extracts a list of values
     */
    boolean isMulti() {
        return multi;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
View file @
65518f76
...
...
@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
*/
class
FieldExtractor
{
class
FieldExtractor
extends
Extractor
{
private
final
Field
field
;
private
final
Selector
selector
;
private
final
Source
source
;
private
Method
setterMethod
;
private
final
boolean
notNull
;
static
enum
Source
{
Html
,
Url
}
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
)
{
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
,
boolean
multi
)
{
super
(
selector
,
source
,
notNull
,
multi
);
this
.
field
=
field
;
this
.
selector
=
selector
;
this
.
source
=
source
;
this
.
notNull
=
notNull
;
}
Field
getField
()
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
View file @
65518f76
...
...
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
* @author code4crafter@gmail.com <br>
...
...
@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
return
this
;
}
public
Spider
pipeline
(
Pipeline
pipeline
)
{
throw
new
UnsupportedOperationException
(
"Sorry, OOSpider can only use ObjectPipeline"
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
View file @
65518f76
...
...
@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.selector.CssSelector
;
import
us.codecraft.webmagic.selector.RegexSelector
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.XpathSelector
;
import
us.codecraft.webmagic.selector.*
;
import
java.lang.annotation.Annotation
;
import
java.lang.reflect.Field
;
...
...
@@ -42,20 +39,22 @@ class PageModelExtractor {
this
.
clazz
=
clazz
;
initTargetUrlPatterns
();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
if
(
clazz
.
isAssignableFrom
(
AfterExtractor
.
class
)){
if
(
clazz
.
isAssignableFrom
(
AfterExtractor
.
class
))
{
try
{
afterExtractor
=(
AfterExtractor
)
clazz
.
newInstance
();
afterExtractor
=
(
AfterExtractor
)
clazz
.
newInstance
();
}
catch
(
Exception
e
)
{
throw
new
IllegalArgumentException
(
e
);
}
}
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
if
(!
field
.
getType
().
isAssignableFrom
(
String
.
class
)){
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()+
" must be string"
);
}
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
if
(
extractBy
!=
null
)
{
if
(!
extractBy
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
String
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractBy
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
List
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
value
=
extractBy
.
value
();
Selector
selector
;
switch
(
extractBy
.
type
())
{
...
...
@@ -68,10 +67,13 @@ class PageModelExtractor {
case
XPath:
selector
=
new
XpathSelector
(
value
);
break
;
case
XPath2:
selector
=
new
Xpath2Selector
(
value
);
break
;
default
:
selector
=
new
XpathSelector
(
value
);
selector
=
new
Xpath
2
Selector
(
value
);
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
());
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
()
,
extractBy
.
multi
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
...
@@ -80,11 +82,16 @@ class PageModelExtractor {
}
ExtractByUrl
extractByUrl
=
field
.
getAnnotation
(
ExtractByUrl
.
class
);
if
(
extractByUrl
!=
null
)
{
if
(!
extractByUrl
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
String
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractByUrl
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
List
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
regexPattern
=
extractByUrl
.
value
();
if
(
regexPattern
.
trim
().
equals
(
""
))
{
regexPattern
=
".*"
;
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
());
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
()
,
extractByUrl
.
multi
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
...
@@ -138,24 +145,42 @@ class PageModelExtractor {
try
{
o
=
clazz
.
newInstance
();
for
(
FieldExtractor
fieldExtractor
:
fieldExtractors
)
{
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
}
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
()){
page
.
getResultItems
().
setSkip
(
true
);
if
(
fieldExtractor
.
multi
)
{
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
}
if
((
value
==
null
||
value
.
size
()
==
0
)
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
else
{
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
}
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
if
(
afterExtractor
!=
null
)
{
afterExtractor
.
afterProcess
(
page
,
o
);
if
(
afterExtractor
!=
null
)
{
afterExtractor
.
afterProcess
(
page
,
o
);
}
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
...
...
@@ -167,7 +192,7 @@ class PageModelExtractor {
return
o
;
}
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
String
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
Object
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
}
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
import
java.util.List
;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-8-1 <br>
...
...
@@ -11,7 +13,10 @@ public class OschinaBlog {
@ExtractBy
(
"//title"
)
private
String
title
;
@ExtractBy
(
value
=
"div.BlogContent"
,
type
=
ExtractBy
.
Type
.
Css
)
@ExtractBy
(
value
=
"div.BlogContent"
,
type
=
ExtractBy
.
Type
.
Css
)
private
String
content
;
@ExtractBy
(
value
=
"//div[@class='BlogTags']/a/text()"
,
multi
=
true
)
private
List
<
String
>
tags
;
}
webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Site
;
...
...
@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
*/
public
class
TestFetcher
{
@Ignore
(
"takes long"
)
//
@Ignore("takes long")
@Test
public
void
test
()
{
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"http://my.oschina.net/flashsword/blog/145796"
),
OschinaBlog
.
class
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment