Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
65518f76
Commit
65518f76
authored
Aug 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add list support
parent
d4de60a5
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
112 additions
and
55 deletions
+112
-55
ExtractBy.java
...ore/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
+6
-5
ExtractByUrl.java
.../src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
+2
-0
Extractor.java
...ore/src/main/java/us/codecraft/webmagic/oo/Extractor.java
+40
-0
FieldExtractor.java
...rc/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
+3
-13
OOSpider.java
...core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
+0
-5
PageModelExtractor.java
...ain/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+54
-29
OschinaBlog.java
...e/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
+6
-1
TestFetcher.java
...e/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
+1
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
View file @
65518f76
...
@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
...
@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
* Time: 下午8:40 <br>
* Time: 下午8:40 <br>
*/
*/
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Retention
(
java
.
lang
.
annotation
.
RetentionPolicy
.
RUNTIME
)
@Target
({
ElementType
.
FIELD
})
@Target
({
ElementType
.
FIELD
,
ElementType
.
TYPE
})
public
@interface
ExtractBy
{
public
@interface
ExtractBy
{
//TODO: add list support
String
value
();
String
value
();
public
enum
Type
{
XPath
,
Regex
,
Css
};
public
enum
Type
{
XPath
2
,
XPath
,
Regex
,
Css
}
Type
type
()
default
Type
.
XPath
;
Type
type
()
default
Type
.
XPath
2
;
boolean
notNull
()
default
true
;
boolean
notNull
()
default
true
;
boolean
multi
()
default
false
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
View file @
65518f76
...
@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
...
@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
boolean
notNull
()
default
true
;
boolean
notNull
()
default
true
;
boolean
multi
()
default
false
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java
0 → 100644
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
import
us.codecraft.webmagic.selector.Selector
;
/**
 * Immutable description of one extraction rule: the selector to run, the
 * text it is run against, and how the result should be treated.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class Extractor {

    /**
     * The piece of the page a selector is applied to.
     * (Nested enums are implicitly static, so no modifier is needed.)
     */
    enum Source {
        /** Apply the selector to the page's HTML. */
        Html,
        /** Apply the selector to the page's URL. */
        Url
    }

    /** Selector that performs the extraction. */
    protected final Selector selector;

    /** Which part of the page the selector is applied to. */
    protected final Source source;

    /** Whether a missing result is considered a failure for this rule. */
    protected final boolean notNull;

    /** Whether the rule yields a list of values instead of a single value. */
    protected final boolean multi;

    /**
     * Creates an extraction rule.
     *
     * @param selector selector that performs the extraction
     * @param source   part of the page the selector is applied to
     * @param notNull  {@code true} if a result is required
     * @param multi    {@code true} if the rule yields a list of values
     */
    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
        this.selector = selector;
        this.source = source;
        this.notNull = notNull;
        this.multi = multi;
    }

    /** @return the selector that performs the extraction */
    Selector getSelector() {
        return selector;
    }

    /** @return the part of the page the selector is applied to */
    Source getSource() {
        return source;
    }

    /** @return {@code true} if a result is required */
    boolean isNotNull() {
        return notNull;
    }

    /**
     * Accessor for {@link #multi}, matching the getters of the other fields
     * so callers do not have to read the field directly.
     *
     * @return {@code true} if the rule yields a list of values
     */
    boolean isMulti() {
        return multi;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
View file @
65518f76
...
@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
...
@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
* @date: 13-8-1 <br>
* @date: 13-8-1 <br>
* Time: 下午9:48 <br>
* Time: 下午9:48 <br>
*/
*/
class
FieldExtractor
{
class
FieldExtractor
extends
Extractor
{
private
final
Field
field
;
private
final
Field
field
;
private
final
Selector
selector
;
private
final
Source
source
;
private
Method
setterMethod
;
private
Method
setterMethod
;
private
final
boolean
notNull
;
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
,
boolean
multi
)
{
super
(
selector
,
source
,
notNull
,
multi
);
static
enum
Source
{
Html
,
Url
}
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
)
{
this
.
field
=
field
;
this
.
field
=
field
;
this
.
selector
=
selector
;
this
.
source
=
source
;
this
.
notNull
=
notNull
;
}
}
Field
getField
()
{
Field
getField
()
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
View file @
65518f76
...
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
...
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
...
@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
...
@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
return
this
;
return
this
;
}
}
public
Spider
pipeline
(
Pipeline
pipeline
)
{
throw
new
UnsupportedOperationException
(
"Sorry, OOSpider can only use ObjectPipeline"
);
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
View file @
65518f76
...
@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
...
@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.selector.CssSelector
;
import
us.codecraft.webmagic.selector.*
;
import
us.codecraft.webmagic.selector.RegexSelector
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.XpathSelector
;
import
java.lang.annotation.Annotation
;
import
java.lang.annotation.Annotation
;
import
java.lang.reflect.Field
;
import
java.lang.reflect.Field
;
...
@@ -42,20 +39,22 @@ class PageModelExtractor {
...
@@ -42,20 +39,22 @@ class PageModelExtractor {
this
.
clazz
=
clazz
;
this
.
clazz
=
clazz
;
initTargetUrlPatterns
();
initTargetUrlPatterns
();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
fieldExtractors
=
new
ArrayList
<
FieldExtractor
>();
if
(
clazz
.
isAssignableFrom
(
AfterExtractor
.
class
)){
if
(
clazz
.
isAssignableFrom
(
AfterExtractor
.
class
))
{
try
{
try
{
afterExtractor
=(
AfterExtractor
)
clazz
.
newInstance
();
afterExtractor
=
(
AfterExtractor
)
clazz
.
newInstance
();
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
throw
new
IllegalArgumentException
(
e
);
throw
new
IllegalArgumentException
(
e
);
}
}
}
}
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
for
(
Field
field
:
clazz
.
getDeclaredFields
())
{
field
.
setAccessible
(
true
);
field
.
setAccessible
(
true
);
if
(!
field
.
getType
().
isAssignableFrom
(
String
.
class
)){
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()+
" must be string"
);
}
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
ExtractBy
extractBy
=
field
.
getAnnotation
(
ExtractBy
.
class
);
if
(
extractBy
!=
null
)
{
if
(
extractBy
!=
null
)
{
if
(!
extractBy
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
String
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractBy
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
List
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
value
=
extractBy
.
value
();
String
value
=
extractBy
.
value
();
Selector
selector
;
Selector
selector
;
switch
(
extractBy
.
type
())
{
switch
(
extractBy
.
type
())
{
...
@@ -68,10 +67,13 @@ class PageModelExtractor {
...
@@ -68,10 +67,13 @@ class PageModelExtractor {
case
XPath:
case
XPath:
selector
=
new
XpathSelector
(
value
);
selector
=
new
XpathSelector
(
value
);
break
;
break
;
case
XPath2:
selector
=
new
Xpath2Selector
(
value
);
break
;
default
:
default
:
selector
=
new
XpathSelector
(
value
);
selector
=
new
Xpath
2
Selector
(
value
);
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
());
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
selector
,
FieldExtractor
.
Source
.
Html
,
extractBy
.
notNull
()
,
extractBy
.
multi
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
@@ -80,11 +82,16 @@ class PageModelExtractor {
...
@@ -80,11 +82,16 @@ class PageModelExtractor {
}
}
ExtractByUrl
extractByUrl
=
field
.
getAnnotation
(
ExtractByUrl
.
class
);
ExtractByUrl
extractByUrl
=
field
.
getAnnotation
(
ExtractByUrl
.
class
);
if
(
extractByUrl
!=
null
)
{
if
(
extractByUrl
!=
null
)
{
if
(!
extractByUrl
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
String
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be string"
);
}
else
if
(
extractByUrl
.
multi
()
&&
!
field
.
getType
().
isAssignableFrom
(
List
.
class
))
{
throw
new
IllegalStateException
(
"Field "
+
field
.
getName
()
+
" must be list"
);
}
String
regexPattern
=
extractByUrl
.
value
();
String
regexPattern
=
extractByUrl
.
value
();
if
(
regexPattern
.
trim
().
equals
(
""
))
{
if
(
regexPattern
.
trim
().
equals
(
""
))
{
regexPattern
=
".*"
;
regexPattern
=
".*"
;
}
}
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
());
FieldExtractor
fieldExtractor
=
new
FieldExtractor
(
field
,
new
RegexSelector
(
regexPattern
),
FieldExtractor
.
Source
.
Url
,
extractByUrl
.
notNull
()
,
extractByUrl
.
multi
()
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
Method
setterMethod
=
getSetterMethod
(
clazz
,
field
);
if
(
setterMethod
!=
null
)
{
if
(
setterMethod
!=
null
)
{
fieldExtractor
.
setSetterMethod
(
setterMethod
);
fieldExtractor
.
setSetterMethod
(
setterMethod
);
...
@@ -138,24 +145,42 @@ class PageModelExtractor {
...
@@ -138,24 +145,42 @@ class PageModelExtractor {
try
{
try
{
o
=
clazz
.
newInstance
();
o
=
clazz
.
newInstance
();
for
(
FieldExtractor
fieldExtractor
:
fieldExtractors
)
{
for
(
FieldExtractor
fieldExtractor
:
fieldExtractors
)
{
String
value
;
if
(
fieldExtractor
.
multi
)
{
switch
(
fieldExtractor
.
getSource
())
{
List
<
String
>
value
;
case
Html:
switch
(
fieldExtractor
.
getSource
())
{
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
case
Html:
break
;
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
case
Url:
break
;
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
case
Url:
break
;
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
default
:
break
;
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
default
:
}
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
()){
}
page
.
getResultItems
().
setSkip
(
true
);
if
((
value
==
null
||
value
.
size
()
==
0
)
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
else
{
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
break
;
default
:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
}
if
(
value
==
null
&&
fieldExtractor
.
isNotNull
())
{
page
.
getResultItems
().
setSkip
(
true
);
}
setField
(
o
,
fieldExtractor
,
value
);
}
}
setField
(
o
,
fieldExtractor
,
value
);
}
}
if
(
afterExtractor
!=
null
)
{
if
(
afterExtractor
!=
null
)
{
afterExtractor
.
afterProcess
(
page
,
o
);
afterExtractor
.
afterProcess
(
page
,
o
);
}
}
}
catch
(
InstantiationException
e
)
{
}
catch
(
InstantiationException
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
@@ -167,7 +192,7 @@ class PageModelExtractor {
...
@@ -167,7 +192,7 @@ class PageModelExtractor {
return
o
;
return
o
;
}
}
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
String
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
private
void
setField
(
Object
o
,
FieldExtractor
fieldExtractor
,
Object
value
)
throws
IllegalAccessException
,
InvocationTargetException
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
if
(
fieldExtractor
.
getSetterMethod
()
!=
null
)
{
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
fieldExtractor
.
getSetterMethod
().
invoke
(
o
,
value
);
}
}
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
oo
;
import
java.util.List
;
/**
/**
* @author yihua.huang@dianping.com <br>
* @author yihua.huang@dianping.com <br>
* @date: 13-8-1 <br>
* @date: 13-8-1 <br>
...
@@ -11,7 +13,10 @@ public class OschinaBlog {
...
@@ -11,7 +13,10 @@ public class OschinaBlog {
@ExtractBy
(
"//title"
)
@ExtractBy
(
"//title"
)
private
String
title
;
private
String
title
;
@ExtractBy
(
value
=
"div.BlogContent"
,
type
=
ExtractBy
.
Type
.
Css
)
@ExtractBy
(
value
=
"div.BlogContent"
,
type
=
ExtractBy
.
Type
.
Css
)
private
String
content
;
private
String
content
;
@ExtractBy
(
value
=
"//div[@class='BlogTags']/a/text()"
,
multi
=
true
)
private
List
<
String
>
tags
;
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
View file @
65518f76
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
oo
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
...
@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
...
@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
*/
*/
public
class
TestFetcher
{
public
class
TestFetcher
{
@Ignore
(
"takes long"
)
//
@Ignore("takes long")
@Test
@Test
public
void
test
()
{
public
void
test
()
{
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"http://my.oschina.net/flashsword/blog/145796"
),
OschinaBlog
.
class
)
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"http://my.oschina.net/flashsword/blog/145796"
),
OschinaBlog
.
class
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment