Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
50edd22e
Commit
50edd22e
authored
Aug 01, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add annotation
parent
7020b864
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
282 additions
and
1 deletion
+282
-1
Fetcher.java
...c/main/java/us/codecraft/webmagic/annotation/Fetcher.java
+21
-0
FieldFetcher.java
...n/java/us/codecraft/webmagic/annotation/FieldFetcher.java
+30
-0
ObjectPageProcessor.java
...us/codecraft/webmagic/annotation/ObjectPageProcessor.java
+65
-0
PageModelFetcher.java
...va/us/codecraft/webmagic/annotation/PageModelFetcher.java
+104
-0
TargetUrl.java
...main/java/us/codecraft/webmagic/annotation/TargetUrl.java
+17
-0
Selector.java
...rc/main/java/us/codecraft/webmagic/selector/Selector.java
+1
-1
Blog.java
.../src/test/java/us/codecraft/webmagic/annotation/Blog.java
+24
-0
TestFetcher.java
...st/java/us/codecraft/webmagic/annotation/TestFetcher.java
+20
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Field-level annotation, retained at runtime, that carries an expression
 * string together with the kind of expression it is.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface Fetcher {

    /**
     * Kinds of expression that {@link #value()} may hold.
     */
    enum Type {
        XPath,
        Regex,
        Css
    }

    /**
     * The expression itself. It has no default, so it must always be supplied.
     */
    String value();

    /**
     * How {@link #value()} is to be interpreted; {@link Type#XPath} when omitted.
     */
    Type type() default Type.XPath;
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
us.codecraft.webmagic.selector.Selector
;
import
java.lang.reflect.Field
;
/**
 * Immutable pair of a reflected {@link Field} and the {@link Selector}
 * that was supplied alongside it at construction time.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:48 PM <br>
 */
class FieldFetcher {

    private final Selector selector;

    private final Field field;

    /**
     * @param field    the field this instance refers to
     * @param selector the selector associated with {@code field}
     */
    FieldFetcher(Field field, Selector selector) {
        this.selector = selector;
        this.field = field;
    }

    /**
     * @return the selector given to the constructor
     */
    Selector getSelector() {
        return this.selector;
    }

    /**
     * @return the field given to the constructor
     */
    Field getField() {
        return this.field;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.ArrayList
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Set
;
import
java.util.regex.Pattern
;
/**
 * {@link PageProcessor} that runs one {@link PageModelFetcher} per model class
 * against each page and queues the page's links that match any fetcher's
 * target URL pattern.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:46 PM <br>
 */
public class ObjectPageProcessor implements PageProcessor {

    private final List<PageModelFetcher> pageModelFetcherList;

    private final Site site;

    /** Union of the target URL patterns of every fetcher in {@link #pageModelFetcherList}. */
    private final Set<Pattern> targetUrlPatterns;

    /**
     * Builds a processor with one {@link PageModelFetcher} per given class.
     *
     * @param site   the site returned by {@link #getSite()}
     * @param clazzs the model classes to build fetchers for
     * @return a new processor
     */
    public static ObjectPageProcessor create(Site site, Class<?>... clazzs) {
        List<PageModelFetcher> pageModelFetcherList = new ArrayList<PageModelFetcher>();
        for (Class<?> clazz : clazzs) {
            pageModelFetcherList.add(PageModelFetcher.create(clazz));
        }
        return new ObjectPageProcessor(site, pageModelFetcherList);
    }

    private ObjectPageProcessor(Site site, List<PageModelFetcher> pageModelFetcherList) {
        this.site = site;
        this.pageModelFetcherList = pageModelFetcherList;
        this.targetUrlPatterns = new HashSet<Pattern>();
        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
            this.targetUrlPatterns.addAll(pageModelFetcher.getTargetUrlPatterns());
        }
    }

    /**
     * Stores each fetcher's result on the page under the model class's
     * canonical name, then queues every link on the page that matches at
     * least one target URL pattern.
     *
     * @param page the page to process
     */
    @Override
    public void process(Page page) {
        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
            // The fetcher returns null when the page URL is not one of its targets;
            // that null is still stored, as before.
            Object model = pageModelFetcher.process(page);
            page.putField(pageModelFetcher.getClazz().getCanonicalName(), model);
        }
        for (String link : page.getHtml().links().all()) {
            if (matchesAnyTargetUrl(link)) {
                page.addTargetRequest(new Request(link));
            }
        }
    }

    /**
     * Tells whether the whole link matches at least one target URL pattern.
     * Stops at the first match so that a link matched by several patterns is
     * queued only once.
     */
    private boolean matchesAnyTargetUrl(String link) {
        for (Pattern targetUrlPattern : targetUrlPatterns) {
            if (targetUrlPattern.matcher(link).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the site passed to {@link #create(Site, Class[])}
     */
    @Override
    public Site getSite() {
        return site;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.selector.CssSelector
;
import
us.codecraft.webmagic.selector.RegexSelector
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.XpathSelector
;
import
java.lang.annotation.Annotation
;
import
java.lang.reflect.Field
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.regex.Pattern
;
/**
 * Extracts an instance of one model class from a page. The class's
 * {@link TargetUrl} annotation decides which page URLs are handled, and its
 * {@link Fetcher}-annotated fields decide which selector fills each field.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 9:33 PM <br>
 */
class PageModelFetcher {

    private List<Pattern> targetUrlPatterns;

    private Class<?> clazz;

    private List<FieldFetcher> fieldFetchers;

    /**
     * Creates a fetcher for the given model class.
     *
     * @param clazz the model class to inspect
     * @return an initialised fetcher
     */
    public static PageModelFetcher create(Class<?> clazz) {
        PageModelFetcher pageModelFetcher = new PageModelFetcher();
        pageModelFetcher.init(clazz);
        return pageModelFetcher;
    }

    /**
     * Reads the target URL patterns and builds one {@link FieldFetcher} per
     * declared field that carries a {@link Fetcher} annotation.
     */
    private void init(Class<?> clazz) {
        this.clazz = clazz;
        initTargetUrlPatterns();
        fieldFetchers = new ArrayList<FieldFetcher>();
        for (Field field : clazz.getDeclaredFields()) {
            Fetcher fetcher = field.getAnnotation(Fetcher.class);
            if (fetcher == null) {
                // getAnnotation returns null for fields without @Fetcher (constants,
                // helper state, synthetic fields); they are not extraction targets.
                continue;
            }
            field.setAccessible(true);
            fieldFetchers.add(new FieldFetcher(field, createSelector(fetcher)));
        }
    }

    /**
     * Maps the annotation's type to the matching selector implementation.
     */
    private static Selector createSelector(Fetcher fetcher) {
        String value = fetcher.value();
        switch (fetcher.type()) {
            case Css:
                return new CssSelector(value);
            case Regex:
                return new RegexSelector(value);
            case XPath:
            default:
                return new XpathSelector(value);
        }
    }

    /**
     * Compiles the class's {@link TargetUrl} values into patterns. Without the
     * annotation, a single match-everything pattern is used.
     */
    private void initTargetUrlPatterns() {
        targetUrlPatterns = new ArrayList<Pattern>();
        TargetUrl targetUrl = clazz.getAnnotation(TargetUrl.class);
        if (targetUrl == null) {
            targetUrlPatterns.add(Pattern.compile(".*"));
            return;
        }
        for (String s : targetUrl.value()) {
            // Dots are taken literally and '*' becomes a wildcard that stops at
            // quotes and '#'; every other character keeps its regex meaning.
            targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
        }
    }

    /**
     * Builds a model instance from the page.
     *
     * @param page the page to extract from
     * @return {@code null} when the page URL matches no target pattern or the
     *         class cannot be instantiated; otherwise the new instance. If a
     *         field assignment fails with {@link IllegalAccessException}, the
     *         instance is returned with the remaining fields unset.
     */
    public Object process(Page page) {
        if (!matchesTargetUrl(page)) {
            return null;
        }
        Object o = null;
        try {
            o = clazz.newInstance();
            for (FieldFetcher fieldFetcher : fieldFetchers) {
                // Selector.select yields a String, so annotated fields must accept one.
                fieldFetcher.getField().set(o, fieldFetcher.getSelector().select(page.getHtml().toString()));
            }
        } catch (InstantiationException e) {
            // NOTE(review): no logger is visible in this file; the original reporting is kept.
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        return o;
    }

    /**
     * Tells whether the page URL fully matches at least one target pattern,
     * stopping at the first match.
     */
    private boolean matchesTargetUrl(Page page) {
        for (Pattern targetPattern : targetUrlPatterns) {
            if (targetPattern.matcher(page.getUrl().toString()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the model class this fetcher was created for
     */
    Class<?> getClazz() {
        return clazz;
    }

    /**
     * @return the compiled target URL patterns (the internal list, not a copy)
     */
    List<Pattern> getTargetUrlPatterns() {
        return targetUrlPatterns;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Target
;
/**
 * Type-level annotation, retained at runtime, that carries one or more
 * URL pattern strings.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface TargetUrl {

    /**
     * The URL pattern strings. There is no default, so at least the
     * attribute itself must be supplied.
     */
    String[] value();
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
View file @
50edd22e
...
...
@@ -8,7 +8,7 @@ import java.util.List;
* Date: 13-4-20
* Time: 下午8:02
*/
interface
Selector
{
public
interface
Selector
{
public
String
select
(
String
text
);
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
/**
 * Sample model used by the annotation tests: a blog page with a title
 * selected by XPath and a content block selected by CSS.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 10:18 PM <br>
 */
@TargetUrl("http://djjchobits.iteye.com/blog/\\d+")
public class Blog {

    @Fetcher(value = "//title")
    private String title;

    @Fetcher(type = Fetcher.Type.Css, value = "div#main")
    private String content;

    /**
     * @return both fields rendered as {@code Blog{title='...', content='...'}}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Blog{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
0 → 100644
View file @
50edd22e
package
us
.
codecraft
.
webmagic
.
annotation
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
/**
 * Smoke test for the annotation-driven page processor.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:42 PM <br>
 */
public class TestFetcher {

    /**
     * Runs a spider over a single start URL with {@link Blog} as the model
     * class. It makes no assertions, so it only fails if the run throws.
     */
    @Test
    public void test() {
        Site site = Site.me().addStartUrl("http://djjchobits.iteye.com/blog/569000");
        ObjectPageProcessor processor = ObjectPageProcessor.create(site, Blog.class);
        Spider.create(processor).run();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment