Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
59ad4cad
Commit
59ad4cad
authored
Nov 28, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#42 Add jsonpath in annotation mode for json result
parent
c2d6d495
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
51 additions
and
9 deletions
+51
-9
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+20
-5
AppStore.java
...src/main/java/us/codecraft/webmagic/example/AppStore.java
+24
-0
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+2
-2
ExtractBy.java
...ava/us/codecraft/webmagic/model/annotation/ExtractBy.java
+1
-1
ExtractByUrl.java
.../us/codecraft/webmagic/model/annotation/ExtractByUrl.java
+1
-1
ExtractorUtils.java
...main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+3
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
59ad4cad
...
...
@@ -9,7 +9,7 @@ import java.util.ArrayList;
import
java.util.List
;
/**
* Selectable
plain text
.<br>
* Selectable
html
.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
...
...
@@ -23,18 +23,30 @@ public class Html extends PlainText {
*/
private
Document
document
;
private
boolean
init
=
false
;
public
Html
(
List
<
String
>
strings
)
{
super
(
strings
);
}
public
Html
(
String
text
)
{
super
(
text
);
}
/**
* lazy init
*/
private
void
initDocument
()
{
if
(
this
.
document
==
null
&&
!
init
)
{
init
=
true
;
//just init once whether the parsing succeeds or not
try
{
this
.
document
=
Jsoup
.
parse
(
text
);
this
.
document
=
Jsoup
.
parse
(
getText
()
);
}
catch
(
Exception
e
)
{
logger
.
warn
(
"parse document error "
,
e
);
}
}
}
public
Html
(
Document
document
)
{
super
(
document
.
html
());
...
...
@@ -47,6 +59,7 @@ public class Html extends PlainText {
@Override
protected
Selectable
select
(
Selector
selector
,
List
<
String
>
strings
)
{
initDocument
();
List
<
String
>
results
=
new
ArrayList
<
String
>();
for
(
String
string
:
strings
)
{
String
result
=
selector
.
select
(
string
);
...
...
@@ -59,6 +72,7 @@ public class Html extends PlainText {
@Override
protected
Selectable
selectList
(
Selector
selector
,
List
<
String
>
strings
)
{
initDocument
();
List
<
String
>
results
=
new
ArrayList
<
String
>();
for
(
String
string
:
strings
)
{
List
<
String
>
result
=
selector
.
selectList
(
string
);
...
...
@@ -69,6 +83,7 @@ public class Html extends PlainText {
@Override
public
Selectable
smartContent
()
{
initDocument
();
SmartContentSelector
smartContentSelector
=
Selectors
.
smartContent
();
return
select
(
smartContentSelector
,
strings
);
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java
0 → 100644
View file @
59ad4cad
package
us
.
codecraft
.
webmagic
.
example
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.model.OOSpider
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
/**
 * Example page model that reads app metadata from the response of the iTunes
 * lookup endpoint, using JsonPath expressions in annotation mode.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class AppStore {

    /** URL fetched by {@link #main(String[])}. */
    private static final String LOOKUP_URL =
            "http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software";

    /** Filled from the JsonPath expression {@code $..trackName}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
    private String trackName;

    /** Filled from the JsonPath expression {@code $..description}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
    private String description;

    /**
     * Fetches {@link #LOOKUP_URL} through an {@code OOSpider} configured for this
     * model and prints the extracted track name and description to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get(LOOKUP_URL);
        // NOTE(review): get(...) appears to yield null when the page cannot be fetched
        // or nothing is extracted - confirm against Spider#get. Guard so the example
        // reports the problem instead of failing with a NullPointerException.
        if (appStore == null) {
            System.err.println("No AppStore result extracted from " + LOOKUP_URL);
            return;
        }
        System.out.println(appStore.trackName);
        System.out.println(appStore.description);
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
59ad4cad
...
...
@@ -239,7 +239,7 @@ class PageModelExtractor {
}
else
{
if
(
objectExtractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
objectExtractor
.
getSelector
().
selectList
(
page
.
get
Html
().
toString
());
List
<
String
>
list
=
objectExtractor
.
getSelector
().
selectList
(
page
.
get
RawText
());
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
,
false
);
if
(
o
!=
null
)
{
...
...
@@ -248,7 +248,7 @@ class PageModelExtractor {
}
return
os
;
}
else
{
String
select
=
objectExtractor
.
getSelector
().
select
(
page
.
get
Html
().
toString
());
String
select
=
objectExtractor
.
getSelector
().
select
(
page
.
get
RawText
());
Object
o
=
processSingle
(
page
,
select
,
false
);
return
o
;
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
View file @
59ad4cad
...
...
@@ -24,7 +24,7 @@ public @interface ExtractBy {
/**
* types of extractor expressions
*/
public
static
enum
Type
{
XPath
,
Regex
,
Css
}
public
static
enum
Type
{
XPath
,
Regex
,
Css
,
JsonPath
}
/**
* Extractor type, supports XPath, CSS Selector, regex and JsonPath.
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
View file @
59ad4cad
...
...
@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import
java.lang.annotation.Target
;
/**
* Define a extractor
for url
. Only regex can be used. <br>
* Define an extractor
to extract data from the url of the current page
. Only regex can be used. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.0
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
View file @
59ad4cad
...
...
@@ -27,6 +27,9 @@ public class ExtractorUtils {
case
XPath:
selector
=
getXpathSelector
(
value
);
break
;
case
JsonPath:
selector
=
new
JsonPathSelector
(
value
);
break
;
default
:
selector
=
getXpathSelector
(
value
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment