Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
03c25123
Commit
03c25123
authored
Apr 13, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add Json parse support
parent
843e928c
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
150 additions
and
13 deletions
+150
-13
pom.xml
webmagic-core/pom.xml
+11
-0
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+17
-1
Json.java
...re/src/main/java/us/codecraft/webmagic/selector/Json.java
+64
-0
JsonPathSelector.java
...java/us/codecraft/webmagic/selector/JsonPathSelector.java
+1
-1
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+5
-0
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+9
-0
HtmlTest.java
...ic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+2
-2
JsonPathSelectorTest.java
.../us/codecraft/webmagic/selector/JsonPathSelectorTest.java
+0
-0
JsonTest.java
...rc/test/java/us/codecraft/webmagic/selector/JsonTest.java
+20
-0
pom.xml
webmagic-extension/pom.xml
+0
-9
AngularJSProcessor.java
...ava/us/codecraft/webmagic/samples/AngularJSProcessor.java
+21
-0
No files found.
webmagic-core/pom.xml
View file @
03c25123
...
...
@@ -65,6 +65,17 @@
<artifactId>
commons-io
</artifactId>
</dependency>
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
<artifactId>
json-path
</artifactId>
<version>
0.8.1
</version>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
fastjson
</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
03c25123
...
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
...
@@ -31,6 +32,8 @@ public class Page {
private
Html
html
;
private
Json
json
;
private
String
rawText
;
private
Selectable
url
;
...
...
@@ -72,6 +75,19 @@ public class Page {
return
html
;
}
/**
 * Returns the page content wrapped as a {@link Json}.
 * <p>
 * The wrapper is built from the raw text on the first call and the same
 * instance is handed back on every later call.
 *
 * @return json view of the raw page text
 * @since 0.5.0
 */
public Json getJson() {
    // Fast path: the wrapper was already created by an earlier call.
    if (json != null) {
        return json;
    }
    json = new Json(rawText);
    return json;
}
/**
* @param html
* @deprecated since 0.4.0
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
0 → 100644
View file @
03c25123
package
us
.
codecraft
.
webmagic
.
selector
;
import
com.alibaba.fastjson.JSON
;
import
org.jsoup.parser.TokenQueue
;
import
java.util.List
;
/**
 * Selectable over JSON text, adding JSONP unwrapping, object mapping and
 * JSON Path selection on top of {@link PlainText}.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public class Json extends PlainText {

    /**
     * Wraps a list of JSON strings.
     *
     * @param strings source strings, passed unchanged to {@link PlainText}
     */
    public Json(List<String> strings) {
        super(strings);
    }

    /**
     * Wraps a single JSON string.
     *
     * @param text source text, passed unchanged to {@link PlainText}
     */
    public Json(String text) {
        super(text);
    }

    /**
     * Removes the JSONP padding, i.e. turns {@code padding( ... )} into the
     * content found between the balanced parentheses.
     * <p>
     * Leading whitespace before the padding name and between the name and the
     * opening parenthesis is skipped.
     *
     * @param padding the callback name that wraps the JSON payload
     * @return a new {@link Json} holding the unwrapped payload, or this instance
     *         when there is no text to unwrap
     */
    public Json removePadding(String padding) {
        String text = getText();
        if (text == null) {
            // Same null guard as toObject/toList: there is nothing to unwrap,
            // and TokenQueue must not be handed a null input.
            return this;
        }
        TokenQueue tokenQueue = new TokenQueue(text);
        tokenQueue.consumeWhitespace();
        // NOTE(review): what happens when the text does not start with the given
        // padding is decided by TokenQueue.consume — confirm against jsoup.
        tokenQueue.consume(padding);
        tokenQueue.consumeWhitespace();
        String chompBalanced = tokenQueue.chompBalanced('(', ')');
        return new Json(chompBalanced);
    }

    /**
     * Maps the JSON text to an object of the given class.
     *
     * @param clazz target type
     * @param <T>   target type
     * @return the mapped object, or {@code null} when there is no text
     */
    public <T> T toObject(Class<T> clazz) {
        String text = getText();
        if (text == null) {
            return null;
        }
        return JSON.parseObject(text, clazz);
    }

    /**
     * Maps the JSON text to a list of objects of the given class.
     *
     * @param clazz element type
     * @param <T>   element type
     * @return the mapped list, or {@code null} when there is no text
     */
    public <T> List<T> toList(Class<T> clazz) {
        String text = getText();
        if (text == null) {
            return null;
        }
        return JSON.parseArray(text, clazz);
    }

    /**
     * Returns the text this instance operates on.
     *
     * @return the first source string, or {@code null} when there are none
     */
    public String getText() {
        if (strings != null && strings.size() > 0) {
            return strings.get(0);
        }
        return null;
    }

    /**
     * Extracts content with a JSON Path expression.
     *
     * @param jsonPath JSON Path expression
     * @return the result of applying the expression to the source strings
     */
    @Override
    public Selectable jsonPath(String jsonPath) {
        JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
        return selectList(jsonPathSelector, strings);
    }
}
webmagic-
extension
/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
→
webmagic-
core
/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
View file @
03c25123
...
...
@@ -22,7 +22,7 @@ public class JsonPathSelector implements Selector {
/**
 * Creates a selector for the given JSON Path expression.
 *
 * @param jsonPathStr JSON Path expression to compile
 */
public JsonPathSelector(String jsonPathStr) {
    this.jsonPathStr = jsonPathStr;
    // Compile exactly once; the earlier duplicate compile produced a value
    // that was overwritten immediately by this assignment.
    this.jsonPath = JsonPath.compile(this.jsonPathStr);
}
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
03c25123
...
...
@@ -108,6 +108,11 @@ public class PlainText implements Selectable {
return
strings
;
}
/**
 * JSON Path extraction is not available on plain text.
 *
 * @param jsonPath JSON Path expression (only used in the error message)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable jsonPath(String jsonPath) {
    // Carry the offending expression so the failure is traceable from the stack trace alone.
    throw new UnsupportedOperationException("jsonPath is not supported on plain text: " + jsonPath);
}
@Override
public
String
get
()
{
if
(
CollectionUtils
.
isNotEmpty
(
all
()))
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
03c25123
...
...
@@ -119,4 +119,13 @@ public interface Selectable {
* @return multi string result
*/
public
List
<
String
>
all
();
/**
 * Extracts content from this selectable using a JSON Path expression.
 * <p>
 * Not every implementation supports this: {@code PlainText} throws
 * {@link UnsupportedOperationException}.
 *
 * @param jsonPath the JSON Path expression to evaluate
 * @return a {@link Selectable} holding the extraction result
 */
public
Selectable
jsonPath
(
String
jsonPath
);
}
webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
View file @
03c25123
package
us
.
codecraft
.
webmagic
;
import
org.junit.Assert
;
import
org.junit.Test
;
import
us.codecraft.webmagic.selector.Html
;
...
...
@@ -14,7 +13,8 @@ public class HtmlTest {
@Test
public void testRegexSelector() {
    Html html = new Html("aaaaaaab");
    // Run the regex/replace pipeline once and reuse the result for both the
    // assertion and the console output.
    String replaced = html.regex("(.*)").replace("aa(a)", "$1bb").toString();
    Assert.assertEquals("abbabbab", replaced);
    System.out.println(replaced);
}
...
...
webmagic-
extension
/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
→
webmagic-
core
/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
View file @
03c25123
File moved
webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java
0 → 100644
View file @
03c25123
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.junit.Test
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
/**
 * Tests for {@link Json}.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public class JsonTest {

    private String text = "callback({\"name\":\"json\"})";

    /**
     * Strips the JSONP callback and reads a field from the remaining JSON.
     */
    @Test
    public void testRemovePadding() throws Exception {
        Json padded = new Json(text);
        Json unwrapped = padded.removePadding("callback");
        String name = unwrapped.jsonPath("$.name").get();
        assertThat(name).isEqualTo("json");
    }
}
webmagic-extension/pom.xml
View file @
03c25123
...
...
@@ -10,10 +10,6 @@
<artifactId>
webmagic-extension
</artifactId>
<dependencies>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
fastjson
</artifactId>
</dependency>
<dependency>
<groupId>
redis.clients
</groupId>
<artifactId>
jedis
</artifactId>
...
...
@@ -28,11 +24,6 @@
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
</dependency>
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
<artifactId>
json-path
</artifactId>
<version>
0.8.1
</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
0 → 100644
View file @
03c25123
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Sample {@link PageProcessor} skeleton: both methods are unimplemented
 * placeholders.
 *
 * @author yihua.huang@dianping.com
 */
public
class
AngularJSProcessor
implements
PageProcessor
{
@Override
public
void
process
(
Page
page
)
{
// No extraction logic yet: the page is left untouched.
}
@Override
public
Site
getSite
()
{
// NOTE(review): returns null instead of a Site configuration — confirm callers tolerate this.
return
null
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment