Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
db67db81
Commit
db67db81
authored
Apr 08, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#523 remove fixAllRelativeHrefs by default, get absolute urls for links()
parent
abd020b4
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
96 additions
and
7 deletions
+96
-7
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+3
-6
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+10
-0
HtmlNode.java
...rc/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+1
-1
LinksSelector.java
...in/java/us/codecraft/webmagic/selector/LinksSelector.java
+51
-0
HtmlTest.java
...ic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+10
-0
LinksSelectorTest.java
...ava/us/codecraft/webmagic/selector/LinksSelectorTest.java
+21
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
db67db81
package
us
.
codecraft
.
webmagic
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Json
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
/**
...
...
@@ -76,7 +73,7 @@ public class Page {
*/
public Html getHtml() {
    if (html == null) {
        // Build the Html wrapper lazily, on first access only. The raw text is passed
        // together with the request URL; the earlier assignment that first ran the text
        // through UrlUtils.fixAllRelativeHrefs was overwritten immediately and has been
        // dropped (#523: fixAllRelativeHrefs is no longer applied by default).
        html = new Html(rawText, request.getUrl());
    }
    return html;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
db67db81
...
...
@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
*/
private
Document
document
;
/**
 * Creates an Html by parsing {@code text} with jsoup, passing {@code url} as the second
 * argument of {@code Jsoup.parse} (jsoup's base URI).
 * <p>
 * Any exception raised while preparing or parsing is logged as a warning and leaves
 * {@code document} set to {@code null}; it is not propagated to the caller.
 *
 * @param text the markup to parse
 * @param url  the URL handed to jsoup alongside the markup
 */
public Html(String text, String url) {
    Document parsed = null;
    try {
        disableJsoupHtmlEntityEscape();
        parsed = Jsoup.parse(text, url);
    } catch (Exception e) {
        // Deliberately best-effort: a failed parse yields a null document.
        logger.warn("parse document error ", e);
    }
    this.document = parsed;
}
public
Html
(
String
text
)
{
try
{
disableJsoupHtmlEntityEscape
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
View file @
db67db81
...
...
@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
/**
 * Selects the links of this node by applying a {@link LinksSelector}.
 * <p>
 * This replaces the former {@code xpath("//a/@href")} lookup; keeping both
 * {@code return} statements would leave the second one unreachable.
 *
 * @return the selectable produced by {@code selectElements} for the links selector
 */
@Override
public Selectable links() {
    return selectElements(new LinksSelector());
}
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
0 → 100644
View file @
db67db81
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.helper.StringUtil
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
 * Links selector based on jsoup. Collects the {@code href} of every anchor found below an
 * element, using the absolute form ({@code abs:href}) whenever the anchor has a base URI. <br>
 * Only {@link #selectList(Element)} is implemented; the other element-based selections throw
 * {@link UnsupportedOperationException}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.7.0
 */
public class LinksSelector extends BaseElementSelector {

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String select(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the link targets of all anchors under {@code element}.
     *
     * @param element the element whose descendant anchors are inspected
     * @return one entry per anchor that has an {@code href} attribute: the {@code abs:href}
     *         value when the anchor's base URI is not blank, the raw {@code href} otherwise
     */
    @Override
    public List<String> selectList(Element element) {
        // Restrict to anchors that actually carry an href; a bare <a> (e.g. a named anchor)
        // would otherwise contribute an empty string to the result.
        Elements anchors = element.select("a[href]");
        List<String> links = new ArrayList<String>(anchors.size());
        for (Element anchor : anchors) {
            // Without a base URI there is nothing to resolve against, so keep the raw value.
            String attributeKey = StringUtil.isBlank(anchor.baseUri()) ? "href" : "abs:href";
            links.add(anchor.attr(attributeKey));
        }
        return links;
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Element selectElement(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<Element> selectElements(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return always {@code true}
     */
    // NOTE(review): presumably signals that this selector yields attribute values rather
    // than elements - confirm against BaseElementSelector.
    @Override
    public boolean hasAttribute() {
        return true;
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
View file @
db67db81
...
...
@@ -48,4 +48,14 @@ public class HtmlTest {
Selectable
selectable
=
html
.
xpath
(
"//a[1]"
).
nodes
().
get
(
0
);
assertThat
(
selectable
.
xpath
(
"/a/@href"
).
get
()).
isEqualTo
(
"/xx/xx"
);
}
/**
 * Checks that the {@code abs:} attribute prefix yields absolute URLs for both an anchor
 * and an image, once with the base URL passed to the constructor and once with it
 * declared through a {@code <base>} element in the markup.
 */
@Test
public void testGetHrefsByJsoup() {
    final String expectedLink = "https://github.com/code4craft/webmagic/issues";
    final String expectedImage = "https://github.com/code4craft/webmagic/webmagic.jpg";

    // Case 1: base URL supplied as the second constructor argument.
    Html withBaseArgument = new Html(
            "<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>",
            "https://github.com/code4craft/webmagic/");
    assertThat(withBaseArgument.xpath("//a[1]/@abs:href").get()).isEqualTo(expectedLink);
    assertThat(withBaseArgument.xpath("//img/@abs:src").get()).isEqualTo(expectedImage);

    // Case 2: base URL declared inside the document itself.
    Html withBaseElement = new Html(
            "<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
    assertThat(withBaseElement.xpath("//a[1]/@abs:href").get()).isEqualTo(expectedLink);
    assertThat(withBaseElement.xpath("//img/@abs:src").get()).isEqualTo(expectedImage);
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
0 → 100644
View file @
db67db81
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.junit.Test
;
import
java.util.List
;
/**
* @author code4crafter@gmail.com
* Date: 17/4/8
* Time: 下午9:41
*/
public
class
LinksSelectorTest
{
private
String
html
=
"<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>"
;
@Test
public
void
testLinks
()
throws
Exception
{
List
<
String
>
links
=
new
LinksSelector
().
selectList
(
html
);
System
.
out
.
println
(
links
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment