Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
7c9e9ce8
Commit
7c9e9ce8
authored
Aug 02, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
xpath2.0
parent
7f27c28d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
204 additions
and
4 deletions
+204
-4
Xpath2Selector.java
...n/java/us/codecraft/webmagic/selector/Xpath2Selector.java
+167
-0
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+37
-4
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
0 → 100644
View file @
7c9e9ce8
package
us
.
codecraft
.
webmagic
.
selector
;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.apache.log4j.Logger;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Selector that supports XPath 2.0. Wraps HtmlCleaner (HTML text to DOM) and Saxon HE
 * (XPath evaluation).<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 9:39 AM
 */
public class Xpath2Selector implements Selector {

    /** The raw expression, kept for log messages. */
    private final String xpathStr;

    /** The expression compiled once in the constructor and reused by every select call. */
    private final XPathExpression xPathExpression;

    private final Logger logger = Logger.getLogger(getClass());

    /**
     * Compiles the given XPath expression with the {@code fn} and {@code xslt} prefixes bound.
     *
     * @param xpathStr the XPath expression to evaluate on every select call
     * @throws IllegalArgumentException if the expression cannot be compiled
     */
    public Xpath2Selector(String xpathStr) {
        this.xpathStr = xpathStr;
        try {
            this.xPathExpression = compile(xpathStr);
        } catch (XPathExpressionException e) {
            throw new IllegalArgumentException("XPath error!", e);
        }
    }

    /**
     * Namespace context that binds the prefixes usable inside expressions:
     * {@code fn} and {@code xslt}.
     */
    enum XPath2NamespaceContext implements NamespaceContext {

        INSTANCE;

        private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();

        private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();

        /**
         * Registers a prefix/URI pair in both lookup directions.
         * Only called from the enum constructor, so the per-URI lists are never
         * modified after the singleton has been initialised.
         */
        private void put(String prefix, String namespaceURI) {
            prefix2NamespaceMap.put(prefix, namespaceURI);
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null) {
                prefixes = new ArrayList<String>();
                namespace2PrefixMap.put(namespaceURI, prefixes);
            }
            prefixes.add(prefix);
        }

        private XPath2NamespaceContext() {
            put("fn", NamespaceConstant.FN);
            put("xslt", NamespaceConstant.XSLT);
        }

        /**
         * @param prefix the prefix to look up
         * @return the bound namespace URI, or {@code null} when the prefix is not registered
         * @throws IllegalArgumentException if {@code prefix} is {@code null}
         */
        @Override
        public String getNamespaceURI(String prefix) {
            if (prefix == null) {
                // The backing ConcurrentHashMap would raise a bare NullPointerException here.
                throw new IllegalArgumentException("prefix must not be null");
            }
            // The null result for an unknown prefix is deliberately left unchanged.
            return prefix2NamespaceMap.get(prefix);
        }

        /**
         * @param namespaceURI the namespace URI to look up
         * @return the first prefix registered for the URI, or {@code null} when there is none
         * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
         */
        @Override
        public String getPrefix(String namespaceURI) {
            if (namespaceURI == null) {
                throw new IllegalArgumentException("namespaceURI must not be null");
            }
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null || prefixes.isEmpty()) {
                return null;
            }
            return prefixes.get(0);
        }

        /**
         * @param namespaceURI the namespace URI to look up
         * @return a read-only iterator over the prefixes registered for the URI;
         *         empty (never {@code null}) when there are none
         * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
         */
        @Override
        public Iterator<String> getPrefixes(String namespaceURI) {
            if (namespaceURI == null) {
                throw new IllegalArgumentException("namespaceURI must not be null");
            }
            List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
            if (prefixes == null) {
                return Collections.<String>emptyList().iterator();
            }
            // Wrap the list so Iterator.remove() cannot mutate the shared singleton state.
            return Collections.unmodifiableList(prefixes).iterator();
        }
    }

    /** Compiles the expression with an evaluator that uses the shared namespace context. */
    private static XPathExpression compile(String xpathStr) throws XPathExpressionException {
        XPathEvaluator xPathEvaluator = new XPathEvaluator();
        xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
        return xPathEvaluator.compile(xpathStr);
    }

    /**
     * Evaluates the expression against the given HTML and returns the first result.
     *
     * @param text the HTML text to select from
     * @return the first matched node serialized without an XML declaration, or the string form
     *         of a non-node result; {@code null} when no node matches or when any step fails
     *         (failures are logged, not rethrown)
     */
    @Override
    public String select(String text) {
        try {
            Object result = evaluate(parse(text));
            if (result instanceof NodeList) {
                NodeList nodeList = (NodeList) result;
                if (nodeList.getLength() == 0) {
                    return null;
                }
                return serialize(newTransformer(), nodeList, 0);
            }
            return result.toString();
        } catch (Exception e) {
            logger.error("select text error! " + xpathStr, e);
        }
        return null;
    }

    /**
     * Evaluates the expression against the given HTML and returns every result.
     *
     * @param text the HTML text to select from
     * @return one serialized entry per matched node, or a single entry holding the string form
     *         of a non-node result; never {@code null}. When a step fails the error is logged
     *         and whatever was collected so far is returned.
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            Object result = evaluate(parse(text));
            if (result instanceof NodeList) {
                NodeList nodeList = (NodeList) result;
                Transformer transformer = newTransformer();
                for (int i = 0; i < nodeList.getLength(); i++) {
                    results.add(serialize(transformer, nodeList, i));
                }
            } else {
                results.add(result.toString());
            }
        } catch (Exception e) {
            logger.error("select text error! " + xpathStr, e);
        }
        return results;
    }

    /**
     * Cleans the HTML text and converts it into a DOM document.
     * Declared with a broad throws clause because both callers log every failure the same way.
     */
    private static Document parse(String text) throws Exception {
        TagNode tagNode = new HtmlCleaner().clean(text);
        return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    }

    /**
     * Evaluates the compiled expression as a node set, retrying as a string when that fails.
     */
    private Object evaluate(Document document) throws XPathExpressionException {
        try {
            return xPathExpression.evaluate(document, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            // Presumably raised for expressions that yield an atomic value rather than nodes
            // (e.g. fn:substring-before(...)); retry asking for a string instead.
            return xPathExpression.evaluate(document, XPathConstants.STRING);
        }
    }

    /** Creates a transformer that omits the XML declaration from its output. */
    private static Transformer newTransformer() throws TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        return transformer;
    }

    /** Serializes the node at {@code index} of {@code nodes} to a string. */
    private static String serialize(Transformer transformer, NodeList nodes, int index) throws TransformerException {
        StringWriter writer = new StringWriter();
        transformer.transform(new DOMSource(nodes.item(index)), new StreamResult(writer));
        return writer.toString();
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
7c9e9ce8
...
...
@@ -12,6 +12,7 @@ import org.htmlcleaner.DomSerializer;
import
org.htmlcleaner.HtmlCleaner
;
import
org.htmlcleaner.TagNode
;
import
org.junit.Assert
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.w3c.dom.Document
;
import
org.w3c.dom.NodeList
;
...
...
@@ -1381,9 +1382,7 @@ public class XpathSelectorTest {
//http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
@Test
public
void
testSaxon
()
throws
XPathFactoryConfigurationException
{
System
.
setProperty
(
"javax.xml.xpath.XPathFactory:"
+
NamespaceConstant
.
OBJECT_MODEL_SAXON
,
"net.sf.saxon.xpath.XPathFactoryImpl"
);
System
.
setProperty
(
"javax.xml.xpath.XPathFactory:"
+
NamespaceConstant
.
FN
,
"net.sf.saxon.xpath.XPathFactoryImpl"
);
public
void
testSaxon
()
{
String
text
=
"<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n"
+
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>"
;
try
{
...
...
@@ -1406,10 +1405,44 @@ public class XpathSelectorTest {
}));
XPathExpression
expr
=
xPathEvaluator
.
compile
(
"fn:substring-before(//h1,'\n')"
);
Object
result
=
expr
.
evaluate
(
document
,
XPathConstants
.
STRING
);
System
.
out
.
println
(
result
);
Assert
.
assertNotNull
(
result
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
Xpath2Selector
xpath2Selector
=
new
Xpath2Selector
(
"fn:substring-before(//h1,'\n')"
);
String
select
=
xpath2Selector
.
select
(
text
);
Assert
.
assertNotNull
(
select
);
Assert
.
assertNotNull
(
xpath2Selector
.
selectList
(
text
));
}
@Test
public void testXpath2Selector() {
    // Smoke test: selecting "//a" from the shared html fixture must yield a non-null result.
    String firstAnchor = new Xpath2Selector("//a").select(html);
    Assert.assertNotNull(firstAnchor);
}
@Ignore("take long time")
@Test
public void performanceTest() {
    // Prints three elapsed times in milliseconds, each for 1000 selectList("//a") calls
    // on the shared html fixture: Xpath2Selector, then XpathSelector, then Xpath2Selector again.
    final int rounds = 1000;

    Xpath2Selector xpath2Selector = new Xpath2Selector("//a");
    long start = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        xpath2Selector.selectList(html);
    }
    long elapsed = System.currentTimeMillis() - start;
    System.out.println(elapsed);

    XpathSelector xpathSelector = new XpathSelector("//a");
    start = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        xpathSelector.selectList(html);
    }
    elapsed = System.currentTimeMillis() - start;
    System.out.println(elapsed);

    // Second pass over the XPath 2.0 selector.
    start = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        xpath2Selector.selectList(html);
    }
    elapsed = System.currentTimeMillis() - start;
    System.out.println(elapsed);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment