Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
5e9e8b25
Commit
5e9e8b25
authored
Aug 25, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add TextContentSelector
parent
0cc0ccee
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
171 additions
and
7 deletions
+171
-7
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+17
-5
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+19
-2
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+21
-0
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+12
-0
TextContentSelector.java
...a/us/codecraft/webmagic/selector/TextContentSelector.java
+68
-0
TextContentSelectorTest.java
.../codecraft/webmagic/selector/TextContentSelectorTest.java
+34
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
5e9e8b25
...
...
@@ -47,32 +47,44 @@ public class Html extends PlainText {
@Override
public
Selectable
smartContent
()
{
SmartContentSelector
smartContentSelector
=
Selector
Factory
.
getInstatnce
().
newSmartContentSelector
();
SmartContentSelector
smartContentSelector
=
Selector
s
.
smartContent
();
return
select
(
smartContentSelector
,
strings
);
}
@Override
public
Selectable
links
()
{
XpathSelector
xpathSelector
=
Selector
Factory
.
getInstatnce
().
newXpathSelector
(
"//a/@href"
);
XpathSelector
xpathSelector
=
Selector
s
.
xpath
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
}
@Override
public
Selectable
xpath
(
String
xpath
)
{
XpathSelector
xpathSelector
=
Selector
Factory
.
getInstatnce
().
newXpathSelector
(
xpath
);
XpathSelector
xpathSelector
=
Selector
s
.
xpath
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
/**
 * Select by a css selector expression.
 *
 * @param selector the css selector expression
 * @return the result of applying the css selector to the current strings
 */
@Override
public Selectable $(String selector) {
    // Single declaration only: build the css selector through the Selectors facade.
    CssSelector cssSelector = Selectors.$(selector);
    return selectList(cssSelector, strings);
}
/**
 * Select by a css selector expression together with an attribute name.
 *
 * @param selector the css selector expression
 * @param attrName the attribute name handed to the css selector
 * @return the result of applying the css selector to the current strings
 */
@Override
public Selectable $(String selector, String attrName) {
    // Single declaration only: build the css selector through the Selectors facade.
    CssSelector cssSelector = Selectors.$(selector, attrName);
    return selectList(cssSelector, strings);
}
/**
 * Select the text content of the html with the default text-content selector.
 *
 * @return the result of applying {@link Selectors#text()} to the current strings
 */
@Override
public Selectable text() {
    final TextContentSelector textSelector = Selectors.text();
    return select(textSelector, strings);
}
/**
 * Select the text content of the html with a caller-supplied separator.
 *
 * @param newlineSeparator separator forwarded to {@link Selectors#text(String)}
 * @return the result of applying that text-content selector to the current strings
 */
@Override
public Selectable text(String newlineSeparator) {
    final TextContentSelector textSelector = Selectors.text(newlineSeparator);
    return select(textSelector, strings);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
5e9e8b25
...
...
@@ -57,13 +57,13 @@ public class PlainText implements Selectable {
@Override
public
Selectable
regex
(
String
regex
)
{
RegexSelector
regexSelector
=
Selector
Factory
.
getInstatnce
().
newRegexSelector
(
regex
);
RegexSelector
regexSelector
=
Selector
s
.
regex
(
regex
);
return
selectList
(
regexSelector
,
strings
);
}
@Override
public
Selectable
regex
(
String
regex
,
int
group
)
{
RegexSelector
regexSelector
=
Selector
Factory
.
getInstatnce
().
newRegexSelector
(
regex
,
group
);
RegexSelector
regexSelector
=
Selector
s
.
regex
(
regex
,
group
);
return
selectList
(
regexSelector
,
strings
);
}
...
...
@@ -106,4 +106,21 @@ public class PlainText implements Selectable {
return
null
;
}
}
/**
 * No-op for this class: no extraction is performed here.
 *
 * @return this instance, unchanged
 */
@Override
public Selectable text() {
    // Nothing to extract; hand back the same instance.
    return this;
}
/**
 * No-op for this class: the separator is not used here.
 *
 * @param newlineSeparator ignored by this implementation
 * @return this instance, unchanged
 */
@Override
public Selectable text(String newlineSeparator) {
    // Nothing to extract; hand back the same instance.
    return this;
}
/**
 * Tell whether this selectable holds any result.
 *
 * @return {@code true} when {@code strings} is non-null and has at least one element
 */
@Override
public boolean match() {
    if (strings == null) {
        return false;
    }
    return strings.size() > 0;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
5e9e8b25
...
...
@@ -82,6 +82,27 @@ public interface Selectable {
*/
public
String
toString
();
/**
 * Select the text content of the html.
 *
 * @return a Selectable holding the selected text
 */
public
Selectable
text
();
/**
 * Select the text content of the html, using a caller-supplied separator.
 *
 * @param newlineSeparator separator string handed to the implementation
 * @return a Selectable holding the selected text
 */
public
Selectable
text
(
String
newlineSeparator
);
/**
 * Tell whether the selection produced any result.
 *
 * @return true if a result exists
 */
public
boolean
match
();
/**
* multi string result
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
View file @
5e9e8b25
...
...
@@ -16,6 +16,10 @@ public abstract class Selectors {
return
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
expr
,
group
);
}
/**
 * Create a smart-content selector via the {@code SelectorFactory} instance.
 *
 * @return the selector returned by {@code newSmartContentSelector()}
 */
public static SmartContentSelector smartContent() {
    final SmartContentSelector selector = SelectorFactory.getInstatnce().newSmartContentSelector();
    return selector;
}
/**
 * Create a css selector for the given expression.
 *
 * @param expr the css selector expression
 * @return a new {@code CssSelector} built from {@code expr}
 */
public static CssSelector $(String expr) {
    final CssSelector selector = new CssSelector(expr);
    return selector;
}
...
...
@@ -36,6 +40,14 @@ public abstract class Selectors {
return
new
OrSelector
(
selectors
);
}
/**
 * Create a text-content selector using its no-argument constructor.
 *
 * @return a new {@code TextContentSelector}
 */
public static TextContentSelector text() {
    final TextContentSelector selector = new TextContentSelector();
    return selector;
}
/**
 * Create a text-content selector with a caller-supplied separator.
 *
 * @param newlineSeperator separator passed to the {@code TextContentSelector} constructor
 * @return a new {@code TextContentSelector}
 */
public static TextContentSelector text(String newlineSeperator) {
    final TextContentSelector selector = new TextContentSelector(newlineSeperator);
    return selector;
}
public
static
void
main
(
String
[]
args
)
{
String
s
=
"a"
;
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java
0 → 100644
View file @
5e9e8b25
package
us
.
codecraft
.
webmagic
.
selector
;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Extract text content in html.<br>
 * Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
 * The document tree is walked in order: text nodes are appended as they are met,
 * elements listed in {@code TAGS_TO_IGNORE} are dropped together with their content,
 * and the configured separator is appended after every element listed in
 * {@code TAGS_IN_NEWLINE}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelector implements Selector {

    /** Tags after which the separator is appended. */
    private final static Set<String> TAGS_IN_NEWLINE = new HashSet<String>(
            Arrays.asList("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"));

    /** Tags whose whole subtree is excluded from the extracted text. */
    private final static Set<String> TAGS_TO_IGNORE = new HashSet<String>(
            Arrays.asList("head", "style", "script", "noscript", "option"));

    /** Separator appended after each element in {@code TAGS_IN_NEWLINE}. */
    private final String newLineSeperator;

    /**
     * Create a selector that uses {@code "\n"} as separator.
     */
    public TextContentSelector() {
        this("\n");
    }

    /**
     * Create a selector with a custom separator.
     *
     * @param newLineSeperator string appended after each element in {@code TAGS_IN_NEWLINE}
     */
    public TextContentSelector(String newLineSeperator) {
        this.newLineSeperator = newLineSeperator;
    }

    /**
     * Parse the given html and extract its text content.
     *
     * @param text html source
     * @return extracted text
     */
    @Override
    public String select(String text) {
        Document doc = Jsoup.parse(text);
        return select0(doc);
    }

    /**
     * Recursively extract the text of one element.
     *
     * @param element element to extract from
     * @return text of the element's subtree, or an empty string for ignored tags
     */
    protected String select0(Element element) {
        String tagName = element.tagName().toLowerCase();
        if (TAGS_TO_IGNORE.contains(tagName)) {
            return "";
        }
        StringBuilder textBuilder = new StringBuilder();
        // Walk the direct child nodes instead of appending element.text():
        // element.text() already contains the text of all descendants, so combining
        // it with recursion duplicated every nested text and leaked the content of
        // ignored tags through their ancestors.
        for (Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                TextNode textNode = (TextNode) child;
                // Skip whitespace-only nodes that only come from source indentation.
                if (!textNode.isBlank()) {
                    textBuilder.append(textNode.text());
                }
            } else if (child instanceof Element) {
                textBuilder.append(select0((Element) child));
            }
        }
        if (TAGS_IN_NEWLINE.contains(tagName)) {
            textBuilder.append(newLineSeperator);
        }
        return textBuilder.toString();
    }

    /**
     * Not supported by this selector.
     *
     * @param text html source
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<String> selectList(String text) {
        throw new UnsupportedOperationException();
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
0 → 100644
View file @
5e9e8b25
package
us
.
codecraft
.
webmagic
.
selector
;
import
junit.framework.Assert
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
/**
 * Tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Extracts text from a small html fragment using {@code "<br>"} as separator.
     */
    @Test
    public void test() {
        String html = "<div class=\"edit-comment-hide\">\n" +
                " <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n" +
                " <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n" +
                " </div>\n" +
                " </div>";
        TextContentSelector textContentSelector = new TextContentSelector("<br>");
        String text = textContentSelector.select(html);
        Assert.assertNotNull(text);
        // The paragraph text must survive extraction.
        Assert.assertTrue(text.contains("Add more powerful selector"));
    }

    /**
     * Downloads a real page and runs smart-content plus text extraction on it.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        String s = new HttpClientDownloader()
                .download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
                .smartContent().text().toString();
        // Assert on the local result; the previous code referenced an undefined name.
        Assert.assertNotNull(s);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment