Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
2c357453
Commit
2c357453
authored
Sep 02, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor in selectors
parent
85b7cf15
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
35 additions
and
248 deletions
+35
-248
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+26
-16
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+1
-13
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+0
-14
SelectorFactory.java
.../java/us/codecraft/webmagic/selector/SelectorFactory.java
+0
-91
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+8
-12
TextContentSelector.java
...a/us/codecraft/webmagic/selector/TextContentSelector.java
+0
-68
TextContentSelectorTest.java
.../codecraft/webmagic/selector/TextContentSelectorTest.java
+0
-34
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
2c357453
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -11,12 +14,23 @@ import java.util.List;
*/
public
class
Html
extends
PlainText
{
/**
* Store parsed document for better performance when only one text exist.
*/
private
Document
document
;
public
Html
(
List
<
String
>
strings
)
{
super
(
strings
);
}
public
Html
(
String
text
)
{
super
(
text
);
this
.
document
=
Jsoup
.
parse
(
text
);
}
public
Html
(
Document
document
)
{
super
(
document
.
html
());
this
.
document
=
document
;
}
public
static
Html
create
(
String
text
)
{
...
...
@@ -53,38 +67,34 @@ public class Html extends PlainText {
@Override
public
Selectable
links
()
{
XsoupSelector
xpathSelector
=
new
XsoupSelector
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
return
xpath
(
"//a/@href"
);
}
@Override
public
Selectable
xpath
(
String
xpath
)
{
XsoupSelector
xpathSelector
=
new
XsoupSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
document
!=
null
){
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
}
return
selectList
(
xsoupSelector
,
strings
);
}
@Override
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
){
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
}
@Override
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
return
selectList
(
cssSelector
,
strings
);
}
@Override
public
Selectable
text
()
{
TextContentSelector
selector
=
Selectors
.
text
();
return
select
(
selector
,
strings
);
if
(
document
!=
null
){
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
@Override
public
Selectable
text
(
String
newlineSeparator
)
{
TextContentSelector
selector
=
Selectors
.
text
(
newlineSeparator
);
return
select
(
selector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
2c357453
...
...
@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
@Override
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
ReplaceSelector
replaceSelector
=
SelectorFactory
.
getInstatnce
().
newReplaceSelector
(
regex
,
replacement
);
ReplaceSelector
replaceSelector
=
new
ReplaceSelector
(
regex
,
replacement
);
return
select
(
replaceSelector
,
strings
);
}
...
...
@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
}
}
@Override
public
Selectable
text
()
{
//do nothing
return
this
;
}
@Override
public
Selectable
text
(
String
newlineSeparator
)
{
//do nothing
return
this
;
}
@Override
public
boolean
match
()
{
return
strings
!=
null
&&
strings
.
size
()
>
0
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
2c357453
...
...
@@ -82,20 +82,6 @@ public interface Selectable {
*/
public
String
toString
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
(
String
newlineSeparator
);
/**
* if result exist for select
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
deleted
100644 → 0
View file @
85b7cf15
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.lang3.StringUtils
;
import
java.lang.reflect.Constructor
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Selector factory with some inner cache.<br>
 * Selectors are created reflectively from their {@code String} constructor
 * arguments; the caching entry points keep one instance per
 * (selector class, arguments) combination.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SelectorFactory {

    /** Largest number of {@code String} constructor arguments {@link #newSelector} supports. */
    private static final int MAX_CONSTRUCTOR_ARGS = 2;

    private static final SelectorFactory INSTANCE = new SelectorFactory();

    /** Selectors built so far, keyed by {@link #getCacheKey(Class, String...)}. */
    private final Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();

    /**
     * Returns the shared factory instance.
     *
     * @return the singleton factory
     */
    public static SelectorFactory getInstance() {
        return INSTANCE;
    }

    /**
     * Returns the shared factory instance. The misspelled name is kept because
     * existing callers use it; it is equivalent to {@link #getInstance()}.
     *
     * @return the singleton factory
     */
    public static SelectorFactory getInstatnce() {
        return INSTANCE;
    }

    /**
     * Creates a new, uncached {@link RegexSelector}.
     *
     * @param regex regular expression passed to the selector's constructor
     * @return a new selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public RegexSelector newRegexSelector(String regex) {
        return newSelector(RegexSelector.class, regex);
    }

    /**
     * Returns a {@link RegexSelector} for the given expression and group,
     * reusing a previously created instance when one is cached.
     * NOTE(review): sharing instances assumes RegexSelector is safe to reuse
     * across callers - confirm against its implementation.
     *
     * @param regex regular expression passed to the selector's constructor
     * @param group capture group passed to the selector's constructor
     * @return a cached or newly created selector
     */
    public RegexSelector newRegexSelector(String regex, int group) {
        String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return (RegexSelector) cached;
        }
        RegexSelector selector = new RegexSelector(regex, group);
        // The original looked the key up but never stored the result, so the cache never hit.
        innerCache.put(cacheKey, selector);
        return selector;
    }

    /**
     * Creates a new, uncached {@link ReplaceSelector}.
     *
     * @param regex       first constructor argument
     * @param replacement second constructor argument
     * @return a new selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public ReplaceSelector newReplaceSelector(String regex, String replacement) {
        return newSelector(ReplaceSelector.class, regex, replacement);
    }

    /**
     * Creates a new, uncached {@link XpathSelector}.
     *
     * @param xpath expression passed to the selector's constructor
     * @return a new selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public XpathSelector newXpathSelector(String xpath) {
        return newSelector(XpathSelector.class, xpath);
    }

    /**
     * Creates a new, uncached {@link SmartContentSelector} via its no-arg constructor.
     *
     * @return a new selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public SmartContentSelector newSmartContentSelector() {
        return newSelector(SmartContentSelector.class);
    }

    /**
     * Returns the cached selector for {@code clazz} and {@code param}, creating
     * and caching it on first use.
     *
     * @param clazz selector type to create
     * @param param constructor arguments (zero to two strings)
     * @param <T>   selector type
     * @return a cached or newly created selector
     * @throws IllegalArgumentException if {@code clazz} is null or the selector
     *                                  cannot be instantiated
     */
    public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
        if (clazz == null) {
            throw new IllegalArgumentException("clazz must not be null");
        }
        // Key on the requested class; keying on RegexSelector.class made every
        // selector type share (and mis-cast) the same cache entries.
        String cacheKey = getCacheKey(clazz, param);
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return clazz.cast(cached);
        }
        T selector = newSelector(clazz, param);
        if (selector != null) {
            innerCache.put(cacheKey, selector);
        }
        return selector;
    }

    /**
     * Reflectively instantiates {@code clazz} through the public constructor
     * that takes {@code param.length} {@code String} arguments.
     *
     * @param clazz selector type to create
     * @param param constructor arguments (zero to two strings)
     * @param <T>   selector type
     * @return a new selector
     * @throws IllegalArgumentException on any failure, including more than two
     *                                  arguments, a missing constructor, or an
     *                                  exception thrown by the constructor; the
     *                                  original failure is attached as the cause
     */
    public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
        try {
            if (param.length > MAX_CONSTRUCTOR_ARGS) {
                // Thrown inside the try on purpose: callers have always seen this
                // surface as an IllegalArgumentException with this as its cause.
                throw new UnsupportedOperationException(
                        "unsupported number of constructor arguments: " + param.length);
            }
            Class<?>[] parameterTypes = new Class<?>[param.length];
            for (int i = 0; i < parameterTypes.length; i++) {
                parameterTypes[i] = String.class;
            }
            Constructor<T> constructor = clazz.getConstructor(parameterTypes);
            return constructor.newInstance((Object[]) param);
        } catch (Exception e) {
            throw new IllegalArgumentException("init object error", e);
        }
    }

    /**
     * Builds the cache key for a selector class and its constructor arguments.
     * Each argument is length-prefixed so that argument lists such as
     * {@code ("a_1")} and {@code ("a", "1")} cannot produce the same key.
     *
     * @param clazz selector type
     * @param param constructor arguments; may be null or contain nulls
     * @return the cache key
     */
    private String getCacheKey(Class<?> clazz, String... param) {
        StringBuilder key = new StringBuilder(clazz.getName());
        if (param != null) {
            for (String value : param) {
                if (value == null) {
                    key.append("_-1:");
                } else {
                    key.append('_').append(value.length()).append(':').append(value);
                }
            }
        }
        return key.toString();
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
View file @
2c357453
...
...
@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
public
abstract
class
Selectors
{
public
static
RegexSelector
regex
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
new
RegexSelector
(
expr
);
return
new
RegexSelector
(
expr
);
}
public
static
RegexSelector
regex
(
String
expr
,
int
group
)
{
return
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
expr
,
group
);
return
new
RegexSelector
(
expr
,
group
);
}
public
static
SmartContentSelector
smartContent
()
{
return
SelectorFactory
.
getInstatnce
().
new
SmartContentSelector
();
return
new
SmartContentSelector
();
}
public
static
CssSelector
$
(
String
expr
)
{
...
...
@@ -29,7 +29,11 @@ public abstract class Selectors {
}
public
static
XpathSelector
xpath
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
expr
);
return
new
XpathSelector
(
expr
);
}
public
static
XsoupSelector
xsoup
(
String
expr
)
{
return
new
XsoupSelector
(
expr
);
}
public
static
AndSelector
and
(
Selector
...
selectors
)
{
...
...
@@ -40,14 +44,6 @@ public abstract class Selectors {
return
new
OrSelector
(
selectors
);
}
public
static
TextContentSelector
text
()
{
return
new
TextContentSelector
();
}
public
static
TextContentSelector
text
(
String
newlineSeperator
)
{
return
new
TextContentSelector
(
newlineSeperator
);
}
public
static
void
main
(
String
[]
args
)
{
String
s
=
"a"
;
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java
deleted
100644 → 0
View file @
85b7cf15
package
us
.
codecraft
.
webmagic
.
selector
;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Extract text content in html.<br>
 * Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
 * The document is walked node by node: text nodes are appended in document
 * order, ignored tags contribute nothing, and a separator is appended after
 * each block-level tag.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelector implements Selector {

    /** Tags after whose content the newline separator is appended. */
    private final static Set<String> TAGS_IN_NEWLINE = new HashSet<String>(
            Arrays.asList("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"));

    /** Tags whose whole subtree is excluded from the extracted text. */
    private final static Set<String> TAGS_TO_IGNORE = new HashSet<String>(
            Arrays.asList("head", "style", "script", "noscript", "option"));

    /** Separator appended after every tag listed in {@link #TAGS_IN_NEWLINE}. */
    private final String newLineSeperator;

    /**
     * Creates a selector that separates block-level tags with {@code "\n"}.
     */
    public TextContentSelector() {
        this("\n");
    }

    /**
     * Creates a selector with a custom separator.
     *
     * @param newLineSeperator text appended after each block-level tag
     */
    public TextContentSelector(String newLineSeperator) {
        this.newLineSeperator = newLineSeperator;
    }

    /**
     * Parses {@code text} as HTML and returns its text content.
     *
     * @param text html source
     * @return extracted text
     */
    @Override
    public String select(String text) {
        Document doc = Jsoup.parse(text);
        return select0(doc);
    }

    /**
     * Recursively collects the text below {@code element}.
     *
     * @param element root of the subtree to extract
     * @return the subtree's text, or an empty string when the element's tag is
     *         in {@link #TAGS_TO_IGNORE}
     */
    protected String select0(Element element) {
        String tagName = element.tagName().toLowerCase();
        if (TAGS_TO_IGNORE.contains(tagName)) {
            return "";
        }
        StringBuilder textBuilder = new StringBuilder();
        // Walk direct child nodes instead of appending element.text(): text()
        // already contains every descendant's text, so combining it with the
        // recursion duplicated content and leaked text from ignored tags.
        for (Node node : element.childNodes()) {
            if (node instanceof TextNode) {
                textBuilder.append(((TextNode) node).text());
            } else if (node instanceof Element) {
                textBuilder.append(select0((Element) node));
            }
        }
        if (TAGS_IN_NEWLINE.contains(tagName)) {
            textBuilder.append(newLineSeperator);
        }
        return textBuilder.toString();
    }

    /**
     * Not supported by this selector.
     *
     * @param text html source
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<String> selectList(String text) {
        throw new UnsupportedOperationException();
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
deleted
100644 → 0
View file @
85b7cf15
package
us
.
codecraft
.
webmagic
.
selector
;
import
junit.framework.Assert
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
/**
 * Tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Extracts text from a small nested fragment and checks that the paragraph
     * text is present, the custom separator is emitted, and no markup leaks.
     */
    @Test
    public void test() {
        String html = "<div class=\"edit-comment-hide\">\n" +
                " <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n" +
                " <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n" +
                " </div>\n" +
                " </div>";
        TextContentSelector textContentSelector = new TextContentSelector("<br>");
        String text = textContentSelector.select(html);
        Assert.assertNotNull(text);
        // A bare not-null check can never fail here, so pin the actual content.
        Assert.assertTrue("paragraph text should be extracted",
                text.contains("Add more powerful selector for content text extract refered to"));
        Assert.assertTrue("block-level tags should be followed by the configured separator",
                text.contains("<br>"));
        Assert.assertFalse("html markup should not appear in the extracted text",
                text.contains("<div"));
    }

    /**
     * Downloads a live page and extracts its text; ignored by default because
     * it depends on the network.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
                .smartContent().text().toString();
        Assert.assertNotNull(s);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment