Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
17f8ead2
Commit
17f8ead2
authored
Aug 17, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update comments for selector
parent
77e6ca29
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
50 additions
and
39 deletions
+50
-39
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+4
-4
RegexResult.java
...main/java/us/codecraft/webmagic/selector/RegexResult.java
+3
-3
RegexSelector.java
...in/java/us/codecraft/webmagic/selector/RegexSelector.java
+11
-11
ReplaceSelector.java
.../java/us/codecraft/webmagic/selector/ReplaceSelector.java
+3
-3
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+3
-3
Selector.java
...rc/main/java/us/codecraft/webmagic/selector/Selector.java
+12
-3
SelectorFactory.java
.../java/us/codecraft/webmagic/selector/SelectorFactory.java
+4
-4
SmartContentSelector.java
.../us/codecraft/webmagic/selector/SmartContentSelector.java
+6
-4
XpathSelector.java
...in/java/us/codecraft/webmagic/selector/XpathSelector.java
+3
-3
package.html
...src/main/java/us/codecraft/webmagic/selector/package.html
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
17f8ead2
...
@@ -4,10 +4,10 @@ import java.util.ArrayList;
...
@@ -4,10 +4,10 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* 可抽取的html文本。<br>
* Selectable html text.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午7:54
*/
*/
public
class
Html
extends
PlainText
{
public
class
Html
extends
PlainText
{
...
@@ -66,7 +66,7 @@ public class Html extends PlainText {
...
@@ -66,7 +66,7 @@ public class Html extends PlainText {
@Override
@Override
public
Selectable
$
(
String
selector
)
{
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
new
CssSelector
(
selector
);
CssSelector
cssSelector
=
new
CssSelector
(
selector
);
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
View file @
17f8ead2
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
/**
/**
* 封装正则表达式抽取接口的类。<br>
* Object containing regex results.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午7:39
*/
*/
class
RegexResult
{
class
RegexResult
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
View file @
17f8ead2
...
@@ -9,10 +9,10 @@ import java.util.regex.Pattern;
...
@@ -9,10 +9,10 @@ import java.util.regex.Pattern;
import
java.util.regex.PatternSyntaxException
;
import
java.util.regex.PatternSyntaxException
;
/**
/**
* 正则表达式抽取器。<br>
* Selector in regex.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午7:09
*/
*/
public
class
RegexSelector
implements
Selector
{
public
class
RegexSelector
implements
Selector
{
...
@@ -21,18 +21,18 @@ public class RegexSelector implements Selector {
...
@@ -21,18 +21,18 @@ public class RegexSelector implements Selector {
private
Pattern
regex
;
private
Pattern
regex
;
public
RegexSelector
(
String
regexStr
)
{
public
RegexSelector
(
String
regexStr
)
{
if
(
StringUtils
.
isBlank
(
regexStr
)){
if
(
StringUtils
.
isBlank
(
regexStr
))
{
throw
new
IllegalArgumentException
(
"regex must not be empty"
);
throw
new
IllegalArgumentException
(
"regex must not be empty"
);
}
}
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)&&!
StringUtils
.
contains
(
regexStr
,
")"
))
{
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)
&&
!
StringUtils
.
contains
(
regexStr
,
")"
))
{
regexStr
=
"("
+
regexStr
+
")"
;
regexStr
=
"("
+
regexStr
+
")"
;
}
}
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)||!
StringUtils
.
contains
(
regexStr
,
")"
))
{
if
(!
StringUtils
.
contains
(
regexStr
,
"("
)
||
!
StringUtils
.
contains
(
regexStr
,
")"
))
{
throw
new
IllegalArgumentException
(
"regex must have capture group 1"
);
throw
new
IllegalArgumentException
(
"regex must have capture group 1"
);
}
}
this
.
regexStr
=
regexStr
;
this
.
regexStr
=
regexStr
;
try
{
try
{
regex
=
Pattern
.
compile
(
regexStr
,
Pattern
.
DOTALL
|
Pattern
.
CASE_INSENSITIVE
);
regex
=
Pattern
.
compile
(
regexStr
,
Pattern
.
DOTALL
|
Pattern
.
CASE_INSENSITIVE
);
}
catch
(
PatternSyntaxException
e
)
{
}
catch
(
PatternSyntaxException
e
)
{
throw
new
IllegalArgumentException
(
"invalid regex"
,
e
);
throw
new
IllegalArgumentException
(
"invalid regex"
,
e
);
}
}
...
@@ -45,7 +45,7 @@ public class RegexSelector implements Selector {
...
@@ -45,7 +45,7 @@ public class RegexSelector implements Selector {
@Override
@Override
public
List
<
String
>
selectList
(
String
text
)
{
public
List
<
String
>
selectList
(
String
text
)
{
List
<
String
>
strings
=
new
ArrayList
<
String
>();
List
<
String
>
strings
=
new
ArrayList
<
String
>();
List
<
RegexResult
>
results
=
selectGroupList
(
text
);
List
<
RegexResult
>
results
=
selectGroupList
(
text
);
for
(
RegexResult
result
:
results
)
{
for
(
RegexResult
result
:
results
)
{
strings
.
add
(
result
.
get
(
1
));
strings
.
add
(
result
.
get
(
1
));
...
@@ -56,7 +56,7 @@ public class RegexSelector implements Selector {
...
@@ -56,7 +56,7 @@ public class RegexSelector implements Selector {
public
RegexResult
selectGroup
(
String
text
)
{
public
RegexResult
selectGroup
(
String
text
)
{
Matcher
matcher
=
regex
.
matcher
(
text
);
Matcher
matcher
=
regex
.
matcher
(
text
);
if
(
matcher
.
find
())
{
if
(
matcher
.
find
())
{
String
[]
groups
=
new
String
[
matcher
.
groupCount
()
+
1
];
String
[]
groups
=
new
String
[
matcher
.
groupCount
()
+
1
];
for
(
int
i
=
0
;
i
<
groups
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
groups
.
length
;
i
++)
{
groups
[
i
]
=
matcher
.
group
(
i
);
groups
[
i
]
=
matcher
.
group
(
i
);
}
}
...
@@ -69,7 +69,7 @@ public class RegexSelector implements Selector {
...
@@ -69,7 +69,7 @@ public class RegexSelector implements Selector {
Matcher
matcher
=
regex
.
matcher
(
text
);
Matcher
matcher
=
regex
.
matcher
(
text
);
List
<
RegexResult
>
resultList
=
new
ArrayList
<
RegexResult
>();
List
<
RegexResult
>
resultList
=
new
ArrayList
<
RegexResult
>();
while
(
matcher
.
find
())
{
while
(
matcher
.
find
())
{
String
[]
groups
=
new
String
[
matcher
.
groupCount
()
+
1
];
String
[]
groups
=
new
String
[
matcher
.
groupCount
()
+
1
];
for
(
int
i
=
0
;
i
<
groups
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
groups
.
length
;
i
++)
{
groups
[
i
]
=
matcher
.
group
(
i
);
groups
[
i
]
=
matcher
.
group
(
i
);
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
View file @
17f8ead2
...
@@ -6,10 +6,10 @@ import java.util.regex.Pattern;
...
@@ -6,10 +6,10 @@ import java.util.regex.Pattern;
import
java.util.regex.PatternSyntaxException
;
import
java.util.regex.PatternSyntaxException
;
/**
/**
* 对文本进行替换。<br>
* Replace selector.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午7:09
*/
*/
public
class
ReplaceSelector
implements
Selector
{
public
class
ReplaceSelector
implements
Selector
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
17f8ead2
...
@@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
...
@@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
import
java.util.List
;
import
java.util.List
;
/**
/**
* 可进行抽取的文本。<br>
* Selectable text.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-20
* @since 0.1.0
* Time: 下午7:51
*/
*/
public
interface
Selectable
{
public
interface
Selectable
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
View file @
17f8ead2
...
@@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
...
@@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
import
java.util.List
;
import
java.util.List
;
/**
/**
*
抽取器。
<br>
*
Selector (extractor) for text.
<br>
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-20
* Time: 下午8:02
*/
*/
public
interface
Selector
{
public
interface
Selector
{
/**
* Extract single result in text.<br>
* If there is more than one result, only the first will be chosen.
* @param text
* @return result
*/
public
String
select
(
String
text
);
public
String
select
(
String
text
);
/**
* Extract all results in text.<br>
* @param text
* @return results
*/
public
List
<
String
>
selectList
(
String
text
);
public
List
<
String
>
selectList
(
String
text
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
View file @
17f8ead2
...
@@ -7,10 +7,10 @@ import java.util.Map;
...
@@ -7,10 +7,10 @@ import java.util.Map;
import
java.util.concurrent.ConcurrentHashMap
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
/**
* 产生selector的工厂。<br>
* Selector factory with some inner cache.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午7:56
*/
*/
public
class
SelectorFactory
{
public
class
SelectorFactory
{
...
@@ -34,7 +34,7 @@ public class SelectorFactory {
...
@@ -34,7 +34,7 @@ public class SelectorFactory {
return
newSelector
(
XpathSelector
.
class
,
xpath
);
return
newSelector
(
XpathSelector
.
class
,
xpath
);
}
}
public
SmartContentSelector
newSmartContentSelector
(){
public
SmartContentSelector
newSmartContentSelector
()
{
return
newSelector
(
SmartContentSelector
.
class
);
return
newSelector
(
SmartContentSelector
.
class
);
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
View file @
17f8ead2
...
@@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
...
@@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
org.htmlcleaner.HtmlCleaner
;
import
org.htmlcleaner.HtmlCleaner
;
import
org.htmlcleaner.TagNode
;
import
org.htmlcleaner.TagNode
;
import
us.codecraft.webmagic.utils.Experimental
;
import
java.util.*
;
import
java.util.*
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
/**
* readability算法,基础是找到所有p标签的父节点
* Extract the text content of html.<br>
* 写的比较乱,最终效果还在尝试中
* Uses the Readability algorithm: finds the parents of all p tags.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午4:42
*/
*/
@Experimental
public
class
SmartContentSelector
implements
Selector
{
public
class
SmartContentSelector
implements
Selector
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
View file @
17f8ead2
...
@@ -6,10 +6,10 @@ import java.util.ArrayList;
...
@@ -6,10 +6,10 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* xpath的选择器。包装了HtmlCleaner。<br>
* XPath selector based on HtmlCleaner.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午9:39
*/
*/
public
class
XpathSelector
implements
Selector
{
public
class
XpathSelector
implements
Selector
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html
View file @
17f8ead2
<html>
<html>
<body>
<body>
提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制
。
Selectors for page extraction. The core API is the interface Selectable, and the internal core is the interface Selector
.
</body>
</body>
</html>
</html>
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment