Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
c1471718
Commit
c1471718
authored
Aug 20, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
extractors
parent
20705b34
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
98 additions
and
9 deletions
+98
-9
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+2
-0
OrSelector.java
.../main/java/us/codecraft/webmagic/selector/OrSelector.java
+3
-3
RegexSelector.java
...in/java/us/codecraft/webmagic/selector/RegexSelector.java
+10
-3
SelectorFactory.java
.../java/us/codecraft/webmagic/selector/SelectorFactory.java
+5
-1
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+44
-0
ExtractorsTest.java
...t/java/us/codecraft/webmagic/selector/ExtractorsTest.java
+34
-0
RegexSelectorTest.java
...ava/us/codecraft/webmagic/selector/RegexSelectorTest.java
+0
-2
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
c1471718
...
...
@@ -43,6 +43,8 @@ public class CssSelector implements Selector {
private
String
getValue
(
Element
element
)
{
if
(
attrName
==
null
)
{
return
element
.
outerHtml
();
}
else
if
(
"innerHtml"
.
equalsIgnoreCase
(
attrName
))
{
return
element
.
html
();
}
else
{
return
element
.
attr
(
attrName
);
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
View file @
c1471718
...
...
@@ -26,9 +26,9 @@ public class OrSelector implements Selector {
@Override
public
String
select
(
String
text
)
{
for
(
Selector
selector
:
selectors
)
{
tex
t
=
selector
.
select
(
text
);
if
(
tex
t
!=
null
)
{
return
tex
t
;
String
resul
t
=
selector
.
select
(
text
);
if
(
resul
t
!=
null
)
{
return
resul
t
;
}
}
return
null
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
View file @
c1471718
...
...
@@ -20,7 +20,9 @@ public class RegexSelector implements Selector {
private
Pattern
regex
;
public
RegexSelector
(
String
regexStr
)
{
private
int
group
=
1
;
public
RegexSelector
(
String
regexStr
,
int
group
)
{
if
(
StringUtils
.
isBlank
(
regexStr
))
{
throw
new
IllegalArgumentException
(
"regex must not be empty"
);
}
...
...
@@ -36,11 +38,16 @@ public class RegexSelector implements Selector {
}
catch
(
PatternSyntaxException
e
)
{
throw
new
IllegalArgumentException
(
"invalid regex"
,
e
);
}
this
.
group
=
group
;
}
public
RegexSelector
(
String
regexStr
)
{
this
(
regexStr
,
1
);
}
@Override
public
String
select
(
String
text
)
{
return
selectGroup
(
text
).
get
(
1
);
return
selectGroup
(
text
).
get
(
group
);
}
@Override
...
...
@@ -48,7 +55,7 @@ public class RegexSelector implements Selector {
List
<
String
>
strings
=
new
ArrayList
<
String
>();
List
<
RegexResult
>
results
=
selectGroupList
(
text
);
for
(
RegexResult
result
:
results
)
{
strings
.
add
(
result
.
get
(
1
));
strings
.
add
(
result
.
get
(
group
));
}
return
strings
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
View file @
c1471718
...
...
@@ -27,7 +27,11 @@ public class SelectorFactory {
}
public
RegexSelector
newRegexSelector
(
String
regex
,
int
group
)
{
return
newSelector
(
RegexSelector
.
class
,
regex
,
String
.
valueOf
(
group
));
String
cacheKey
=
getCacheKey
(
RegexSelector
.
class
,
regex
,
String
.
valueOf
(
group
));
if
(
innerCache
.
get
(
cacheKey
)
!=
null
)
{
return
(
RegexSelector
)
innerCache
.
get
(
cacheKey
);
}
return
new
RegexSelector
(
regex
,
group
);
}
public
ReplaceSelector
newReplaceSelector
(
String
regex
,
String
replacement
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
0 → 100644
View file @
c1471718
package
us
.
codecraft
.
webmagic
.
selector
;
/**
 * Static shortcuts for creating the selectors of this package.<br>
 * The methods are meant to be pulled in with a static import so that
 * extraction rules can be written as compact, nested expressions.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public abstract class Selectors {

    /**
     * Creates a regex selector for the given expression.
     *
     * @param expr the regular expression
     * @return the selector supplied by {@link SelectorFactory#newRegexSelector(String)}
     */
    public static RegexSelector regex(String expr) {
        RegexSelector selector = SelectorFactory.getInstatnce().newRegexSelector(expr);
        return selector;
    }

    /**
     * Creates a regex selector that extracts a specific capture group.
     *
     * @param expr  the regular expression
     * @param group index of the capture group to extract
     * @return the selector supplied by {@link SelectorFactory#newRegexSelector(String, int)}
     */
    public static RegexSelector regex(String expr, int group) {
        RegexSelector selector = SelectorFactory.getInstatnce().newRegexSelector(expr, group);
        return selector;
    }

    /**
     * Creates a CSS selector for the given expression.
     *
     * @param expr the CSS query
     * @return a new {@link CssSelector}
     */
    public static CssSelector $(String expr) {
        CssSelector selector = new CssSelector(expr);
        return selector;
    }

    /**
     * Creates a CSS selector that reads a named attribute of the matched element.
     *
     * @param expr     the CSS query
     * @param attrName name of the attribute to extract
     * @return a new {@link CssSelector}
     */
    public static CssSelector $(String expr, String attrName) {
        CssSelector selector = new CssSelector(expr, attrName);
        return selector;
    }

    /**
     * Creates an XPath selector for the given expression.
     *
     * @param expr the XPath expression
     * @return the selector supplied by {@link SelectorFactory#newXpathSelector(String)}
     */
    public static XpathSelector xpath(String expr) {
        XpathSelector selector = SelectorFactory.getInstatnce().newXpathSelector(expr);
        return selector;
    }

    /**
     * Combines the given selectors into an {@link AndSelector}.
     *
     * @param selectors the selectors to combine
     * @return a new {@link AndSelector} wrapping {@code selectors}
     */
    public static AndSelector and(Selector... selectors) {
        AndSelector combined = new AndSelector(selectors);
        return combined;
    }

    /**
     * Combines the given selectors into an {@link OrSelector}.
     *
     * @param selectors the selectors to combine
     * @return a new {@link OrSelector} wrapping {@code selectors}
     */
    public static OrSelector or(Selector... selectors) {
        OrSelector combined = new OrSelector(selectors);
        return combined;
    }

    /**
     * Ad-hoc smoke run: builds an or-combination of a regex, an XPath and a CSS
     * selector and applies it to the string {@code "a"}. The result is discarded.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String input = "a";
        OrSelector titleSelector = or(regex("<title>(.*)</title>"), xpath("//title"), $("title"));
        titleSelector.select(input);
    }
}
\ No newline at end of file
webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java
0 → 100644
View file @
c1471718
package
us
.
codecraft
.
webmagic
.
selector
;
import
junit.framework.Assert
;
import
org.junit.Test
;
import
static
us
.
codecraft
.
webmagic
.
selector
.
Selectors
.*;
/**
 * Exercises the static helpers of {@link Selectors}, first one by one and then combined.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ExtractorsTest {

    String html = "<div><h1>test<a href=\"xxx\">aabbcc</a></h1></div>";

    String html2 = "<title>aabbcc</title>";

    /**
     * Checks each selector kind on its own against the anchor fixture.
     */
    @Test
    public void testEach() {
        // CSS selector without an attribute name.
        String anchorMarkup = $("div h1 a").select(html);
        Assert.assertEquals("<a href=\"xxx\">aabbcc</a>", anchorMarkup);

        // CSS selector reading a regular attribute.
        String hrefByCss = $("div h1 a", "href").select(html);
        Assert.assertEquals("xxx", hrefByCss);

        // CSS selector with the special "innerHtml" attribute name.
        String anchorContent = $("div h1 a", "innerHtml").select(html);
        Assert.assertEquals("aabbcc", anchorContent);

        // XPath attribute lookup.
        String hrefByXpath = xpath("//a/@href").select(html);
        Assert.assertEquals("xxx", hrefByXpath);

        // Regex without an explicit group index.
        String hrefByRegex = regex("a href=\"(.*)\"").select(html);
        Assert.assertEquals("xxx", hrefByRegex);

        // Regex with an explicit group index.
        String hrefBySecondGroup = regex("(a href)=\"(.*)\"", 2).select(html);
        Assert.assertEquals("xxx", hrefBySecondGroup);
    }

    /**
     * Checks the and/or combinators against both fixtures.
     */
    @Test
    public void testCombo() {
        String chained = and($("title"), regex("aa(bb)cc")).select(html2);
        Assert.assertEquals("bb", chained);

        // One combinator instance is applied to both fixtures.
        OrSelector either = or($("div h1 a", "innerHtml"), xpath("//title"));

        String fromAnchorFixture = either.select(html);
        Assert.assertEquals("aabbcc", fromAnchorFixture);

        String fromTitleFixture = either.select(html2);
        Assert.assertEquals("aabbcc", fromTitleFixture);
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
View file @
c1471718
...
...
@@ -5,8 +5,6 @@ import org.junit.Test;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午7:13
*/
public
class
RegexSelectorTest
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment