Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
f9825c21
Commit
f9825c21
authored
May 27, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor selectable for html fragment #113
parent
03d26c16
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
288 additions
and
92 deletions
+288
-92
AbstractSelectable.java
...va/us/codecraft/webmagic/selector/AbstractSelectable.java
+112
-0
BaseElementSelector.java
...a/us/codecraft/webmagic/selector/BaseElementSelector.java
+21
-0
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+16
-2
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+7
-0
HtmlFragment.java
...ain/java/us/codecraft/webmagic/selector/HtmlFragment.java
+7
-0
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+7
-90
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+6
-0
XpathSelector.java
...in/java/us/codecraft/webmagic/selector/XpathSelector.java
+16
-0
MamacnPageProcessor.java
...va/us/codecraft/webmagic/samples/MamacnPageProcessor.java
+46
-0
OneFilePipeline.java
.../codecraft/webmagic/samples/pipeline/OneFilePipeline.java
+50
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
0 → 100644
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.collections.CollectionUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
 * Base implementation of {@link Selectable} that keeps its content as a list of strings
 * and implements the string-based operations (regex, replace, get, all, match) on top of it.
 * Element-aware operations that are not defined here stay abstract for subclasses.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.2
 */
public abstract class AbstractSelectable implements Selectable {

    /** The strings this selectable wraps; may be {@code null} if a subclass passes {@code null}. */
    protected List<String> strings;

    /**
     * Creates a selectable that wraps a single string.
     *
     * @param text the only element of the wrapped list
     */
    public AbstractSelectable(String text) {
        List<String> results = new ArrayList<String>();
        results.add(text);
        this.strings = results;
    }

    /**
     * Creates a selectable that wraps the given list. The list is stored as-is, not copied.
     *
     * @param strings the list to wrap
     */
    public AbstractSelectable(List<String> strings) {
        this.strings = strings;
    }

    /**
     * Alias of {@code $(selector)}.
     *
     * @param selector the css selector expression
     * @return the result of {@code $(selector)}
     */
    @Override
    public Selectable css(String selector) {
        return $(selector);
    }

    /**
     * Alias of {@code $(selector, attrName)}.
     *
     * @param selector the css selector expression
     * @param attrName the attribute name handed to {@code $}
     * @return the result of {@code $(selector, attrName)}
     */
    @Override
    public Selectable css(String selector, String attrName) {
        return $(selector, attrName);
    }

    /**
     * Applies {@code selector.select} to every string and collects the non-null results.
     *
     * @param selector the selector to apply
     * @param strings  the inputs; {@code null} is treated as an empty list
     * @return a {@link PlainText} wrapping the collected results
     */
    protected Selectable select(Selector selector, List<String> strings) {
        List<String> results = new ArrayList<String>();
        // match() and get() already accept a null list, so selecting must not fail on it either.
        if (strings != null) {
            for (String string : strings) {
                String result = selector.select(string);
                if (result != null) {
                    results.add(result);
                }
            }
        }
        return new PlainText(results);
    }

    /**
     * Applies {@code selector.selectList} to every string and concatenates the results.
     *
     * @param selector the selector to apply
     * @param strings  the inputs; {@code null} is treated as an empty list
     * @return a {@link PlainText} wrapping the concatenated results
     */
    protected Selectable selectList(Selector selector, List<String> strings) {
        List<String> results = new ArrayList<String>();
        if (strings != null) {
            for (String string : strings) {
                List<String> result = selector.selectList(string);
                // A selector with no match may hand back null; addAll(null) would throw.
                if (result != null) {
                    results.addAll(result);
                }
            }
        }
        return new PlainText(results);
    }

    /**
     * @return the wrapped list itself (not a copy)
     */
    @Override
    public List<String> all() {
        return strings;
    }

    /**
     * Not supported by this base class.
     *
     * @param jsonPath ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public Selectable jsonPath(String jsonPath) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the first element of {@link #all()}, or {@code null} when it is null or empty
     */
    @Override
    public String get() {
        List<String> all = all();
        return CollectionUtils.isNotEmpty(all) ? all.get(0) : null;
    }

    /**
     * Applies the selector to the wrapped strings, keeping one result per string at most.
     *
     * @param selector the selector to apply
     * @return the selection result
     */
    @Override
    public Selectable select(Selector selector) {
        return select(selector, strings);
    }

    /**
     * Applies the selector to the wrapped strings, keeping every result of every string.
     *
     * @param selector the selector to apply
     * @return the selection result
     */
    @Override
    public Selectable selectList(Selector selector) {
        return selectList(selector, strings);
    }

    /**
     * Selects with the selector built by {@code Selectors.regex(regex)}.
     *
     * @param regex the regular expression
     * @return all results over all wrapped strings
     */
    @Override
    public Selectable regex(String regex) {
        RegexSelector regexSelector = Selectors.regex(regex);
        return selectList(regexSelector, strings);
    }

    /**
     * Selects with the selector built by {@code Selectors.regex(regex, group)}.
     *
     * @param regex the regular expression
     * @param group the group argument handed to {@code Selectors.regex}
     * @return all results over all wrapped strings
     */
    @Override
    public Selectable regex(String regex, int group) {
        RegexSelector regexSelector = Selectors.regex(regex, group);
        return selectList(regexSelector, strings);
    }

    /**
     * Selects with a {@link ReplaceSelector} built from the two arguments.
     *
     * @param regex       the regular expression handed to the {@link ReplaceSelector}
     * @param replacement the replacement handed to the {@link ReplaceSelector}
     * @return the non-null results, one per wrapped string at most
     */
    @Override
    public Selectable replace(String regex, String replacement) {
        ReplaceSelector replaceSelector = new ReplaceSelector(regex, replacement);
        return select(replaceSelector, strings);
    }

    /**
     * @return the same value as {@link #get()}, which may be {@code null}
     */
    @Override
    public String toString() {
        return get();
    }

    /**
     * @return {@code true} when the wrapped list is non-null and has at least one element
     */
    @Override
    public boolean match() {
        return strings != null && strings.size() > 0;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -28,4 +30,23 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
}
}
/**
 * Parses the text with Jsoup and selects a single element from the parsed document.
 *
 * @param text the markup to parse
 * @return the result of {@code selectElement(Element)} on the parsed document,
 *         or {@code null} when {@code text} is {@code null}
 */
public Element selectElement(String text) {
    if (text == null) {
        return null;
    }
    return selectElement(Jsoup.parse(text));
}
/**
 * Parses the text with Jsoup and selects elements from the parsed document.
 *
 * @param text the markup to parse
 * @return the result of {@code selectElements(Element)} on the parsed document,
 *         or an empty {@link Elements} when {@code text} is {@code null}
 */
public Elements selectElements(String text) {
    if (text == null) {
        return new Elements();
    }
    return selectElements(Jsoup.parse(text));
}
/**
 * Selects a single element starting from the given element.
 * The String overload of this class delegates here after parsing its text with Jsoup.
 *
 * @param element the element to select from
 * @return the selected element
 */
public
abstract
Element
selectElement
(
Element
element
);
/**
 * Selects elements starting from the given element.
 * The String overload of this class delegates here after parsing its text with Jsoup.
 *
 * @param element the element to select from
 * @return the selected elements
 */
public
abstract
Elements
selectElements
(
Element
element
);
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
f9825c21
...
...
@@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public
String
select
(
Element
element
)
{
Elements
elements
=
element
.
select
(
selectorTex
t
);
Elements
elements
=
selectElements
(
elemen
t
);
if
(
CollectionUtils
.
isEmpty
(
elements
))
{
return
null
;
}
...
...
@@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public
List
<
String
>
selectList
(
Element
doc
)
{
List
<
String
>
strings
=
new
ArrayList
<
String
>();
Elements
elements
=
doc
.
select
(
selectorText
);
Elements
elements
=
selectElements
(
doc
);
if
(
CollectionUtils
.
isNotEmpty
(
elements
))
{
for
(
Element
element
:
elements
)
{
String
value
=
getValue
(
element
);
...
...
@@ -78,4 +78,18 @@ public class CssSelector extends BaseElementSelector {
}
return
strings
;
}
/**
 * Runs {@code selectorText} against the element and keeps the first hit.
 *
 * @param element the element to select from
 * @return the first selected element, or {@code null} when nothing was selected
 */
@Override
public Element selectElement(Element element) {
    Elements matched = element.select(selectorText);
    if (CollectionUtils.isEmpty(matched)) {
        return null;
    }
    return matched.get(0);
}
/**
 * Runs {@code selectorText} against the element.
 *
 * @param element the element to select from
 * @return whatever {@code element.select(selectorText)} returns
 */
@Override
public Elements selectElements(Element element) {
    Elements matched = element.select(selectorText);
    return matched;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
f9825c21
...
...
@@ -142,6 +142,13 @@ public class Html extends PlainText {
return
document
.
html
();
}
/**
 * Returns this object as its own single node.
 *
 * @return a new mutable list whose only element is {@code this}
 */
@Override
public List<Selectable> nodes() {
    List<Selectable> self = new ArrayList<Selectable>();
    self.add(this);
    return self;
}
/**
* @param selector
* @return
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java
0 → 100644
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
selector
;
/**
 * Empty class: it declares no fields, constructors or methods yet.
 *
 * @author code4crafter@gmail.com
 */
public
class
HtmlFragment
{
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.collections.CollectionUtils
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -12,18 +10,14 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
public
class
PlainText
implements
Selectable
{
protected
List
<
String
>
strings
;
public
class
PlainText
extends
AbstractSelectable
{
public
PlainText
(
List
<
String
>
strings
)
{
this
.
strings
=
strings
;
super
(
strings
)
;
}
public
PlainText
(
String
text
)
{
List
<
String
>
results
=
new
ArrayList
<
String
>();
results
.
add
(
text
);
this
.
strings
=
results
;
super
(
text
);
}
public
static
PlainText
create
(
String
text
)
{
...
...
@@ -45,16 +39,6 @@ public class PlainText implements Selectable {
throw
new
UnsupportedOperationException
();
}
@Override
public
Selectable
css
(
String
selector
)
{
return
$
(
selector
);
}
@Override
public
Selectable
css
(
String
selector
,
String
attrName
)
{
return
$
(
selector
,
attrName
);
}
@Override
public
Selectable
smartContent
()
{
throw
new
UnsupportedOperationException
();
...
...
@@ -66,79 +50,12 @@ public class PlainText implements Selectable {
}
@Override
public
Selectable
regex
(
String
regex
)
{
RegexSelector
regexSelector
=
Selectors
.
regex
(
regex
);
return
selectList
(
regexSelector
,
strings
);
}
@Override
public
Selectable
regex
(
String
regex
,
int
group
)
{
RegexSelector
regexSelector
=
Selectors
.
regex
(
regex
,
group
);
return
selectList
(
regexSelector
,
strings
);
}
protected
Selectable
select
(
Selector
selector
,
List
<
String
>
strings
)
{
List
<
String
>
results
=
new
ArrayList
<
String
>();
public
List
<
Selectable
>
nodes
()
{
List
<
Selectable
>
nodes
=
new
ArrayList
<
Selectable
>(
strings
.
size
());
for
(
String
string
:
strings
)
{
String
result
=
selector
.
select
(
string
);
if
(
result
!=
null
)
{
results
.
add
(
result
);
}
nodes
.
add
(
PlainText
.
create
(
string
));
}
return
n
ew
PlainText
(
results
)
;
return
n
odes
;
}
protected
Selectable
selectList
(
Selector
selector
,
List
<
String
>
strings
)
{
List
<
String
>
results
=
new
ArrayList
<
String
>();
for
(
String
string
:
strings
)
{
List
<
String
>
result
=
selector
.
selectList
(
string
);
results
.
addAll
(
result
);
}
return
new
PlainText
(
results
);
}
@Override
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
ReplaceSelector
replaceSelector
=
new
ReplaceSelector
(
regex
,
replacement
);
return
select
(
replaceSelector
,
strings
);
}
@Override
public
List
<
String
>
all
()
{
return
strings
;
}
@Override
public
Selectable
jsonPath
(
String
jsonPath
)
{
throw
new
UnsupportedOperationException
();
}
@Override
public
String
get
()
{
if
(
CollectionUtils
.
isNotEmpty
(
all
()))
{
return
all
().
get
(
0
);
}
else
{
return
null
;
}
}
@Override
public
Selectable
select
(
Selector
selector
)
{
return
select
(
selector
,
strings
);
}
@Override
public
Selectable
selectList
(
Selector
selector
)
{
return
selectList
(
selector
,
strings
);
}
@Override
public
String
toString
()
{
return
get
();
}
@Override
public
boolean
match
()
{
return
strings
!=
null
&&
strings
.
size
()
>
0
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
f9825c21
...
...
@@ -143,4 +143,10 @@ public interface Selectable {
* @return
*/
public
Selectable
selectList
(
Selector
selector
);
/**
 * Gets all nodes of this selectable, each one as a {@link Selectable} of its own.
 *
 * @return the list of nodes
 */
public
List
<
Selectable
>
nodes
();
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
us.codecraft.xsoup.XPathEvaluator
;
import
us.codecraft.xsoup.Xsoup
;
...
...
@@ -29,4 +31,18 @@ public class XpathSelector extends BaseElementSelector {
// Evaluates the xPathEvaluator field against the element and
// returns the evaluation result converted with list().
public
List
<
String
>
selectList
(
Element
element
)
{
return
xPathEvaluator
.
evaluate
(
element
).
list
();
}
/**
 * Selects elements via {@code selectElements(Element)} and keeps the first one.
 *
 * @param element the element to evaluate against
 * @return the first selected element, or {@code null} when nothing was selected
 */
@Override
public Element selectElement(Element element) {
    Elements matched = selectElements(element);
    if (CollectionUtils.isEmpty(matched)) {
        return null;
    }
    return matched.get(0);
}
/**
 * Evaluates the {@code xPathEvaluator} field against the element.
 *
 * @param element the element to evaluate against
 * @return the elements reported by the evaluation result
 */
@Override
public Elements selectElements(Element element) {
    Elements matched = xPathEvaluator.evaluate(element).getElements();
    return matched;
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java
0 → 100644
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.samples.pipeline.OneFilePipeline
;
import
us.codecraft.webmagic.scheduler.FileCacheQueueScheduler
;
import
us.codecraft.webmagic.selector.Selectable
;
import
java.io.FileNotFoundException
;
import
java.io.UnsupportedEncodingException
;
/**
 * Sample {@link PageProcessor} for www.mama.cn: stores an image source, an image title and
 * the page url as fields, and queues every link that looks like a photo page.
 *
 * @author code4crafter@gmail.com
 */
public class MamacnPageProcessor implements PageProcessor {

    private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);

    /**
     * Extracts the "img", "title" and "url" fields from the page, marks the page as skipped
     * when no title was stored, and adds the photo links found on the page as target requests.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Selectable thumbItems = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li");

        page.putField(
                "img",
                thumbItems.xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src").get());
        page.putField(
                "title",
                page.getHtml().xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt").get());
        page.putField("url", page.getUrl().toString());

        // A page without a title is marked as skipped.
        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            page.setSkip(true);
        }

        page.addTargetRequests(
                page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
    }

    /**
     * @return the site settings created with this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a five-thread spider on the first photo page, with a file-cache scheduler and a
     * {@link OneFilePipeline} that both work under {@code /data/webmagic/mamacn}.
     *
     * @param args unused
     * @throws FileNotFoundException        declared by the {@link OneFilePipeline} constructor
     * @throws UnsupportedEncodingException declared by the {@link OneFilePipeline} constructor
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        Spider.create(new MamacnPageProcessor())
                .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
                .addUrl("http://www.mama.cn/photo/t1-p1.html")
                .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
                .thread(5)
                .run();
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java
0 → 100644
View file @
f9825c21
package
us
.
codecraft
.
webmagic
.
samples
.
pipeline
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.utils.FilePersistentBase
;
import
java.io.*
;
import
java.util.Map
;
/**
 * {@link Pipeline} that writes every result into one single UTF-8 text file.
 * <p>
 * The file is opened in the constructor and stays open while results arrive; call
 * {@link #close()} once the pipeline is no longer needed so the file handle is released.
 *
 * @author code4crafter@gmail.com
 */
public class OneFilePipeline extends FilePersistentBase implements Pipeline, Closeable {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Writer onto the output file; shared by all calls to {@link #process}. */
    private PrintWriter printWriter;

    /**
     * Creates a OneFilePipeline with the default path "/data/webmagic/".
     *
     * @throws FileNotFoundException        if the output file cannot be opened
     * @throws UnsupportedEncodingException declared by the UTF-8 writer
     */
    public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
        this("/data/webmagic/");
    }

    /**
     * Creates a OneFilePipeline that writes to the file obtained from {@code getFile(path)}.
     *
     * @param path the path handed to both {@code setPath} and {@code getFile}
     * @throws FileNotFoundException        if the output file cannot be opened
     * @throws UnsupportedEncodingException declared by the UTF-8 writer
     */
    public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
        setPath(path);
        printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
    }

    /**
     * Writes the request url followed by every field of the result, then flushes.
     * Iterable values are written as a "key:" line followed by one line per element;
     * every other value is written as "key:\tvalue" on one line.
     * The method is synchronized so that the lines of one result are not interleaved
     * with those of another.
     *
     * @param resultItems the extracted fields and the request they belong to
     * @param task        unused
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        printWriter.println("url:\t" + resultItems.getRequest().getUrl());
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            Object fieldValue = entry.getValue();
            if (fieldValue instanceof Iterable) {
                // Wildcard instead of the raw type: elements are only printed, never stored.
                Iterable<?> values = (Iterable<?>) fieldValue;
                printWriter.println(entry.getKey() + ":");
                for (Object o : values) {
                    printWriter.println(o);
                }
            } else {
                printWriter.println(entry.getKey() + ":\t" + fieldValue);
            }
        }
        printWriter.flush();
    }

    /**
     * Closes the underlying writer and with it the output file.
     * Synchronized so it cannot run in the middle of a {@link #process} call.
     */
    @Override
    public synchronized void close() {
        printWriter.close();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment