Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
e06b0c1a
Commit
e06b0c1a
authored
Sep 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'xsoup'
parents
b9eeb88f
aefd0569
Changes
33
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
33 changed files
with
1283 additions
and
307 deletions
+1283
-307
pom.xml
pom.xml
+1
-1
pom.xml
webmagic-core/pom.xml
+7
-1
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+4
-3
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+9
-0
BaseElementSelector.java
...a/us/codecraft/webmagic/selector/BaseElementSelector.java
+23
-0
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+11
-15
ElementSelector.java
.../java/us/codecraft/webmagic/selector/ElementSelector.java
+32
-0
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+60
-12
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+1
-13
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+0
-14
SelectorFactory.java
.../java/us/codecraft/webmagic/selector/SelectorFactory.java
+0
-91
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+8
-12
TextContentSelector.java
...a/us/codecraft/webmagic/selector/TextContentSelector.java
+0
-68
XsoupSelector.java
...in/java/us/codecraft/webmagic/selector/XsoupSelector.java
+32
-0
EnvironmentUtil.java
...ain/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
+28
-0
TextContentSelectorTest.java
.../codecraft/webmagic/selector/TextContentSelectorTest.java
+0
-34
EnvironmentUtilTest.java
...java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
+18
-0
pom.xml
webmagic-extension/pom.xml
+1
-1
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+22
-14
FilePageModelPipeline.java
...us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
+1
-1
ExtractorUtils.java
...main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+15
-7
pom.xml
webmagic-samples/pom.xml
+1
-1
DiaoyuwengProcessor.java
...va/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+6
-1
F58PageProcesser.java
.../java/us/codecraft/webmagic/samples/F58PageProcesser.java
+7
-2
HuxiuProcessor.java
...in/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+7
-3
InfoQMiniBookProcessor.java
...us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
+0
-4
IteyeBlogProcessor.java
...ava/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+1
-2
KaichibaProcessor.java
...java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+5
-0
MeicanProcessor.java
...n/java/us/codecraft/webmagic/samples/MeicanProcessor.java
+7
-2
OschinaBlogPageProcesser.java
.../codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+4
-5
ProcessorBenchmark.java
.../src/test/java/us/codecraft/model/ProcessorBenchmark.java
+890
-0
pom.xml
webmagic-saxon/pom.xml
+5
-0
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+77
-0
No files found.
pom.xml
View file @
e06b0c1a
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
<version>
7
</version>
<version>
7
</version>
</parent>
</parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<packaging>
pom
</packaging>
<packaging>
pom
</packaging>
<properties>
<properties>
...
...
webmagic-core/pom.xml
View file @
e06b0c1a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<parent>
<parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
@@ -25,6 +25,12 @@
...
@@ -25,6 +25,12 @@
<artifactId>
commons-lang3
</artifactId>
<artifactId>
commons-lang3
</artifactId>
</dependency>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
xsoup
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<dependency>
<groupId>
log4j
</groupId>
<groupId>
log4j
</groupId>
<artifactId>
log4j
</artifactId>
<artifactId>
log4j
</artifactId>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
@@ -28,7 +29,7 @@ public class Page {
...
@@ -28,7 +29,7 @@ public class Page {
private
ResultItems
resultItems
=
new
ResultItems
();
private
ResultItems
resultItems
=
new
ResultItems
();
private
Selectable
html
;
private
Html
html
;
private
Selectable
url
;
private
Selectable
url
;
...
@@ -58,11 +59,11 @@ public class Page {
...
@@ -58,11 +59,11 @@ public class Page {
*
*
* @return html
* @return html
*/
*/
public
Selectable
getHtml
()
{
public
Html
getHtml
()
{
return
html
;
return
html
;
}
}
public
void
setHtml
(
Selectable
html
)
{
public
void
setHtml
(
Html
html
)
{
this
.
html
=
html
;
this
.
html
=
html
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
e06b0c1a
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
java.io.Closeable
;
import
java.io.Closeable
;
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return
this
;
return
this
;
}
}
/**
* Switches off Xsoup-based XPath selection by calling
* {@link EnvironmentUtil#setUseXsoup(boolean)} with {@code false}.
* This is a static switch and is not tied to a single Spider instance.
*/
public
static
void
xsoupOff
(){
EnvironmentUtil
.
setUseXsoup
(
false
);
}
@Override
@Override
public
String
getUUID
()
{
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
java.util.List
;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public
abstract
class
BaseElementSelector
implements
Selector
,
ElementSelector
{
@Override
public
String
select
(
String
text
)
{
return
select
(
Jsoup
.
parse
(
text
));
}
@Override
public
List
<
String
>
selectList
(
String
text
)
{
return
selectList
(
Jsoup
.
parse
(
text
));
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
...
@@ -15,7 +13,7 @@ import java.util.List;
...
@@ -15,7 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @since 0.1.0
*/
*/
public
class
CssSelector
implements
Selector
{
public
class
CssSelector
extends
BaseElement
Selector
{
private
String
selectorText
;
private
String
selectorText
;
...
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
...
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
this
.
attrName
=
attrName
;
this
.
attrName
=
attrName
;
}
}
@Override
public
String
select
(
String
text
)
{
Document
doc
=
Jsoup
.
parse
(
text
);
Elements
elements
=
doc
.
select
(
selectorText
);
if
(
CollectionUtils
.
isEmpty
(
elements
))
{
return
null
;
}
return
getValue
(
elements
.
get
(
0
));
}
private
String
getValue
(
Element
element
)
{
private
String
getValue
(
Element
element
)
{
if
(
attrName
==
null
)
{
if
(
attrName
==
null
)
{
return
element
.
outerHtml
();
return
element
.
outerHtml
();
...
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
...
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
}
}
@Override
@Override
public
List
<
String
>
selectList
(
String
text
)
{
public
String
select
(
Element
element
)
{
Elements
elements
=
element
.
select
(
selectorText
);
if
(
CollectionUtils
.
isEmpty
(
elements
))
{
return
null
;
}
return
getValue
(
elements
.
get
(
0
));
}
@Override
public
List
<
String
>
selectList
(
Element
doc
)
{
List
<
String
>
strings
=
new
ArrayList
<
String
>();
List
<
String
>
strings
=
new
ArrayList
<
String
>();
Document
doc
=
Jsoup
.
parse
(
text
);
Elements
elements
=
doc
.
select
(
selectorText
);
Elements
elements
=
doc
.
select
(
selectorText
);
if
(
CollectionUtils
.
isNotEmpty
(
elements
))
{
if
(
CollectionUtils
.
isNotEmpty
(
elements
))
{
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
java.util.List
;
/**
 * Contract for selectors (extractors) that operate on an already parsed html element.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public interface ElementSelector {

    /**
     * Extracts one result, as text, from the given element.<br>
     * When several results match, only the first one is chosen.
     *
     * @param element element to extract from
     * @return result
     */
    String select(Element element);

    /**
     * Extracts every result, as text, from the given element.<br>
     *
     * @param element element to extract from
     * @return results
     */
    List<String> selectList(Element element);
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
@@ -11,12 +15,23 @@ import java.util.List;
...
@@ -11,12 +15,23 @@ import java.util.List;
*/
*/
public
class
Html
extends
PlainText
{
public
class
Html
extends
PlainText
{
/**
* Store parsed document for better performance when only one text exist.
*/
private
Document
document
;
public
Html
(
List
<
String
>
strings
)
{
public
Html
(
List
<
String
>
strings
)
{
super
(
strings
);
super
(
strings
);
}
}
public
Html
(
String
text
)
{
public
Html
(
String
text
)
{
super
(
text
);
super
(
text
);
this
.
document
=
Jsoup
.
parse
(
text
);
}
public
Html
(
Document
document
)
{
super
(
document
.
html
());
this
.
document
=
document
;
}
}
public
static
Html
create
(
String
text
)
{
public
static
Html
create
(
String
text
)
{
...
@@ -53,38 +68,71 @@ public class Html extends PlainText {
...
@@ -53,38 +68,71 @@ public class Html extends PlainText {
@Override
@Override
public
Selectable
links
()
{
public
Selectable
links
()
{
XpathSelector
xpathSelector
=
Selectors
.
xpath
(
"//a/@href"
);
return
xpath
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
}
}
@Override
@Override
public
Selectable
xpath
(
String
xpath
)
{
public
Selectable
xpath
(
String
xpath
)
{
XpathSelector
xpathSelector
=
Selectors
.
xpath
(
xpath
);
if
(
EnvironmentUtil
.
useXsoup
())
{
return
selectList
(
xpathSelector
,
strings
);
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
document
!=
null
)
{
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
}
return
selectList
(
xsoupSelector
,
strings
);
}
else
{
XpathSelector
xpathSelector
=
new
XpathSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
}
}
@Override
@Override
public
Selectable
$
(
String
selector
)
{
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
@Override
@Override
public
Selectable
$
(
String
selector
,
String
attrName
)
{
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
@Override
public
Document
getDocument
()
{
public
Selectable
text
()
{
return
document
;
TextContentSelector
selector
=
Selectors
.
text
();
return
select
(
selector
,
strings
);
}
}
@Override
public
String
getText
()
{
public
Selectable
text
(
String
newlineSeparator
)
{
if
(
strings
!=
null
&&
strings
.
size
()>
0
){
TextContentSelector
selector
=
Selectors
.
text
(
newlineSeparator
);
return
strings
.
get
(
0
);
return
select
(
selector
,
strings
);
}
return
document
.
html
();
}
/**
* @param selector
* @return
*/
public
String
selectDocument
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
select
(
getDocument
());
}
else
{
return
selector
.
select
(
getText
());
}
}
}
public
List
<
String
>
selectDocumentForList
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
selectList
(
getDocument
());
}
else
{
return
selector
.
selectList
(
getText
());
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
e06b0c1a
...
@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
...
@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
@Override
@Override
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
ReplaceSelector
replaceSelector
=
SelectorFactory
.
getInstatnce
().
newReplaceSelector
(
regex
,
replacement
);
ReplaceSelector
replaceSelector
=
new
ReplaceSelector
(
regex
,
replacement
);
return
select
(
replaceSelector
,
strings
);
return
select
(
replaceSelector
,
strings
);
}
}
...
@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
...
@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
}
}
}
}
@Override
public
Selectable
text
()
{
//do nothing
return
this
;
}
@Override
public
Selectable
text
(
String
newlineSeparator
)
{
//do nothing
return
this
;
}
@Override
@Override
public
boolean
match
()
{
public
boolean
match
()
{
return
strings
!=
null
&&
strings
.
size
()
>
0
;
return
strings
!=
null
&&
strings
.
size
()
>
0
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
e06b0c1a
...
@@ -82,20 +82,6 @@ public interface Selectable {
...
@@ -82,20 +82,6 @@ public interface Selectable {
*/
*/
public
String
toString
();
public
String
toString
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
(
String
newlineSeparator
);
/**
/**
* if result exist for select
* if result exist for select
*
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.lang3.StringUtils
;
import
java.lang.reflect.Constructor
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Selector factory with some inner cache.<br>
 * Selectors are created reflectively from String constructor arguments; only
 * {@link #newAndCacheSelector(Class, String...)} stores instances in the cache.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SelectorFactory {

    /** Cache of selectors, keyed by selector class plus constructor parameters. */
    private final Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();

    private static final SelectorFactory INSTATNCE = new SelectorFactory();

    /**
     * Returns the shared factory instance.
     * The misspelled name is kept because callers reference it.
     *
     * @return the singleton factory
     */
    public static SelectorFactory getInstatnce() {
        return INSTATNCE;
    }

    /**
     * Creates a new, uncached regex selector.
     *
     * @param regex regular expression
     * @return new selector
     */
    public RegexSelector newRegexSelector(String regex) {
        return newSelector(RegexSelector.class, regex);
    }

    /**
     * Returns a cached regex selector for the given expression and group if one
     * exists, otherwise creates a new one.
     * NOTE(review): this method only reads the cache and never stores the new
     * instance; behavior is kept as-is, confirm whether caching was intended.
     *
     * @param regex regular expression
     * @param group capture group index passed to the selector
     * @return cached or new selector
     */
    public RegexSelector newRegexSelector(String regex, int group) {
        String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
        // Single lookup: avoids reading the map twice for the same key.
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return (RegexSelector) cached;
        }
        return new RegexSelector(regex, group);
    }

    /**
     * Creates a new, uncached replace selector.
     *
     * @param regex       regular expression
     * @param replacement replacement text
     * @return new selector
     */
    public ReplaceSelector newReplaceSelector(String regex, String replacement) {
        return newSelector(ReplaceSelector.class, regex, replacement);
    }

    /**
     * Creates a new, uncached xpath selector.
     *
     * @param xpath xpath expression
     * @return new selector
     */
    public XpathSelector newXpathSelector(String xpath) {
        return newSelector(XpathSelector.class, xpath);
    }

    /**
     * Creates a new smart content selector through its no-arg constructor.
     *
     * @return new selector
     */
    public SmartContentSelector newSmartContentSelector() {
        return newSelector(SmartContentSelector.class);
    }

    /**
     * Returns the cached selector for the given class and parameters, creating
     * and caching it on first use.
     *
     * @param clazz selector class to instantiate
     * @param param constructor arguments (zero, one or two strings)
     * @param <T>   selector type
     * @return cached or newly created selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
        // The key must be built from the requested class. It used to be built from
        // RegexSelector.class, so different selector types with the same parameters
        // shared one entry and the cast on the cached value could fail.
        String cacheKey = getCacheKey(clazz, param);
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return clazz.cast(cached);
        }
        T selector = newSelector(clazz, param);
        if (selector != null) {
            innerCache.put(cacheKey, selector);
        }
        return selector;
    }

    /**
     * Instantiates a selector reflectively via a public constructor taking as many
     * {@code String} arguments as there are parameters (zero to two).
     *
     * @param clazz selector class to instantiate
     * @param param constructor arguments
     * @param <T>   selector type
     * @return new selector instance
     * @throws IllegalArgumentException on any failure, including an unsupported
     *                                  number of parameters (the original cause is preserved)
     */
    public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
        try {
            if (param.length == 0) {
                Constructor<T> constructor = clazz.getConstructor();
                return constructor.newInstance();
            } else if (param.length == 1) {
                Constructor<T> constructor = clazz.getConstructor(String.class);
                return constructor.newInstance(param[0]);
            } else if (param.length == 2) {
                Constructor<T> constructor = clazz.getConstructor(String.class, String.class);
                return constructor.newInstance(param[0], param[1]);
            } else {
                // Thrown inside the try on purpose: callers see it wrapped below, as before.
                throw new UnsupportedOperationException();
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("init object error", e);
        }
    }

    /**
     * Builds the cache key from the class description and the "_"-joined parameters.
     */
    private String getCacheKey(Class<?> clazz, String... param) {
        return clazz.toString() + "_" + StringUtils.join(param, "_");
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
View file @
e06b0c1a
...
@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
...
@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
public
abstract
class
Selectors
{
public
abstract
class
Selectors
{
public
static
RegexSelector
regex
(
String
expr
)
{
public
static
RegexSelector
regex
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
new
RegexSelector
(
expr
);
return
new
RegexSelector
(
expr
);
}
}
public
static
RegexSelector
regex
(
String
expr
,
int
group
)
{
public
static
RegexSelector
regex
(
String
expr
,
int
group
)
{
return
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
expr
,
group
);
return
new
RegexSelector
(
expr
,
group
);
}
}
public
static
SmartContentSelector
smartContent
()
{
public
static
SmartContentSelector
smartContent
()
{
return
SelectorFactory
.
getInstatnce
().
new
SmartContentSelector
();
return
new
SmartContentSelector
();
}
}
public
static
CssSelector
$
(
String
expr
)
{
public
static
CssSelector
$
(
String
expr
)
{
...
@@ -29,7 +29,11 @@ public abstract class Selectors {
...
@@ -29,7 +29,11 @@ public abstract class Selectors {
}
}
public
static
XpathSelector
xpath
(
String
expr
)
{
public
static
XpathSelector
xpath
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
expr
);
return
new
XpathSelector
(
expr
);
}
public
static
XsoupSelector
xsoup
(
String
expr
)
{
return
new
XsoupSelector
(
expr
);
}
}
public
static
AndSelector
and
(
Selector
...
selectors
)
{
public
static
AndSelector
and
(
Selector
...
selectors
)
{
...
@@ -40,14 +44,6 @@ public abstract class Selectors {
...
@@ -40,14 +44,6 @@ public abstract class Selectors {
return
new
OrSelector
(
selectors
);
return
new
OrSelector
(
selectors
);
}
}
public
static
TextContentSelector
text
()
{
return
new
TextContentSelector
();
}
public
static
TextContentSelector
text
(
String
newlineSeperator
)
{
return
new
TextContentSelector
(
newlineSeperator
);
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
s
=
"a"
;
String
s
=
"a"
;
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
java.util.Arrays
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Set
;
/**
* Extract text content in html.<br>
* Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.2
*/
public
class
TextContentSelector
implements
Selector
{
private
String
newLineSeperator
=
"\n"
;
public
TextContentSelector
()
{
}
public
TextContentSelector
(
String
newLineSeperator
)
{
this
.
newLineSeperator
=
newLineSeperator
;
}
private
final
static
Set
<
String
>
TAGS_IN_NEWLINE
=
new
HashSet
<
String
>();
private
final
static
Set
<
String
>
TAGS_TO_IGNORE
=
new
HashSet
<
String
>();
static
{
TAGS_IN_NEWLINE
.
addAll
(
Arrays
.
asList
(
new
String
[]{
"p"
,
"div"
,
"h1"
,
"h2"
,
"h3"
,
"h4"
,
"h5"
,
"h6"
,
"br"
,
"li"
}));
TAGS_TO_IGNORE
.
addAll
(
Arrays
.
asList
(
new
String
[]{
"head"
,
"style"
,
"script"
,
"noscript"
,
"option"
}));
}
@Override
public
String
select
(
String
text
)
{
Document
doc
=
Jsoup
.
parse
(
text
);
return
select0
(
doc
);
}
protected
String
select0
(
Element
element
)
{
String
tagName
=
element
.
tagName
().
toLowerCase
();
if
(
TAGS_TO_IGNORE
.
contains
(
tagName
))
{
return
""
;
}
StringBuilder
textBuilder
=
new
StringBuilder
();
textBuilder
.
append
(
element
.
text
());
if
(
element
.
children
()
!=
null
)
{
for
(
Element
child
:
element
.
children
())
{
textBuilder
.
append
(
select0
(
child
));
}
}
if
(
TAGS_IN_NEWLINE
.
contains
(
tagName
))
{
textBuilder
.
append
(
newLineSeperator
);
}
return
textBuilder
.
toString
();
}
@Override
public
List
<
String
>
selectList
(
String
text
)
{
throw
new
UnsupportedOperationException
();
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
us.codecraft.xsoup.XPathEvaluator
;
import
us.codecraft.xsoup.Xsoup
;
import
java.util.List
;
/**
 * XPath selector based on Xsoup.<br>
 * The expression is compiled once in the constructor and reused for every selection.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public class XsoupSelector extends BaseElementSelector {

    /** Compiled expression; assigned once, hence final. */
    private final XPathEvaluator xPathEvaluator;

    /**
     * Compiles the given XPath expression with Xsoup.
     *
     * @param xpathStr XPath expression
     */
    public XsoupSelector(String xpathStr) {
        this.xPathEvaluator = Xsoup.compile(xpathStr);
    }

    /**
     * Evaluates the expression against {@code element} and returns the single
     * result reported by the evaluation.
     *
     * @param element element to evaluate against
     * @return result
     */
    @Override
    public String select(Element element) {
        return xPathEvaluator.evaluate(element).get();
    }

    /**
     * Evaluates the expression against {@code element} and returns all results.
     *
     * @param element element to evaluate against
     * @return results
     */
    @Override
    public List<String> selectList(Element element) {
        return xPathEvaluator.evaluate(element).list();
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.apache.commons.lang3.BooleanUtils
;
import
java.util.Properties
;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public
abstract
class
EnvironmentUtil
{
private
static
final
String
USE_XSOUP
=
"xsoup"
;
public
static
boolean
useXsoup
()
{
Properties
properties
=
System
.
getProperties
();
Object
o
=
properties
.
get
(
USE_XSOUP
);
if
(
o
==
null
)
{
return
true
;
}
return
BooleanUtils
.
toBoolean
(((
String
)
o
).
toLowerCase
());
}
public
static
void
setUseXsoup
(
boolean
useXsoup
)
{
Properties
properties
=
System
.
getProperties
();
properties
.
setProperty
(
USE_XSOUP
,
BooleanUtils
.
toString
(
useXsoup
,
"true"
,
"false"
));
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
junit.framework.Assert
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
/**
 * Smoke tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Runs the selector over a small html fragment and only checks that a
     * non-null text comes back.
     */
    @Test
    public void test() {
        StringBuilder fragment = new StringBuilder();
        fragment.append("<div class=\"edit-comment-hide\">\n");
        fragment.append(" <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n");
        fragment.append(" <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n");
        fragment.append(" </div>\n");
        fragment.append(" </div>");
        String extracted = new TextContentSelector("<br>").select(fragment.toString());
        Assert.assertNotNull(extracted);
    }

    /**
     * Downloads a real page and extracts its text; ignored by default because it is slow.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        final String pageUrl = "http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/";
        final String charset = "utf-8";
        String s = new HttpClientDownloader().download(pageUrl, charset)
                .smartContent().text().toString();
        Assert.assertNotNull(s);
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.junit.Test
;
import
static
junit
.
framework
.
Assert
.*;
/**
 * Tests for {@link EnvironmentUtil}.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /** System property that EnvironmentUtil reads and writes. */
    private static final String XSOUP_PROPERTY = "xsoup";

    /**
     * Checks that Xsoup is on when the property is unset and off after
     * {@code setUseXsoup(false)}.
     */
    @Test
    public void test() {
        // The switch lives in a JVM-wide system property: start from a known state
        // and restore it afterwards so other tests in the same JVM are unaffected.
        String previous = System.getProperty(XSOUP_PROPERTY);
        System.clearProperty(XSOUP_PROPERTY);
        try {
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty(XSOUP_PROPERTY);
            } else {
                System.setProperty(XSOUP_PROPERTY, previous);
            }
        }
    }
}
webmagic-extension/pom.xml
View file @
e06b0c1a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<parent>
<parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
e06b0c1a
...
@@ -34,7 +34,7 @@ class PageModelExtractor {
...
@@ -34,7 +34,7 @@ class PageModelExtractor {
private
List
<
FieldExtractor
>
fieldExtractors
;
private
List
<
FieldExtractor
>
fieldExtractors
;
private
Extractor
e
xtractor
;
private
Extractor
objectE
xtractor
;
public
static
PageModelExtractor
create
(
Class
clazz
)
{
public
static
PageModelExtractor
create
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
...
@@ -169,7 +169,7 @@ class PageModelExtractor {
...
@@ -169,7 +169,7 @@ class PageModelExtractor {
annotation
=
clazz
.
getAnnotation
(
ExtractBy
.
class
);
annotation
=
clazz
.
getAnnotation
(
ExtractBy
.
class
);
if
(
annotation
!=
null
)
{
if
(
annotation
!=
null
)
{
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
e
xtractor
=
new
Extractor
(
new
XpathSelector
(
extractBy
.
value
()),
Extractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
objectE
xtractor
=
new
Extractor
(
new
XpathSelector
(
extractBy
.
value
()),
Extractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
}
}
}
}
...
@@ -183,28 +183,28 @@ class PageModelExtractor {
...
@@ -183,28 +183,28 @@ class PageModelExtractor {
if
(!
matched
)
{
if
(!
matched
)
{
return
null
;
return
null
;
}
}
if
(
e
xtractor
==
null
)
{
if
(
objectE
xtractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
()
);
return
processSingle
(
page
,
null
,
false
);
}
else
{
}
else
{
if
(
e
xtractor
.
multi
)
{
if
(
objectE
xtractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
e
xtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
List
<
String
>
list
=
objectE
xtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
Object
o
=
processSingle
(
page
,
s
,
false
);
if
(
o
!=
null
)
{
if
(
o
!=
null
)
{
os
.
add
(
o
);
os
.
add
(
o
);
}
}
}
}
return
os
;
return
os
;
}
else
{
}
else
{
String
select
=
e
xtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
String
select
=
objectE
xtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
Object
o
=
processSingle
(
page
,
select
,
false
);
return
o
;
return
o
;
}
}
}
}
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
private
Object
processSingle
(
Page
page
,
String
html
,
boolean
isRaw
)
{
Object
o
=
null
;
Object
o
=
null
;
try
{
try
{
o
=
clazz
.
newInstance
();
o
=
clazz
.
newInstance
();
...
@@ -213,10 +213,14 @@ class PageModelExtractor {
...
@@ -213,10 +213,14 @@ class PageModelExtractor {
List
<
String
>
value
;
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
...
@@ -232,10 +236,14 @@ class PageModelExtractor {
...
@@ -232,10 +236,14 @@ class PageModelExtractor {
String
value
;
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
View file @
e06b0c1a
...
@@ -18,7 +18,7 @@ import java.io.PrintWriter;
...
@@ -18,7 +18,7 @@ import java.io.PrintWriter;
* Otherwise use SHA1 as file name.
* Otherwise use SHA1 as file name.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.
2.2
* @since 0.
3.0
*/
*/
public
class
FilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
public
class
FilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
package
us
.
codecraft
.
webmagic
.
utils
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.selector.CssSelector
;
import
us.codecraft.webmagic.selector.*
;
import
us.codecraft.webmagic.selector.RegexSelector
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.XpathSelector
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
/**
/**
* Tools for annotation converting. <br>
* Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.1
* @since 0.2.1
*/
*/
...
@@ -27,17 +25,27 @@ public class ExtractorUtils {
...
@@ -27,17 +25,27 @@ public class ExtractorUtils {
selector
=
new
RegexSelector
(
value
);
selector
=
new
RegexSelector
(
value
);
break
;
break
;
case
XPath:
case
XPath:
selector
=
new
XpathSelector
(
value
);
selector
=
get
XpathSelector
(
value
);
break
;
break
;
default
:
default
:
selector
=
new
XpathSelector
(
value
);
selector
=
getXpathSelector
(
value
);
}
return
selector
;
}
private
static
Selector
getXpathSelector
(
String
value
)
{
Selector
selector
;
if
(
EnvironmentUtil
.
useXsoup
())
{
selector
=
new
XsoupSelector
(
value
);
}
else
{
selector
=
new
XpathSelector
(
value
);
}
}
return
selector
;
return
selector
;
}
}
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
if
(
extractBies
==
null
)
{
if
(
extractBies
==
null
)
{
return
selectors
;
return
selectors
;
}
}
for
(
ExtractBy
extractBy
:
extractBies
)
{
for
(
ExtractBy
extractBy
:
extractBies
)
{
...
...
webmagic-samples/pom.xml
View file @
e06b0c1a
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
<parent>
<parent>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<version>
0.
2.1
</version>
<version>
0.
3.0-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.selector.PlainText
;
...
@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
...
@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='pcb']//tbody"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='pcb']//tbody
/tidyText()
"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
regex
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
regex
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
regex
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
regex
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
}
}
...
@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
...
@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
}
}
return
site
;
return
site
;
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
DiaoyuwengProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
...
@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}
"
).
all
();
List
<
String
>
strings
=
page
.
getHtml
().
links
().
regex
(
".*/yewu/.*
"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd
[@class='w133']
"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd"
));
}
}
@Override
@Override
public
Site
getSite
()
{
public
Site
getSite
()
{
return
Site
.
me
().
setDomain
(
"sh.58.com"
).
addStartUrl
(
"http://sh.58.com/"
);
//To change body of implemented methods use File | Settings | File Templates.
return
Site
.
me
().
setDomain
(
"sh.58.com"
).
addStartUrl
(
"http://sh.58.com/"
);
//To change body of implemented methods use File | Settings | File Templates.
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
F58PageProcesser
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -14,10 +15,9 @@ import java.util.List;
...
@@ -14,10 +15,9 @@ import java.util.List;
public
class
HuxiuProcessor
implements
PageProcessor
{
public
class
HuxiuProcessor
implements
PageProcessor
{
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
".*article.*"
).
all
();
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='
neirong']//h1[@class='ph xs5']
"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='
clearfix neirong']//h1/text()
"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
}
}
...
@@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
...
@@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"www.huxiu.com"
).
addStartUrl
(
"http://www.huxiu.com/"
).
return
Site
.
me
().
setDomain
(
"www.huxiu.com"
).
addStartUrl
(
"http://www.huxiu.com/"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
HuxiuProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
View file @
e06b0c1a
...
@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
...
@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.RedisScheduler
;
import
java.util.List
;
import
java.util.List
;
...
@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
...
@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
InfoQMiniBookProcessor
())
Spider
.
create
(
new
InfoQMiniBookProcessor
())
.
scheduler
(
new
RedisScheduler
(
"localhost"
))
.
pipeline
(
new
FilePipeline
(
"/data/temp/webmagic/"
))
.
thread
(
5
)
.
thread
(
5
)
.
run
();
.
run
();
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
View file @
e06b0c1a
...
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
...
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
/**
...
@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
...
@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
pipeline
(
new
FilePipeline
(
"/data/webmagic/"
)).
run
();
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
run
();
}
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
/**
...
@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
...
@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"kaichiba.com"
).
addStartUrl
(
"http://kaichiba.com/shop/41725781"
).
setCharset
(
"utf-8"
).
return
Site
.
me
().
setDomain
(
"kaichiba.com"
).
addStartUrl
(
"http://kaichiba.com/shop/41725781"
).
setCharset
(
"utf-8"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
KaichibaProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
...
@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
}
}
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
all
());
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
all
());
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]
/text()
"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]
/text()
"
));
}
}
@Override
@Override
...
@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
...
@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"meican.com"
).
addStartUrl
(
"http://www.meican.com/shanghai/districts"
).
setCharset
(
"utf-8"
).
return
Site
.
me
().
setDomain
(
"meican.com"
).
addStartUrl
(
"http://www.meican.com/shanghai/districts"
).
setCharset
(
"utf-8"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
MeicanProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
samples
;
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
links
=
page
.
getHtml
().
links
().
regex
(
"http://my\\.oschina\\.net/flashsword/blog/\\d+"
).
all
();
List
<
String
>
links
=
page
.
getHtml
().
links
().
regex
(
"http://my\\.oschina\\.net/flashsword/blog/\\d+"
).
all
();
page
.
addTargetRequests
(
links
);
page
.
addTargetRequests
(
links
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
).
toString
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1
/text()
"
).
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
$
(
"div.content
"
).
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogContent']/tidyText()
"
).
toString
());
page
.
putField
(
"tags"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogTags']/a/text()"
).
all
());
page
.
putField
(
"tags"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogTags']/a/text()"
).
all
());
}
}
...
@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
pipeline
(
new
ConsolePipeline
()).
run
();
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
run
();
}
}
}
}
webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java
0 → 100644
View file @
e06b0c1a
This diff is collapsed.
Click to expand it.
webmagic-saxon/pom.xml
View file @
e06b0c1a
...
@@ -17,6 +17,11 @@
...
@@ -17,6 +17,11 @@
<artifactId>
webmagic-core
</artifactId>
<artifactId>
webmagic-core
</artifactId>
<version>
${project.version}
</version>
<version>
${project.version}
</version>
</dependency>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
xsoup
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<dependency>
<groupId>
net.sf.saxon
</groupId>
<groupId>
net.sf.saxon
</groupId>
<artifactId>
Saxon-HE
</artifactId>
<artifactId>
Saxon-HE
</artifactId>
...
...
webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.htmlcleaner.HtmlCleaner
;
import
org.htmlcleaner.TagNode
;
import
org.htmlcleaner.XPatherException
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.junit.Assert
;
import
org.junit.Assert
;
import
org.junit.Ignore
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
import
us.codecraft.xsoup.XPathEvaluator
;
import
us.codecraft.xsoup.Xsoup
;
/**
/**
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
...
@@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
...
@@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
Html
html1
=
new
Html
(
html
);
Html
html1
=
new
Html
(
html
);
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
all
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
all
());
Selectors
.
xpath
(
"/abc/"
).
select
(
""
);
}
}
@Test
@Test
...
@@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
...
@@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
xpath2Selector
.
selectList
(
html
);
xpath2Selector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
XpathSelector
xpathSelector
=
new
XpathSelector
(
"//a"
);
XpathSelector
xpathSelector
=
new
XpathSelector
(
"//a"
);
time
=
System
.
currentTimeMillis
();
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
xpathSelector
.
selectList
(
html
);
xpathSelector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
xpath2Selector
.
selectList
(
html
);
xpath2Selector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()
-
time
);
CssSelector
cssSelector
=
new
CssSelector
(
"a"
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
cssSelector
.
selectList
(
html
);
}
System
.
out
.
println
(
"css "
+(
System
.
currentTimeMillis
()-
time
));
}
@Ignore
(
"take long time"
)
@Test
public
void
parserPerformanceTest
()
throws
XPatherException
{
System
.
out
.
println
(
html
.
length
());
HtmlCleaner
htmlCleaner
=
new
HtmlCleaner
();
TagNode
tagNode
=
htmlCleaner
.
clean
(
html
);
Document
document
=
Jsoup
.
parse
(
html
);
long
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
htmlCleaner
.
clean
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
tagNode
.
evaluateXPath
(
"//a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
Jsoup
.
parse
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
document
.
select
(
"a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
htmlCleaner
.
clean
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
tagNode
.
evaluateXPath
(
"//a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
XPathEvaluator
compile
=
Xsoup
.
compile
(
"//a"
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
compile
.
evaluate
(
document
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment