Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
194518fd
Commit
194518fd
authored
Sep 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add switch
parent
326b97c6
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
992 additions
and
63 deletions
+992
-63
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+9
-0
CacheElement.java
...ain/java/us/codecraft/webmagic/selector/CacheElement.java
+0
-36
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+17
-9
EnvironmentUtil.java
...ain/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
+28
-0
EnvironmentUtilTest.java
...java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
+18
-0
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+16
-15
ExtractorUtils.java
...main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+14
-3
ProcessorBenchmark.java
.../src/test/java/us/codecraft/model/ProcessorBenchmark.java
+890
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
194518fd
...
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
java.io.Closeable
;
...
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return
this
;
}
/**
* Switches xsoup off by calling {@code EnvironmentUtil.setUseXsoup(false)}.
* The method is static, so the call is not tied to any single Spider instance.
*/
public
static
void
xsoupOff
(){
EnvironmentUtil
.
setUseXsoup
(
false
);
}
@Override
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java
deleted
100644 → 0
View file @
326b97c6
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
java.util.List
;
/**
 * Cache parsed element for extract.
 * <p>
 * Holds one piece of content in two forms, its text and its parsed jsoup
 * element, and applies a selector to whichever form that selector accepts.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public class CacheElement {

    /** Text form; passed to selectors that are not {@link ElementSelector}s. */
    public String text;

    /** Parsed form; passed to {@link ElementSelector}s. */
    public Element element;

    /**
     * Applies the selector and returns its single result.
     *
     * @param selector selector to apply
     * @return the result of selecting on {@link #element} when the selector is an
     *         {@link ElementSelector}, otherwise the result of selecting on {@link #text}
     */
    public String select(Selector selector) {
        // The fields are read directly: this class declares no getElement()/getText() accessors.
        if (selector instanceof ElementSelector) {
            return ((ElementSelector) selector).select(element);
        }
        return selector.select(text);
    }

    /**
     * Applies the selector and returns all of its results.
     *
     * @param selector selector to apply
     * @return the results of selecting on {@link #element} when the selector is an
     *         {@link ElementSelector}, otherwise the results of selecting on {@link #text}
     */
    public List<String> selectList(Selector selector) {
        if (selector instanceof ElementSelector) {
            return ((ElementSelector) selector).selectList(element);
        }
        return selector.selectList(text);
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
194518fd
...
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -72,17 +73,22 @@ public class Html extends PlainText {
@Override
public
Selectable
xpath
(
String
xpath
)
{
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
document
!=
null
){
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
if
(
EnvironmentUtil
.
useXsoup
())
{
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
document
!=
null
)
{
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
}
return
selectList
(
xsoupSelector
,
strings
);
}
else
{
XpathSelector
xpathSelector
=
new
XpathSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
return
selectList
(
xsoupSelector
,
strings
);
}
@Override
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
)
{
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
...
...
@@ -91,7 +97,7 @@ public class Html extends PlainText {
@Override
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
if
(
document
!=
null
)
{
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
...
...
@@ -102,15 +108,17 @@ public class Html extends PlainText {
}
public
String
getText
()
{
if
(
strings
!=
null
&&
strings
.
size
()>
0
){
return
strings
.
get
(
0
);
}
return
document
.
html
();
}
/**
*
* @param selector
* @return
*/
public
String
select
(
Selector
selector
)
{
public
String
select
Document
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
select
(
getDocument
());
...
...
@@ -119,7 +127,7 @@ public class Html extends PlainText {
}
}
public
List
<
String
>
selectList
(
Selector
selector
)
{
public
List
<
String
>
select
DocumentFor
List
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
selectList
(
getDocument
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
0 → 100644
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.apache.commons.lang3.BooleanUtils
;
import
java.util.Properties
;
/**
* @author code4crafter@gmail.com
* @since 0.2.2
*/
public
abstract
class
EnvironmentUtil
{
private
static
final
String
USE_XSOUP
=
"xsoup"
;
public
static
boolean
useXsoup
()
{
Properties
properties
=
System
.
getProperties
();
Object
o
=
properties
.
get
(
USE_XSOUP
);
if
(
o
==
null
)
{
return
true
;
}
return
BooleanUtils
.
toBoolean
(((
String
)
o
).
toLowerCase
());
}
public
static
void
setUseXsoup
(
boolean
useXsoup
)
{
Properties
properties
=
System
.
getProperties
();
properties
.
setProperty
(
USE_XSOUP
,
BooleanUtils
.
toString
(
useXsoup
,
"true"
,
"false"
));
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
0 → 100644
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.junit.Test
;
import
static
junit
.
framework
.
Assert
.*;
/**
 * Checks the default and the toggling of the {@code xsoup} switch in {@link EnvironmentUtil}.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /** System property read and written by {@link EnvironmentUtil}. */
    private static final String XSOUP_PROPERTY = "xsoup";

    @Test
    public void test() {
        // The switch lives in a JVM-wide system property: start from a known state
        // and put the previous value back so other tests are not affected.
        final String previous = System.getProperty(XSOUP_PROPERTY);
        System.clearProperty(XSOUP_PROPERTY);
        try {
            // Unset property means the switch is on.
            assertTrue(EnvironmentUtil.useXsoup());

            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());

            EnvironmentUtil.setUseXsoup(true);
            assertTrue(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty(XSOUP_PROPERTY);
            } else {
                System.setProperty(XSOUP_PROPERTY, previous);
            }
        }
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.nodes.Element
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.model.annotation.*
;
import
us.codecraft.webmagic.selector.*
;
...
...
@@ -185,13 +184,13 @@ class PageModelExtractor {
return
null
;
}
if
(
objectExtractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
()
);
return
processSingle
(
page
,
null
,
false
);
}
else
{
if
(
objectExtractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
objectExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
Object
o
=
processSingle
(
page
,
s
,
false
);
if
(
o
!=
null
)
{
os
.
add
(
o
);
}
...
...
@@ -199,19 +198,13 @@ class PageModelExtractor {
return
os
;
}
else
{
String
select
=
objectExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
Object
o
=
processSingle
(
page
,
select
,
false
);
return
o
;
}
}
}
private
List
<
String
>
select
(
Selector
selector
,
Element
element
,
String
html
){
if
(
selector
instanceof
ElementSelector
){
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
private
Object
processSingle
(
Page
page
,
String
html
,
boolean
isRaw
)
{
Object
o
=
null
;
try
{
o
=
clazz
.
newInstance
();
...
...
@@ -220,10 +213,14 @@ class PageModelExtractor {
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
break
;
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
}
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
...
...
@@ -239,10 +236,14 @@ class PageModelExtractor {
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
break
;
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
}
break
;
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
View file @
194518fd
...
...
@@ -8,6 +8,7 @@ import java.util.List;
/**
* Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
...
...
@@ -24,17 +25,27 @@ public class ExtractorUtils {
selector
=
new
RegexSelector
(
value
);
break
;
case
XPath:
selector
=
new
Xsoup
Selector
(
value
);
selector
=
getXpath
Selector
(
value
);
break
;
default
:
selector
=
new
XsoupSelector
(
value
);
selector
=
getXpathSelector
(
value
);
}
return
selector
;
}
/**
 * Creates the selector for an XPath expression.
 *
 * @param value expression passed to the selector's constructor
 * @return an {@code XsoupSelector} when {@code EnvironmentUtil.useXsoup()} is true,
 *         otherwise an {@code XpathSelector}
 */
private static Selector getXpathSelector(String value) {
    if (EnvironmentUtil.useXsoup()) {
        return new XsoupSelector(value);
    }
    return new XpathSelector(value);
}
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
if
(
extractBies
==
null
)
{
if
(
extractBies
==
null
)
{
return
selectors
;
}
for
(
ExtractBy
extractBy
:
extractBies
)
{
...
...
webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java
0 → 100644
View file @
194518fd
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment