Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
194518fd
Commit
194518fd
authored
Sep 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add switch
parent
326b97c6
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
992 additions
and
63 deletions
+992
-63
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+9
-0
CacheElement.java
...ain/java/us/codecraft/webmagic/selector/CacheElement.java
+0
-36
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+17
-9
EnvironmentUtil.java
...ain/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
+28
-0
EnvironmentUtilTest.java
...java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
+18
-0
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+16
-15
ExtractorUtils.java
...main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+14
-3
ProcessorBenchmark.java
.../src/test/java/us/codecraft/model/ProcessorBenchmark.java
+890
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
194518fd
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
java.io.Closeable
;
import
java.io.Closeable
;
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return
this
;
return
this
;
}
}
/**
* Switches Xsoup off by calling {@code EnvironmentUtil.setUseXsoup(false)}.
* The method is static and returns nothing.
*/
public
static
void
xsoupOff
(){
EnvironmentUtil
.
setUseXsoup
(
false
);
}
@Override
@Override
public
String
getUUID
()
{
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java
deleted
100644 → 0
View file @
326b97c6
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
java.util.List
;
/**
 * Cache parsed element for extract.
 *
 * <p>Holds both the raw text and the parsed element of one fragment, and picks
 * whichever form the given selector works on.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public class CacheElement {

    /** Raw text form of the cached fragment. */
    public String text;

    /** Parsed form of the cached fragment. */
    public Element element;

    /**
     * Returns the raw text of the cached fragment.
     *
     * @return the current value of {@link #text}
     */
    public String getText() {
        return text;
    }

    /**
     * Returns the parsed element of the cached fragment.
     *
     * @return the current value of {@link #element}
     */
    public Element getElement() {
        return element;
    }

    /**
     * Applies the selector and returns a single result.
     *
     * @param selector selector to apply; an {@code ElementSelector} is run against the
     *                 parsed element, any other selector against the raw text
     * @return whatever the selector's {@code select} call returns
     */
    public String select(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.select(getElement());
        } else {
            return selector.select(getText());
        }
    }

    /**
     * Applies the selector and returns all results.
     *
     * @param selector selector to apply; an {@code ElementSelector} is run against the
     *                 parsed element, any other selector against the raw text
     * @return whatever the selector's {@code selectList} call returns
     */
    public List<String> selectList(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.selectList(getElement());
        } else {
            return selector.selectList(getText());
        }
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
194518fd
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
@@ -72,17 +73,22 @@ public class Html extends PlainText {
...
@@ -72,17 +73,22 @@ public class Html extends PlainText {
@Override
@Override
public
Selectable
xpath
(
String
xpath
)
{
public
Selectable
xpath
(
String
xpath
)
{
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
EnvironmentUtil
.
useXsoup
())
{
if
(
document
!=
null
){
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
if
(
document
!=
null
)
{
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
}
return
selectList
(
xsoupSelector
,
strings
);
}
else
{
XpathSelector
xpathSelector
=
new
XpathSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
}
return
selectList
(
xsoupSelector
,
strings
);
}
}
@Override
@Override
public
Selectable
$
(
String
selector
)
{
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
)
{
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
...
@@ -91,7 +97,7 @@ public class Html extends PlainText {
...
@@ -91,7 +97,7 @@ public class Html extends PlainText {
@Override
@Override
public
Selectable
$
(
String
selector
,
String
attrName
)
{
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
if
(
document
!=
null
)
{
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
...
@@ -102,15 +108,17 @@ public class Html extends PlainText {
...
@@ -102,15 +108,17 @@ public class Html extends PlainText {
}
}
public
String
getText
()
{
public
String
getText
()
{
if
(
strings
!=
null
&&
strings
.
size
()>
0
){
return
strings
.
get
(
0
);
}
return
document
.
html
();
return
document
.
html
();
}
}
/**
/**
*
* @param selector
* @param selector
* @return
* @return
*/
*/
public
String
select
(
Selector
selector
)
{
public
String
select
Document
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
select
(
getDocument
());
return
elementSelector
.
select
(
getDocument
());
...
@@ -119,7 +127,7 @@ public class Html extends PlainText {
...
@@ -119,7 +127,7 @@ public class Html extends PlainText {
}
}
}
}
public
List
<
String
>
selectList
(
Selector
selector
)
{
public
List
<
String
>
select
DocumentFor
List
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
selectList
(
getDocument
());
return
elementSelector
.
selectList
(
getDocument
());
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
0 → 100644
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.apache.commons.lang3.BooleanUtils
;
import
java.util.Properties
;
/**
 * Static access to the Xsoup on/off flag, which is stored as a JVM system property.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public abstract class EnvironmentUtil {

    /** Name of the system property that stores the Xsoup flag. */
    private static final String USE_XSOUP = "xsoup";

    /**
     * Reports whether Xsoup should be used.
     *
     * @return {@code true} when the system property is absent or does not hold a string;
     *         otherwise the result of {@code BooleanUtils.toBoolean} on its value
     */
    public static boolean useXsoup() {
        // System.getProperty returns null for a missing key and for a non-String value,
        // so an unexpected value type falls back to the default instead of failing a cast.
        String flag = System.getProperty(USE_XSOUP);
        if (flag == null) {
            return true;
        }
        // BooleanUtils.toBoolean(String) already ignores case; no lower-casing needed.
        return BooleanUtils.toBoolean(flag);
    }

    /**
     * Stores the Xsoup flag in the system properties as {@code "true"} or {@code "false"}.
     *
     * @param useXsoup the new value of the flag
     */
    public static void setUseXsoup(boolean useXsoup) {
        System.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
0 → 100644
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.junit.Test
;
import
static
junit
.
framework
.
Assert
.*;
/**
 * Checks the initial and the switched-off state of the Xsoup flag in {@link EnvironmentUtil}.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /**
     * Expects the flag to read as enabled at the start and as disabled after
     * {@code setUseXsoup(false)}.
     */
    @Test
    public void test() {
        try {
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
        } finally {
            // The flag is kept in a JVM-wide system property; re-enable it so the
            // switch-off above does not carry over into tests that run afterwards.
            EnvironmentUtil.setUseXsoup(true);
        }
    }
}
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
194518fd
package
us
.
codecraft
.
webmagic
.
model
;
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.nodes.Element
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.model.annotation.*
;
import
us.codecraft.webmagic.model.annotation.*
;
import
us.codecraft.webmagic.selector.*
;
import
us.codecraft.webmagic.selector.*
;
...
@@ -185,13 +184,13 @@ class PageModelExtractor {
...
@@ -185,13 +184,13 @@ class PageModelExtractor {
return
null
;
return
null
;
}
}
if
(
objectExtractor
==
null
)
{
if
(
objectExtractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
()
);
return
processSingle
(
page
,
null
,
false
);
}
else
{
}
else
{
if
(
objectExtractor
.
multi
)
{
if
(
objectExtractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
objectExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
List
<
String
>
list
=
objectExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
Object
o
=
processSingle
(
page
,
s
,
false
);
if
(
o
!=
null
)
{
if
(
o
!=
null
)
{
os
.
add
(
o
);
os
.
add
(
o
);
}
}
...
@@ -199,19 +198,13 @@ class PageModelExtractor {
...
@@ -199,19 +198,13 @@ class PageModelExtractor {
return
os
;
return
os
;
}
else
{
}
else
{
String
select
=
objectExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
String
select
=
objectExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
Object
o
=
processSingle
(
page
,
select
,
false
);
return
o
;
return
o
;
}
}
}
}
}
}
private
List
<
String
>
select
(
Selector
selector
,
Element
element
,
String
html
){
private
Object
processSingle
(
Page
page
,
String
html
,
boolean
isRaw
)
{
if
(
selector
instanceof
ElementSelector
){
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
Object
o
=
null
;
Object
o
=
null
;
try
{
try
{
o
=
clazz
.
newInstance
();
o
=
clazz
.
newInstance
();
...
@@ -220,10 +213,14 @@ class PageModelExtractor {
...
@@ -220,10 +213,14 @@ class PageModelExtractor {
List
<
String
>
value
;
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
...
@@ -239,10 +236,14 @@ class PageModelExtractor {
...
@@ -239,10 +236,14 @@ class PageModelExtractor {
String
value
;
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
View file @
194518fd
...
@@ -8,6 +8,7 @@ import java.util.List;
...
@@ -8,6 +8,7 @@ import java.util.List;
/**
/**
* Tools for annotation converting. <br>
* Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.1
* @since 0.2.1
*/
*/
...
@@ -24,17 +25,27 @@ public class ExtractorUtils {
...
@@ -24,17 +25,27 @@ public class ExtractorUtils {
selector
=
new
RegexSelector
(
value
);
selector
=
new
RegexSelector
(
value
);
break
;
break
;
case
XPath:
case
XPath:
selector
=
new
Xsoup
Selector
(
value
);
selector
=
getXpath
Selector
(
value
);
break
;
break
;
default
:
default
:
selector
=
new
XsoupSelector
(
value
);
selector
=
getXpathSelector
(
value
);
}
return
selector
;
}
private
static
Selector
getXpathSelector
(
String
value
)
{
Selector
selector
;
if
(
EnvironmentUtil
.
useXsoup
())
{
selector
=
new
XsoupSelector
(
value
);
}
else
{
selector
=
new
XpathSelector
(
value
);
}
}
return
selector
;
return
selector
;
}
}
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
if
(
extractBies
==
null
)
{
if
(
extractBies
==
null
)
{
return
selectors
;
return
selectors
;
}
}
for
(
ExtractBy
extractBy
:
extractBies
)
{
for
(
ExtractBy
extractBy
:
extractBies
)
{
...
...
webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java
0 → 100644
View file @
194518fd
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment