Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
8b35d795
Commit
8b35d795
authored
Mar 19, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Do not cache document in Selectable for selected Html element #73
parent
6201fd69
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
79 additions
and
32 deletions
+79
-32
schema.sql
...gic-avalon-common/src/main/resources/sql/mysql/schema.sql
+22
-0
DashBoardController.java
...us/codecraft/webmagic/avalon/web/DashBoardController.java
+0
-20
Worker.java
...er/src/main/java/us/codecraft/webmagic/worker/Worker.java
+2
-0
SpiderController.java
...odecraft/webmagic/worker/controller/SpiderController.java
+10
-3
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+19
-9
SelectorTest.java
...est/java/us/codecraft/webmagic/selector/SelectorTest.java
+26
-0
No files found.
webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql
View file @
8b35d795
...
...
@@ -6,4 +6,26 @@ CREATE TABLE `DynamicClass` (
`UpdateTime`
datetime
NOT
NULL
,
PRIMARY
KEY
(
`Id`
),
UNIQUE
KEY
`un_class_name`
(
`ClassName`
)
)
ENGINE
=
InnoDB
DEFAULT
CHARSET
=
utf8
;
-- Spider: one row per configured spider. It references the page processor,
-- pipeline and scheduler it is assembled from by their ids and carries a
-- free-form Config payload.
CREATE TABLE `Spider` (
  -- Surrogate primary key. MySQL allows only ONE AUTO_INCREMENT column per
  -- table and it must be part of a key, so only `Id` is auto-generated.
  `Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Reference ids are supplied by the writer, never auto-generated.
  `PageProcessorId` int(11) unsigned NOT NULL,
  `PipelineId` int(11) unsigned NOT NULL,
  `SchedulerId` int(11) unsigned NOT NULL,
  `Config` text NOT NULL,
  `AddTime` datetime NOT NULL,
  `UpdateTime` datetime NOT NULL,
  -- No UNIQUE KEY on `ClassName` here: this table has no such column, and
  -- indexing a missing column makes the whole statement fail.
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- PageProcessor: one row per page-processor class, identified by a unique
-- class name and stored together with its parameter text.
CREATE TABLE `PageProcessor` (
  `Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `ClassName` varchar(200) NOT NULL,
  `Params` text NOT NULL,
  `AddTime` datetime NOT NULL,
  `UpdateTime` datetime NOT NULL,
  PRIMARY KEY (`Id`),
  -- Each class name may be stored at most once.
  UNIQUE KEY `un_class_name` (`ClassName`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
\ No newline at end of file
webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java
deleted
100644 → 0
View file @
6201fd69
package
us
.
codecraft
.
webmagic
.
avalon
.
web
;
import
org.springframework.stereotype.Controller
;
import
org.springframework.web.bind.annotation.RequestMapping
;
import
org.springframework.web.servlet.ModelAndView
;
/**
 * Spring MVC controller registered under the bean name {@code "dashboard"}
 * and mapped to the root path {@code "/"}.
 *
 * @author code4crafter@gmail.com
 */
@Controller("dashboard")
@RequestMapping("/")
public class DashBoardController {

    /**
     * Handles requests to the controller's root mapping.
     *
     * @return a {@link ModelAndView} that selects the view named
     *         {@code "dashboard"} and carries no model attributes
     */
    @RequestMapping
    public ModelAndView index() {
        return new ModelAndView("dashboard");
    }
}
webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java
View file @
8b35d795
...
...
@@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap;
import
java.util.concurrent.ExecutorService
;
/**
* Container of Spiders.
*
* @author code4crafter@gmail.com
*/
public
class
Worker
{
...
...
webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/
avalon/web
/SpiderController.java
→
webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/
worker/controller
/SpiderController.java
View file @
8b35d795
package
us
.
codecraft
.
webmagic
.
avalon
.
web
;
package
us
.
codecraft
.
webmagic
.
worker
.
controller
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.stereotype.Controller
;
import
org.springframework.web.bind.annotation.RequestMapping
;
import
org.springframework.web.bind.annotation.RequestParam
;
import
org.springframework.web.bind.annotation.ResponseBody
;
import
us.codecraft.webmagic.worker.Worker
;
import
java.util.HashMap
;
import
java.util.Map
;
...
...
@@ -10,15 +13,19 @@ import java.util.Map;
/**
* @author code4crafter@gmail.com
*/
@Controller
(
"spider"
)
@Controller
@RequestMapping
(
"spider"
)
public
class
SpiderController
{
@Autowired
private
Worker
worker
;
@RequestMapping
(
"create"
)
@ResponseBody
public
Map
<
String
,
Object
>
create
()
{
public
Map
<
String
,
Object
>
create
(
@RequestParam
(
"id"
)
String
id
)
{
HashMap
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
map
.
put
(
"code"
,
200
);
return
map
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
8b35d795
...
...
@@ -23,7 +23,7 @@ public class Html extends PlainText {
*/
private
Document
document
;
private
boolean
init
=
fals
e
;
private
boolean
needInitCache
=
tru
e
;
public
Html
(
List
<
String
>
strings
)
{
super
(
strings
);
...
...
@@ -33,12 +33,22 @@ public class Html extends PlainText {
super
(
text
);
}
/**
 * Creates an instance backed by a list of strings.
 *
 * @param strings       the texts handed to the superclass constructor
 * @param needInitCache initial value of the {@code needInitCache} flag; the
 *                      lazy document initialisation only parses while this
 *                      flag is {@code true}
 */
public Html(List<String> strings, boolean needInitCache) {
    super(strings);
    this.needInitCache = needInitCache;
}
/**
 * Creates an instance backed by a single text.
 *
 * @param text          the text handed to the superclass constructor
 * @param needInitCache initial value of the {@code needInitCache} flag; the
 *                      lazy document initialisation only parses while this
 *                      flag is {@code true}
 */
public Html(String text, boolean needInitCache) {
    super(text);
    this.needInitCache = needInitCache;
}
/**
* lazy init
*/
private
void
initDocument
()
{
if
(
this
.
document
==
null
&&
!
init
)
{
init
=
tru
e
;
if
(
this
.
document
==
null
&&
needInitCache
)
{
needInitCache
=
fals
e
;
//just init once whether the parsing succeeds or not
try
{
this
.
document
=
Jsoup
.
parse
(
getText
());
...
...
@@ -67,7 +77,7 @@ public class Html extends PlainText {
results
.
add
(
result
);
}
}
return
new
Html
(
results
);
return
new
Html
(
results
,
false
);
}
@Override
...
...
@@ -78,7 +88,7 @@ public class Html extends PlainText {
List
<
String
>
result
=
selector
.
selectList
(
string
);
results
.
addAll
(
result
);
}
return
new
Html
(
results
);
return
new
Html
(
results
,
false
);
}
@Override
...
...
@@ -95,9 +105,9 @@ public class Html extends PlainText {
@Override
public
Selectable
xpath
(
String
xpath
)
{
XpathSelector
xpathSelector
=
new
XpathSelector
(
xpath
);
XpathSelector
xpathSelector
=
Selectors
.
xpath
(
xpath
);
if
(
document
!=
null
)
{
return
new
Html
(
xpathSelector
.
selectList
(
document
));
return
new
Html
(
xpathSelector
.
selectList
(
document
)
,
false
);
}
return
selectList
(
xpathSelector
,
strings
);
}
...
...
@@ -106,7 +116,7 @@ public class Html extends PlainText {
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
return
new
Html
(
cssSelector
.
selectList
(
document
)
,
false
);
}
return
selectList
(
cssSelector
,
strings
);
}
...
...
@@ -115,7 +125,7 @@ public class Html extends PlainText {
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
return
new
Html
(
cssSelector
.
selectList
(
document
)
,
false
);
}
return
selectList
(
cssSelector
,
strings
);
}
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java
0 → 100644
View file @
8b35d795
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.junit.Test
;
import
java.util.List
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
/**
 * Tests for chained selection on {@link Html}.
 *
 * @author code4crafter@gmail.com
 */
public class SelectorTest {

    /** Fixture: two {@code div} elements, each wrapping one anchor. */
    private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";

    /**
     * Extracts links directly from the page and twice from the result of an
     * XPath selection, then checks that all three lists have the same size,
     * i.e. that repeating the chained call does not change the result count.
     */
    @Test
    public void testChain() throws Exception {
        Html page = new Html(html);

        // Baseline: links taken straight from the whole page.
        List<String> directLinks = page.links().all();

        // Chained: select the divs first, then ask for links twice in a row.
        Selectable divs = page.xpath("//div");
        List<String> firstChainedLinks = divs.links().all();
        List<String> secondChainedLinks = divs.links().all();

        assertThat(directLinks).hasSameSizeAs(firstChainedLinks);
        assertThat(firstChainedLinks).hasSameSizeAs(secondChainedLinks);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment