Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
619a12b3
Commit
619a12b3
authored
Aug 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add paged support
parent
a5c85c3c
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
337 additions
and
14 deletions
+337
-14
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+5
-0
ConsolePipeline.java
.../java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+0
-1
DoubleKeyMap.java
...c/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+111
-0
MultiKeyMapBase.java
...ain/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
+42
-0
OschinaBlogComment.java
.../java/us/codecraft/webmagic/model/OschinaBlogComment.java
+0
-13
PagedModel.java
...-misc/src/main/java/us/codecraft/webmagic/PagedModel.java
+20
-0
PagedPipeline.java
...in/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
+78
-0
News163.java
...ain/java/us/codecraft/webmagic/model/samples/News163.java
+81
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
619a12b3
...
...
@@ -283,6 +283,11 @@ public class Spider implements Runnable, Task {
return
this
;
}
/**
 * Drops every pipeline registered so far by replacing the {@code pipelines}
 * field with a new, empty {@link ArrayList}.
 *
 * @return this spider, so that further configuration calls can be chained
 */
public Spider clearPipeline() {
    this.pipelines = new ArrayList<Pipeline>();
    return this;
}
@Override
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
View file @
619a12b3
...
...
@@ -29,7 +29,6 @@ public class ConsolePipeline implements Pipeline{
}
else
{
System
.
out
.
println
(
entry
.
getKey
()
+
":\t"
+
entry
.
getValue
());
}
System
.
out
.
println
(
entry
.
getKey
()+
":\t"
+
entry
.
getValue
());
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
0 → 100755
View file @
619a12b3
package
us
.
codecraft
.
webmagic
.
utils
;
import
java.util.Map
;
/**
 * A two-level map: a first key ({@code K1}) selects a sub-map, and a second key
 * ({@code K2}) selects the value ({@code V}) inside that sub-map.
 * <p>
 * Sub-maps created by this class are instantiated through
 * {@link MultiKeyMapBase#newMap()}, i.e. from the prototype map class handed to
 * the constructor (or the base class default when none is given).
 *
 * @author yihua.huang@dianping.com
 * @date Dec 14, 2012
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {

    /** Backing store: first key -> (second key -> value). */
    private Map<K1, Map<K2, V>> map;

    /**
     * Creates an empty map whose storage uses the default prototype map class.
     */
    public DoubleKeyMap() {
        init();
    }

    /**
     * Wraps an existing two-level map, using the default prototype map class
     * for any sub-map created later.
     *
     * @param map backing map to use; when {@code null} a new one is created
     */
    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    /**
     * Creates an empty map whose outer map and sub-maps are instances of
     * {@code protoMapClass}.
     *
     * @param protoMapClass map implementation to instantiate
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        init();
    }

    /**
     * Wraps an existing two-level map and uses {@code protoMapClass} for any
     * sub-map created later.
     *
     * @param map           backing map to use; when {@code null} a new one is created
     * @param protoMapClass map implementation to instantiate
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /** Lazily creates the backing map when the caller did not supply one. */
    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * Returns the sub-map stored under the first key.
     *
     * @param key first-level key
     * @return the sub-map, or {@code null} when nothing is stored under {@code key}
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * Returns the value stored under the key pair.
     *
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the value, or {@code null} when either level has no mapping
     */
    public V get(K1 key1, K2 key2) {
        Map<K2, V> submap = get(key1);
        if (submap == null) {
            return null;
        }
        return submap.get(key2);
    }

    /**
     * Stores a whole sub-map under the first key, replacing any previous one.
     * <p>
     * The previous implementation called itself unconditionally and therefore
     * always ended in a {@link StackOverflowError}. The declared return type is
     * kept for source and binary compatibility, but there is no single value to
     * report for a replaced sub-map, so the result is always {@code null}.
     *
     * @param key1   first-level key
     * @param submap sub-map to associate with {@code key1}
     * @return always {@code null}
     */
    public V put(K1 key1, Map<K2, V> submap) {
        map.put(key1, submap);
        return null;
    }

    /**
     * Stores a value under the key pair, creating the sub-map on first use.
     *
     * @param key1  first-level key
     * @param key2  second-level key
     * @param value value to store
     * @return the value previously stored under the key pair, as reported by
     *         the sub-map's {@code put}
     */
    public V put(K1 key1, K2 key2, V value) {
        Map<K2, V> submap = map.get(key1);
        if (submap == null) {
            submap = this.<K2, V>newMap();
            map.put(key1, submap);
        }
        return submap.put(key2, value);
    }

    /**
     * Removes the value stored under the key pair.
     *
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the removed value, or {@code null} when there was no sub-map for
     *         {@code key1}
     */
    public V remove(K1 key1, K2 key2) {
        Map<K2, V> submap = get(key1);
        if (submap == null) {
            return null;
        }
        V removed = submap.remove(key2);
        // If the sub-map is now empty, reclaim it as well.
        if (submap.isEmpty()) {
            remove(key1);
        }
        return removed;
    }

    /**
     * Removes the whole sub-map stored under the first key.
     *
     * @param key1 first-level key
     * @return the removed sub-map, or {@code null} when there was none
     */
    public Map<K2, V> remove(K1 key1) {
        return map.remove(key1);
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
0 → 100755
View file @
619a12b3
package
us
.
codecraft
.
webmagic
.
utils
;
/**
* @author yihua.huang@dianping.com
* @date Dec 14, 2012
*/
import
java.util.HashMap
;
import
java.util.Map
;
/**
 * Base class for multi-key maps. It holds the "prototype" map class and
 * creates new map instances of that class on demand.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    /** Map implementation used when the subclass does not choose one. */
    @SuppressWarnings("rawtypes")
    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    /** Map implementation instantiated by {@link #newMap()}. */
    @SuppressWarnings("rawtypes")
    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;

    /**
     * Uses {@link #DEFAULT_CLAZZ} as the prototype map class.
     */
    public MultiKeyMapBase() {
    }

    /**
     * Uses the given class as the prototype map class.
     *
     * @param protoMapClass map implementation to instantiate in {@link #newMap()}
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Creates a new, empty map by reflectively calling the no-argument
     * constructor of the prototype map class.
     *
     * @param <K>  key type of the new map
     * @param <V2> value type of the new map
     * @return a fresh instance of the prototype map class
     * @throws IllegalArgumentException when the prototype class cannot be
     *         instantiated; the reflective failure is attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            // Keep the cause so the caller can see why instantiation failed.
            throw new IllegalArgumentException("wrong proto type map " + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map " + protoMapClass, e);
        }
    }
}
\ No newline at end of file
webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java
deleted
100644 → 0
View file @
a5c85c3c
package
us
.
codecraft
.
webmagic
.
model
;
/**
* Empty model class bound by {@code @TargetUrl} to the blog URL pattern below;
* it declares no fields or methods.
*
* @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br>
* Time: 10:18 PM <br>
*/
@TargetUrl
(
"http://my.oschina.net/flashsword/blog/*"
)
public
class
OschinaBlogComment
{
}
\ No newline at end of file
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java
0 → 100644
View file @
619a12b3
package
us
.
codecraft
.
webmagic
;
import
java.util.Collection
;
/**
* @author yihua.huang@dianping.com <br>
* @date: 13-8-4 <br>
* Time: 下午5:18 <br>
*/
public
interface
PagedModel
{
public
String
getPageKey
();
public
Collection
<
String
>
getOtherPages
();
public
String
getPage
();
public
PagedModel
combine
(
PagedModel
pagedModel
);
}
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
0 → 100644
View file @
619a12b3
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
us.codecraft.webmagic.PagedModel
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.utils.DoubleKeyMap
;
import
java.util.*
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Pipeline that reassembles {@link PagedModel} results spread over several pages.
 * <p>
 * Every paged result is remembered under its page key and page id. While any
 * page of the same key is still pending, the result is removed from the
 * {@link ResultItems} map so that later pipelines do not see a partial object.
 * When no page is pending any more, all remembered pages are sorted, merged
 * with {@link PagedModel#combine(PagedModel)} and the merged object replaces
 * the current entry.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-4 <br>
 * Time: 5:15 PM <br>
 */
public class PagedPipeline implements Pipeline {

    /**
     * Orders pages by their page id: numerically when both ids parse as
     * integers, otherwise by plain string comparison. The integer comparison
     * avoids subtraction, which could overflow for ids of opposite sign.
     */
    private static final Comparator<Map.Entry<String, PagedModel>> PAGE_ORDER =
            new Comparator<Map.Entry<String, PagedModel>>() {
                @Override
                public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                    try {
                        int i1 = Integer.parseInt(o1.getKey());
                        int i2 = Integer.parseInt(o2.getKey());
                        return i1 < i2 ? -1 : (i1 == i2 ? 0 : 1);
                    } catch (NumberFormatException e) {
                        return o1.getKey().compareTo(o2.getKey());
                    }
                }
            };

    /** page key -> (page id -> whether that page has been processed). */
    private DoubleKeyMap<String, String, Boolean> pageMap =
            new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    /** page key -> (page id -> model extracted from that page). */
    private DoubleKeyMap<String, String, PagedModel> objectMap =
            new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    /**
     * Walks over all result entries and handles each one in turn.
     *
     * @param resultItems results of the current page; entries may be removed or replaced
     * @param task        the running task (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> resultItemsAll = resultItems.getAll();
        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
        while (iterator.hasNext()) {
            handleObject(iterator);
        }
    }

    /**
     * Consumes the next entry of the iterator. Entries that are not
     * {@link PagedModel} instances are left untouched.
     *
     * @param iterator iterator over the result entries, positioned before the entry to handle
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (!(o instanceof PagedModel)) {
            return;
        }
        PagedModel pagedModel = (PagedModel) o;
        String pageKey = pagedModel.getPageKey();
        String page = pagedModel.getPage();

        // Mark the page we are looking at as processed. Without this no page
        // ever turns TRUE, so a group that lists other pages can never complete.
        pageMap.put(pageKey, page, Boolean.TRUE);

        // Register every other page not seen yet as pending.
        Collection<String> otherPages = pagedModel.getOtherPages();
        if (otherPages != null) {
            for (String otherPage : otherPages) {
                if (pageMap.get(pageKey, otherPage) == null) {
                    pageMap.put(pageKey, otherPage, Boolean.FALSE);
                }
            }
        }
        objectMap.put(pageKey, page, pagedModel);

        // Check whether all pages are processed; if not, hide the partial result.
        Map<String, Boolean> pageStates = pageMap.get(pageKey);
        if (pageStates == null) {
            return;
        }
        if (pageStates.containsValue(Boolean.FALSE)) {
            iterator.remove();
            return;
        }

        List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
        entryList.addAll(objectMap.get(pageKey).entrySet());
        if (entryList.isEmpty()) {
            return;
        }
        Collections.sort(entryList, PAGE_ORDER);

        // Fold the pages together in page order and publish the merged object.
        PagedModel value = entryList.get(0).getValue();
        for (int i = 1; i < entryList.size(); i++) {
            value = value.combine(entryList.get(i).getValue());
        }
        objectEntry.setValue(value);
    }
}
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
0 → 100644
View file @
619a12b3
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.PagedModel
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.model.*
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.PagedPipeline
;
import
us.codecraft.webmagic.selector.Selectable
;
import
java.util.Collection
;
import
java.util.List
;
/**
 * Sample paged model for multi-page articles on news.163.com.
 * <p>
 * Continuation pages of an article carry a {@code _N} suffix before
 * {@code .html}; that suffix is extracted as the page id, and the part before
 * it as the page key, so that all pages of one article share the same key.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-4 <br>
 * Time: 8:17 PM <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel, AfterExtractor {

    // The key must stop at the first underscore. "\w" matches "_" as well, so
    // the previous "(\w+)*" captured the "_N" page suffix too, gave every page
    // of one article a different key, and the pages were never grouped.
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    // Page number taken from the "_N" suffix; absent on URLs without a suffix.
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    // Page numbers of the other pages, filled in by afterProcess(Page).
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    /**
     * @return the article key shared by all pages of the article
     */
    @Override
    public String getPageKey() {
        return pageKey;
    }

    /**
     * @return page numbers found in the pager links, as collected by
     *         {@link #afterProcess(Page)}
     */
    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /**
     * @return the page number from the URL, or {@code "0"} when the URL has no
     *         page suffix
     */
    @Override
    public String getPage() {
        if (page == null) {
            return "0";
        }
        return page;
    }

    /**
     * Builds a new model whose content is this page's content followed by the
     * content of the given page. The title and page key of this page are
     * carried over so the merged model still identifies the article.
     *
     * @param pagedModel the following page; must be a {@code News163}
     * @return the merged model
     */
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 next = (News163) pagedModel;
        News163 merged = new News163();
        merged.pageKey = this.pageKey;
        merged.title = this.title;
        merged.content = this.content + next.content;
        return merged;
    }

    @Override
    public String toString() {
        return "News163{" +
                "content='" + content + '\'' +
                ", title='" + title + '\'' +
                ", otherPage=" + otherPage +
                '}';
    }

    /**
     * Runs the sample: crawls from one article page, merges the pages with a
     * {@link PagedPipeline} and prints the result with a {@link ConsolePipeline}.
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
                .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
    }

    /**
     * Collects the page numbers of the other pages from the pager links.
     *
     * @param page the page this model was extracted from
     */
    @Override
    public void afterProcess(Page page) {
        Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
        otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment