Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
59aad6a7
Commit
59aad6a7
authored
Aug 17, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
comments in english
parent
e566a539
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
63 additions
and
72 deletions
+63
-72
MultiPageModel.java
...n/src/main/java/us/codecraft/webmagic/MultiPageModel.java
+2
-2
FileCache.java
...main/java/us/codecraft/webmagic/downloader/FileCache.java
+0
-1
Experimental.java
.../us/codecraft/webmagic/model/annotation/Experimental.java
+8
-0
JsonFilePageModelPipeline.java
...odecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
+6
-11
JsonFilePipeline.java
...java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
+3
-9
MultiPagePipeline.java
...ava/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
+23
-21
FileCacheQueueScheduler.java
...codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+7
-7
RedisScheduler.java
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+2
-3
JsonPathSelector.java
...java/us/codecraft/webmagic/selector/JsonPathSelector.java
+1
-2
DoubleKeyMap.java
...c/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+0
-1
MultiKeyMapBase.java
...ain/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
+1
-1
RedisSchedulerTest.java
...a/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
+0
-2
JsonPathSelectorTest.java
.../us/codecraft/webmagic/selector/JsonPathSelectorTest.java
+0
-2
QuickStarter.java
...rc/main/java/us/codecraft/webmagic/main/QuickStarter.java
+2
-2
News163.java
...ain/java/us/codecraft/webmagic/model/samples/News163.java
+6
-6
PagedModel-cmnt.xml
zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml
+1
-1
PagedPipeline-cmnt.xml
...ocs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml
+1
-1
No files found.
webmagic-extension/src/main/java/us/codecraft/webmagic/
Paged
Model.java
→
webmagic-extension/src/main/java/us/codecraft/webmagic/
MultiPage
Model.java
View file @
59aad6a7
...
@@ -8,7 +8,7 @@ import java.util.Collection;
...
@@ -8,7 +8,7 @@ import java.util.Collection;
* Date: 13-8-4 <br>
* Date: 13-8-4 <br>
* Time: 下午5:18 <br>
* Time: 下午5:18 <br>
*/
*/
public
interface
Paged
Model
{
public
interface
MultiPage
Model
{
public
String
getPageKey
();
public
String
getPageKey
();
...
@@ -16,6 +16,6 @@ public interface PagedModel {
...
@@ -16,6 +16,6 @@ public interface PagedModel {
public
String
getPage
();
public
String
getPage
();
public
PagedModel
combine
(
PagedModel
paged
Model
);
public
MultiPageModel
combine
(
MultiPageModel
multiPage
Model
);
}
}
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
View file @
59aad6a7
...
@@ -17,7 +17,6 @@ import java.io.*;
...
@@ -17,7 +17,6 @@ import java.io.*;
/**
/**
* Downloads a file and saves it to a local file as a cache.<br>
* Downloads a file and saves it to a local file as a cache.<br>
*
*
*
* @author code4crafter@gmail.com
* @author code4crafter@gmail.com
* @since 0.2.1
* @since 0.2.1
*/
*/
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java
0 → 100644
View file @
59aad6a7
package
us
.
codecraft
.
webmagic
.
model
.
annotation
;
/**
* Marker annotation for features that are not yet stable.
*
* @author code4crafter@gmail.com <br>
*/
public
@interface
Experimental
{
}
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
View file @
59aad6a7
...
@@ -14,29 +14,24 @@ import java.io.IOException;
...
@@ -14,29 +14,24 @@ import java.io.IOException;
import
java.io.PrintWriter
;
import
java.io.PrintWriter
;
/**
/**
* JSON格式持久化到文件的接口。<br>
* Stores result objects (page models) to files in JSON format.<br>
* 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8。<br>
* Uses the model's key() as the file name if the model implements HasKey.<br>
* Otherwise uses the MD5 hex digest of the model's reflective string form as the file name.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.2.0
* Time: 下午6:28
*/
*/
public
class
JsonFilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
public
class
JsonFilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
/**
/**
*
新建一个JsonFilePageModelPipeline,使用默认保存路径
"/data/webmagic/"
*
new JsonFilePageModelPipeline with default path
"/data/webmagic/"
*/
*/
public
JsonFilePageModelPipeline
()
{
public
JsonFilePageModelPipeline
()
{
setPath
(
"/data/webmagic/"
);
setPath
(
"/data/webmagic/"
);
}
}
/**
* 新建一个JsonFilePageModelPipeline
*
* @param path 文件保存路径
*/
public
JsonFilePageModelPipeline
(
String
path
)
{
public
JsonFilePageModelPipeline
(
String
path
)
{
setPath
(
path
);
setPath
(
path
);
}
}
...
@@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
...
@@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
try
{
try
{
String
filename
;
String
filename
;
if
(
o
instanceof
HasKey
)
{
if
(
o
instanceof
HasKey
)
{
filename
=
path
+
((
HasKey
)
o
).
key
()
+
".json"
;
filename
=
path
+
((
HasKey
)
o
).
key
()
+
".json"
;
}
else
{
}
else
{
filename
=
path
+
DigestUtils
.
md5Hex
(
ToStringBuilder
.
reflectionToString
(
o
))
+
".json"
;
filename
=
path
+
DigestUtils
.
md5Hex
(
ToStringBuilder
.
reflectionToString
(
o
))
+
".json"
;
}
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
View file @
59aad6a7
...
@@ -13,28 +13,22 @@ import java.io.IOException;
...
@@ -13,28 +13,22 @@ import java.io.IOException;
import
java.io.PrintWriter
;
import
java.io.PrintWriter
;
/**
/**
*
JSON格式持久化到文件的接口。
*
Stores results to files in JSON format.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.2.0
* Time: 下午6:28
*/
*/
public
class
JsonFilePipeline
extends
FilePersistentBase
implements
Pipeline
{
public
class
JsonFilePipeline
extends
FilePersistentBase
implements
Pipeline
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
/**
/**
*
新建一个JsonFilePipeline,使用默认保存路径
"/data/webmagic/"
*
new JsonFilePipeline with default path
"/data/webmagic/"
*/
*/
public
JsonFilePipeline
()
{
public
JsonFilePipeline
()
{
setPath
(
"/data/webmagic"
);
setPath
(
"/data/webmagic"
);
}
}
/**
* 新建一个JsonFilePipeline
*
* @param path 文件保存路径
*/
public
JsonFilePipeline
(
String
path
)
{
public
JsonFilePipeline
(
String
path
)
{
setPath
(
path
);
setPath
(
path
);
}
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/
Paged
Pipeline.java
→
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/
MultiPage
Pipeline.java
View file @
59aad6a7
package
us
.
codecraft
.
webmagic
.
pipeline
;
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
us.codecraft.webmagic.
Paged
Model
;
import
us.codecraft.webmagic.
MultiPage
Model
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.model.annotation.Experimental
;
import
us.codecraft.webmagic.utils.DoubleKeyMap
;
import
us.codecraft.webmagic.utils.DoubleKeyMap
;
import
java.util.*
;
import
java.util.*
;
import
java.util.concurrent.ConcurrentHashMap
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
/**
* 用于实现分页的Pipeline。<br>
* A pipeline that combines results spread over more than one page into a single result.<br>
* 在使用redis做分布式爬虫时,请不要使用此功能。<br>
* Used for news and articles containing more than one web page. <br>
* MultiPagePipeline will store parts of object and output them when all parts are extracted.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* @since 0.2.0
* Time: 下午5:15 <br>
*/
*/
public
class
PagedPipeline
implements
Pipeline
{
@Experimental
public
class
MultiPagePipeline
implements
Pipeline
{
private
DoubleKeyMap
<
String
,
String
,
Boolean
>
pageMap
=
new
DoubleKeyMap
<
String
,
String
,
Boolean
>(
ConcurrentHashMap
.
class
);
private
DoubleKeyMap
<
String
,
String
,
Boolean
>
pageMap
=
new
DoubleKeyMap
<
String
,
String
,
Boolean
>(
ConcurrentHashMap
.
class
);
private
DoubleKeyMap
<
String
,
String
,
PagedModel
>
objectMap
=
new
DoubleKeyMap
<
String
,
String
,
Paged
Model
>(
ConcurrentHashMap
.
class
);
private
DoubleKeyMap
<
String
,
String
,
MultiPageModel
>
objectMap
=
new
DoubleKeyMap
<
String
,
String
,
MultiPage
Model
>(
ConcurrentHashMap
.
class
);
@Override
@Override
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
public
void
process
(
ResultItems
resultItems
,
Task
task
)
{
...
@@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
...
@@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
private
void
handleObject
(
Iterator
<
Map
.
Entry
<
String
,
Object
>>
iterator
)
{
private
void
handleObject
(
Iterator
<
Map
.
Entry
<
String
,
Object
>>
iterator
)
{
Map
.
Entry
<
String
,
Object
>
objectEntry
=
iterator
.
next
();
Map
.
Entry
<
String
,
Object
>
objectEntry
=
iterator
.
next
();
Object
o
=
objectEntry
.
getValue
();
Object
o
=
objectEntry
.
getValue
();
if
(
o
instanceof
Paged
Model
)
{
if
(
o
instanceof
MultiPage
Model
)
{
PagedModel
pagedModel
=
(
Paged
Model
)
o
;
MultiPageModel
multiPageModel
=
(
MultiPage
Model
)
o
;
pageMap
.
put
(
pagedModel
.
getPageKey
(),
paged
Model
.
getPage
(),
Boolean
.
TRUE
);
pageMap
.
put
(
multiPageModel
.
getPageKey
(),
multiPage
Model
.
getPage
(),
Boolean
.
TRUE
);
if
(
paged
Model
.
getOtherPages
()
!=
null
)
{
if
(
multiPage
Model
.
getOtherPages
()
!=
null
)
{
for
(
String
otherPage
:
paged
Model
.
getOtherPages
())
{
for
(
String
otherPage
:
multiPage
Model
.
getOtherPages
())
{
Boolean
aBoolean
=
pageMap
.
get
(
paged
Model
.
getPageKey
(),
otherPage
);
Boolean
aBoolean
=
pageMap
.
get
(
multiPage
Model
.
getPageKey
(),
otherPage
);
if
(
aBoolean
==
null
)
{
if
(
aBoolean
==
null
)
{
pageMap
.
put
(
paged
Model
.
getPageKey
(),
otherPage
,
Boolean
.
FALSE
);
pageMap
.
put
(
multiPage
Model
.
getPageKey
(),
otherPage
,
Boolean
.
FALSE
);
}
}
}
}
}
}
//check if all pages are processed
//check if all pages are processed
Map
<
String
,
Boolean
>
booleanMap
=
pageMap
.
get
(
paged
Model
.
getPageKey
());
Map
<
String
,
Boolean
>
booleanMap
=
pageMap
.
get
(
multiPage
Model
.
getPageKey
());
objectMap
.
put
(
pagedModel
.
getPageKey
(),
pagedModel
.
getPage
(),
paged
Model
);
objectMap
.
put
(
multiPageModel
.
getPageKey
(),
multiPageModel
.
getPage
(),
multiPage
Model
);
if
(
booleanMap
==
null
)
{
if
(
booleanMap
==
null
)
{
return
;
return
;
}
}
...
@@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
...
@@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
return
;
return
;
}
}
}
}
List
<
Map
.
Entry
<
String
,
PagedModel
>>
entryList
=
new
ArrayList
<
Map
.
Entry
<
String
,
Paged
Model
>>();
List
<
Map
.
Entry
<
String
,
MultiPageModel
>>
entryList
=
new
ArrayList
<
Map
.
Entry
<
String
,
MultiPage
Model
>>();
entryList
.
addAll
(
objectMap
.
get
(
paged
Model
.
getPageKey
()).
entrySet
());
entryList
.
addAll
(
objectMap
.
get
(
multiPage
Model
.
getPageKey
()).
entrySet
());
if
(
entryList
.
size
()
!=
0
)
{
if
(
entryList
.
size
()
!=
0
)
{
Collections
.
sort
(
entryList
,
new
Comparator
<
Map
.
Entry
<
String
,
Paged
Model
>>()
{
Collections
.
sort
(
entryList
,
new
Comparator
<
Map
.
Entry
<
String
,
MultiPage
Model
>>()
{
@Override
@Override
public
int
compare
(
Map
.
Entry
<
String
,
PagedModel
>
o1
,
Map
.
Entry
<
String
,
Paged
Model
>
o2
)
{
public
int
compare
(
Map
.
Entry
<
String
,
MultiPageModel
>
o1
,
Map
.
Entry
<
String
,
MultiPage
Model
>
o2
)
{
try
{
try
{
int
i1
=
Integer
.
parseInt
(
o1
.
getKey
());
int
i1
=
Integer
.
parseInt
(
o1
.
getKey
());
int
i2
=
Integer
.
parseInt
(
o2
.
getKey
());
int
i2
=
Integer
.
parseInt
(
o2
.
getKey
());
...
@@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
...
@@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
}
}
}
}
});
});
Paged
Model
value
=
entryList
.
get
(
0
).
getValue
();
MultiPage
Model
value
=
entryList
.
get
(
0
).
getValue
();
for
(
int
i
=
1
;
i
<
entryList
.
size
();
i
++)
{
for
(
int
i
=
1
;
i
<
entryList
.
size
();
i
++)
{
value
=
value
.
combine
(
entryList
.
get
(
i
).
getValue
());
value
=
value
.
combine
(
entryList
.
get
(
i
).
getValue
());
}
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
View file @
59aad6a7
...
@@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
...
@@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
/**
* 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
* Stores URLs and the cursor in files so that a Spider can resume its state after a shutdown.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.2.0
* Time: 下午1:13
*/
*/
public
class
FileCacheQueueScheduler
implements
Scheduler
{
public
class
FileCacheQueueScheduler
implements
Scheduler
{
...
@@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler {
...
@@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler {
private
Set
<
String
>
urls
;
private
Set
<
String
>
urls
;
public
FileCacheQueueScheduler
(
String
filePath
)
{
public
FileCacheQueueScheduler
(
String
filePath
)
{
if
(!
filePath
.
endsWith
(
"/"
)
&&!
filePath
.
endsWith
(
"\\"
))
{
if
(!
filePath
.
endsWith
(
"/"
)
&&
!
filePath
.
endsWith
(
"\\"
))
{
filePath
+=
"/"
;
filePath
+=
"/"
;
}
}
this
.
filePath
=
filePath
;
this
.
filePath
=
filePath
;
}
}
...
@@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler {
...
@@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile
();
readCursorFile
();
readUrlFile
();
readUrlFile
();
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
logger
.
error
(
"init file error"
,
e
);
logger
.
error
(
"init file error"
,
e
);
}
}
}
}
...
@@ -122,7 +122,7 @@ public class FileCacheQueueScheduler implements Scheduler {
...
@@ -122,7 +122,7 @@ public class FileCacheQueueScheduler implements Scheduler {
}
}
private
String
getFileName
(
String
filename
)
{
private
String
getFileName
(
String
filename
)
{
return
filePath
+
task
.
getUUID
()
+
filename
;
return
filePath
+
task
.
getUUID
()
+
filename
;
}
}
@Override
@Override
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
View file @
59aad6a7
...
@@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request;
...
@@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
*
使用redis管理url,构建一个分布式的爬虫
。<br>
*
Use Redis as url scheduler for distributed crawlers
.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* @since 0.2.0
* Time: 上午7:07 <br>
*/
*/
public
class
RedisScheduler
implements
Scheduler
{
public
class
RedisScheduler
implements
Scheduler
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
View file @
59aad6a7
...
@@ -6,9 +6,8 @@ import java.util.ArrayList;
...
@@ -6,9 +6,8 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* JsonPath
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-12 <br>
* Time: 下午12:54 <br>
*/
*/
public
class
JsonPathSelector
implements
Selector
{
public
class
JsonPathSelector
implements
Selector
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
View file @
59aad6a7
...
@@ -4,7 +4,6 @@ import java.util.Map;
...
@@ -4,7 +4,6 @@ import java.util.Map;
/**
/**
* @author code4crafter@gmail.com
* @author code4crafter@gmail.com
* Date Dec 14, 2012
*/
*/
public
class
DoubleKeyMap
<
K1
,
K2
,
V
>
extends
MultiKeyMapBase
{
public
class
DoubleKeyMap
<
K1
,
K2
,
V
>
extends
MultiKeyMapBase
{
private
Map
<
K1
,
Map
<
K2
,
V
>>
map
;
private
Map
<
K1
,
Map
<
K2
,
V
>>
map
;
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
View file @
59aad6a7
...
@@ -9,7 +9,7 @@ import java.util.HashMap;
...
@@ -9,7 +9,7 @@ import java.util.HashMap;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
* multikey map, some basic objects *
* multi
-
key map, some basic objects *
*
*
* @author yihua.huang
* @author yihua.huang
*/
*/
...
...
webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
View file @
59aad6a7
...
@@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task;
...
@@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 上午7:51 <br>
*/
*/
public
class
RedisSchedulerTest
{
public
class
RedisSchedulerTest
{
...
...
webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
View file @
59aad6a7
...
@@ -7,8 +7,6 @@ import java.util.List;
...
@@ -7,8 +7,6 @@ import java.util.List;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-12 <br>
* Time: 下午1:12 <br>
*/
*/
public
class
JsonPathSelectorTest
{
public
class
JsonPathSelectorTest
{
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
View file @
59aad6a7
...
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog;
...
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog;
import
us.codecraft.webmagic.model.samples.News163
;
import
us.codecraft.webmagic.model.samples.News163
;
import
us.codecraft.webmagic.model.samples.OschinaBlog
;
import
us.codecraft.webmagic.model.samples.OschinaBlog
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.
Paged
Pipeline
;
import
us.codecraft.webmagic.pipeline.
MultiPage
Pipeline
;
import
java.util.LinkedHashMap
;
import
java.util.LinkedHashMap
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -40,7 +40,7 @@ public class QuickStarter {
...
@@ -40,7 +40,7 @@ public class QuickStarter {
key
=
readKey
(
key
);
key
=
readKey
(
key
);
System
.
out
.
println
(
"The demo started and will last 20 seconds..."
);
System
.
out
.
println
(
"The demo started and will last 20 seconds..."
);
//Start spider
//Start spider
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
urlMap
.
get
(
key
)),
clazzMap
.
get
(
key
)).
pipeline
(
new
Paged
Pipeline
()).
pipeline
(
new
ConsolePipeline
()).
runAsync
();
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
urlMap
.
get
(
key
)),
clazzMap
.
get
(
key
)).
pipeline
(
new
MultiPage
Pipeline
()).
pipeline
(
new
ConsolePipeline
()).
runAsync
();
try
{
try
{
Thread
.
sleep
(
20000
);
Thread
.
sleep
(
20000
);
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
View file @
59aad6a7
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.
Paged
Model
;
import
us.codecraft.webmagic.
MultiPage
Model
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.model.OOSpider
;
import
us.codecraft.webmagic.model.OOSpider
;
import
us.codecraft.webmagic.model.annotation.ComboExtract
;
import
us.codecraft.webmagic.model.annotation.ComboExtract
;
...
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
...
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
import
us.codecraft.webmagic.model.annotation.ExtractByUrl
;
import
us.codecraft.webmagic.model.annotation.ExtractByUrl
;
import
us.codecraft.webmagic.model.annotation.TargetUrl
;
import
us.codecraft.webmagic.model.annotation.TargetUrl
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.
Paged
Pipeline
;
import
us.codecraft.webmagic.pipeline.
MultiPage
Pipeline
;
import
us.codecraft.webmagic.scheduler.RedisScheduler
;
import
us.codecraft.webmagic.scheduler.RedisScheduler
;
import
java.util.Collection
;
import
java.util.Collection
;
...
@@ -20,7 +20,7 @@ import java.util.List;
...
@@ -20,7 +20,7 @@ import java.util.List;
* Time: 下午8:17 <br>
* Time: 下午8:17 <br>
*/
*/
@TargetUrl
(
"http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html"
)
@TargetUrl
(
"http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html"
)
public
class
News163
implements
Paged
Model
{
public
class
News163
implements
MultiPage
Model
{
@ExtractByUrl
(
"http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html"
)
@ExtractByUrl
(
"http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html"
)
private
String
pageKey
;
private
String
pageKey
;
...
@@ -58,10 +58,10 @@ public class News163 implements PagedModel {
...
@@ -58,10 +58,10 @@ public class News163 implements PagedModel {
}
}
@Override
@Override
public
PagedModel
combine
(
PagedModel
paged
Model
)
{
public
MultiPageModel
combine
(
MultiPageModel
multiPage
Model
)
{
News163
news163
=
new
News163
();
News163
news163
=
new
News163
();
news163
.
title
=
this
.
title
;
news163
.
title
=
this
.
title
;
News163
pagedModel1
=
(
News163
)
paged
Model
;
News163
pagedModel1
=
(
News163
)
multiPage
Model
;
news163
.
content
=
this
.
content
+
pagedModel1
.
content
;
news163
.
content
=
this
.
content
+
pagedModel1
.
content
;
return
news163
;
return
news163
;
}
}
...
@@ -77,7 +77,7 @@ public class News163 implements PagedModel {
...
@@ -77,7 +77,7 @@ public class News163 implements PagedModel {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"http://news.163.com/13/0802/05/958I1E330001124J_2.html"
),
News163
.
class
)
OOSpider
.
create
(
Site
.
me
().
addStartUrl
(
"http://news.163.com/13/0802/05/958I1E330001124J_2.html"
),
News163
.
class
)
.
scheduler
(
new
RedisScheduler
(
"localhost"
)).
clearPipeline
().
pipeline
(
new
Paged
Pipeline
()).
pipeline
(
new
ConsolePipeline
()).
run
();
.
scheduler
(
new
RedisScheduler
(
"localhost"
)).
clearPipeline
().
pipeline
(
new
MultiPage
Pipeline
()).
pipeline
(
new
ConsolePipeline
()).
run
();
}
}
}
}
zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml
View file @
59aad6a7
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
<date-generated>
Sat Aug 17 14:14:45 CST 2013
</date-generated>
<date-generated>
Sat Aug 17 14:14:45 CST 2013
</date-generated>
</meta>
</meta>
<comment>
<comment>
<key>
<![CDATA[us.codecraft.webmagic.
Paged
Model]]>
</key>
<key>
<![CDATA[us.codecraft.webmagic.
MultiPage
Model]]>
</key>
<data>
<![CDATA[ 实现此接口以进行支持爬虫分页抓取。<br>
<data>
<![CDATA[ 实现此接口以进行支持爬虫分页抓取。<br>
@author code4crafter@gmail.com
<br>
@author code4crafter@gmail.com
<br>
Date: 13-8-4
<br>
Date: 13-8-4
<br>
...
...
zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml
View file @
59aad6a7
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
<date-generated>
Sat Aug 17 14:14:46 CST 2013
</date-generated>
<date-generated>
Sat Aug 17 14:14:46 CST 2013
</date-generated>
</meta>
</meta>
<comment>
<comment>
<key>
<![CDATA[us.codecraft.webmagic.pipeline.
Paged
Pipeline]]>
</key>
<key>
<![CDATA[us.codecraft.webmagic.pipeline.
MultiPage
Pipeline]]>
</key>
<data>
<![CDATA[ 用于实现分页的Pipeline。<br>
<data>
<![CDATA[ 用于实现分页的Pipeline。<br>
在使用redis做分布式爬虫时,请不要使用此功能。
<br>
在使用redis做分布式爬虫时,请不要使用此功能。
<br>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment