Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
6fa82a41
Commit
6fa82a41
authored
Nov 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#29 seed urls with more information
parent
1446ada7
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
131 additions
and
38 deletions
+131
-38
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+75
-29
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+35
-7
OschinaBlogPageProcesser.java
.../webmagic/processor/example/OschinaBlogPageProcesser.java
+1
-1
UrlUtils.java
...e/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+20
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
6fa82a41
...
...
@@ -24,7 +24,7 @@ public class Site {
/**
* startUrls is the urls the crawler to start with.
*/
private
List
<
String
>
startUrls
=
new
ArrayList
<
String
>();
private
List
<
Request
>
startRequests
=
new
ArrayList
<
Request
>();
private
int
sleepTime
=
3000
;
...
...
@@ -38,7 +38,7 @@ public class Site {
private
Set
<
Integer
>
acceptStatCode
=
DEFAULT_STATUS_CODE_SET
;
private
Map
<
String
,
String
>
headers
=
new
HashMap
<
String
,
String
>();
private
Map
<
String
,
String
>
headers
=
new
HashMap
<
String
,
String
>();
public
static
interface
HeaderConst
{
...
...
@@ -182,9 +182,16 @@ public class Site {
* get start urls
*
* @return start urls
* @see #getStartRequests
* @deprecated
*/
@Deprecated
public List<String> getStartUrls() {
    // The start urls are no longer stored on their own: they are derived
    // from the start requests, so this deprecated accessor converts on demand.
    return UrlUtils.convertToUrls(startRequests);
}
/**
 * Get the requests the crawler starts with.
 *
 * @return the start requests held by this site (the internal list itself, not a copy)
 */
public List<Request> getStartRequests() {
    return this.startRequests;
}
/**
...
...
@@ -194,11 +201,19 @@ public class Site {
* @return this
*/
public
Site
addStartUrl
(
String
startUrl
)
{
this
.
startUrls
.
add
(
startUrl
);
if
(
domain
==
null
)
{
if
(
startUrls
.
size
()
>
0
)
{
domain
=
UrlUtils
.
getDomain
(
startUrls
.
get
(
0
));
return
addStartRequest
(
new
Request
(
startUrl
));
}
/**
 * Add a request to the start requests.<br>
 * When no domain has been set yet and the request has a url, the domain is
 * taken from that url via {@code UrlUtils.getDomain}.
 *
 * @param startRequest the request to start crawling with
 * @return this
 */
public Site addStartRequest(Request startRequest) {
    this.startRequests.add(startRequest);
    // Nothing more to do when a domain is already known or the request has no url.
    if (domain != null || startRequest.getUrl() == null) {
        return this;
    }
    domain = UrlUtils.getDomain(startRequest.getUrl());
    return this;
}
...
...
@@ -241,12 +256,13 @@ public class Site {
/**
* Put an Http header for downloader. <br/>
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
*
* @param key key of http header, there are some keys constant in {@link HeaderConst}
* @param value value of header
* @return
*/
public
Site
addHeader
(
String
key
,
String
value
){
headers
.
put
(
key
,
value
);
public
Site
addHeader
(
String
key
,
String
value
)
{
headers
.
put
(
key
,
value
);
return
this
;
}
...
...
@@ -279,6 +295,20 @@ public class Site {
return
this
;
}
/**
 * Expose this site as a {@code Task}.
 *
 * @return a task whose UUID is this site's domain and whose site is this instance
 */
public Task toTask() {
    final Site owner = this;
    return new Task() {
        @Override
        public String getUUID() {
            return owner.getDomain();
        }

        @Override
        public Site getSite() {
            return owner;
        }
    };
}
@Override
public
boolean
equals
(
Object
o
)
{
if
(
this
==
o
)
return
true
;
...
...
@@ -286,37 +316,53 @@ public class Site {
Site
site
=
(
Site
)
o
;
if
(
cycleRetryTimes
!=
site
.
cycleRetryTimes
)
return
false
;
if
(
retryTimes
!=
site
.
retryTimes
)
return
false
;
if
(
sleepTime
!=
site
.
sleepTime
)
return
false
;
if
(
timeOut
!=
site
.
timeOut
)
return
false
;
if
(
acceptStatCode
!=
null
?
!
acceptStatCode
.
equals
(
site
.
acceptStatCode
)
:
site
.
acceptStatCode
!=
null
)
return
false
;
if
(!
domain
.
equals
(
site
.
domain
))
return
false
;
if
(!
startUrls
.
equals
(
site
.
startUrls
))
return
false
;
if
(
charset
!=
null
?
!
charset
.
equals
(
site
.
charset
)
:
site
.
charset
!=
null
)
return
false
;
if
(
cookies
!=
null
?
!
cookies
.
equals
(
site
.
cookies
)
:
site
.
cookies
!=
null
)
return
false
;
if
(
domain
!=
null
?
!
domain
.
equals
(
site
.
domain
)
:
site
.
domain
!=
null
)
return
false
;
if
(
headers
!=
null
?
!
headers
.
equals
(
site
.
headers
)
:
site
.
headers
!=
null
)
return
false
;
if
(
startRequests
!=
null
?
!
startRequests
.
equals
(
site
.
startRequests
)
:
site
.
startRequests
!=
null
)
return
false
;
if
(
userAgent
!=
null
?
!
userAgent
.
equals
(
site
.
userAgent
)
:
site
.
userAgent
!=
null
)
return
false
;
return
true
;
}
/**
 * Expose this site as a {@code Task}.
 *
 * @return a task whose UUID is this site's domain and whose site is this instance
 */
public Task toTask() {
    final Site owner = this;
    return new Task() {
        @Override
        public String getUUID() {
            return owner.getDomain();
        }

        @Override
        public Site getSite() {
            return owner;
        }
    };
}
/**
 * Hash code built from domain, userAgent, cookies, charset, startRequests,
 * sleepTime, retryTimes, cycleRetryTimes, timeOut, acceptStatCode and headers.
 * Every reference field is null-checked, so a site without a domain can still be hashed.
 *
 * @return the combined hash of the fields listed above
 */
@Override
public int hashCode() {
    int result = domain != null ? domain.hashCode() : 0;
    result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
    result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
    result = 31 * result + (charset != null ? charset.hashCode() : 0);
    result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
    result = 31 * result + sleepTime;
    result = 31 * result + retryTimes;
    result = 31 * result + cycleRetryTimes;
    result = 31 * result + timeOut;
    result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
    result = 31 * result + (headers != null ? headers.hashCode() : 0);
    return result;
}
/**
 * Render the site's configuration fields as a single {@code Site{...}} string.
 *
 * @return textual form listing domain, userAgent, cookies, charset, startRequests,
 *         sleepTime, retryTimes, cycleRetryTimes, timeOut, acceptStatCode and headers
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Site{");
    text.append("domain='").append(domain).append('\'');
    text.append(", userAgent='").append(userAgent).append('\'');
    text.append(", cookies=").append(cookies);
    text.append(", charset='").append(charset).append('\'');
    text.append(", startRequests=").append(startRequests);
    text.append(", sleepTime=").append(sleepTime);
    text.append(", retryTimes=").append(retryTimes);
    text.append(", cycleRetryTimes=").append(cycleRetryTimes);
    text.append(", timeOut=").append(timeOut);
    text.append(", acceptStatCode=").append(acceptStatCode);
    text.append(", headers=").append(headers);
    text.append('}');
    return text.toString();
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
6fa82a41
...
...
@@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.io.Closeable
;
import
java.io.IOException
;
...
...
@@ -60,7 +61,7 @@ public class Spider implements Runnable, Task {
protected
PageProcessor
pageProcessor
;
protected
List
<
String
>
startUrl
s
;
protected
List
<
Request
>
startRequest
s
;
protected
Site
site
;
...
...
@@ -107,7 +108,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
    this.pageProcessor = pageProcessor;
    this.site = pageProcessor.getSite();
    // Seed the spider with the site's start requests (the field that replaced startUrls).
    this.startRequests = pageProcessor.getSite().getStartRequests();
}
/**
...
...
@@ -119,7 +120,20 @@ public class Spider implements Runnable, Task {
*/
public Spider startUrls(List<String> startUrls) {
    checkIfRunning();
    // The spider keeps its seeds as requests only, so each url is wrapped into a Request.
    this.startRequests = UrlUtils.convertToRequests(startUrls);
    return this;
}
/**
 * Set the start requests of this Spider.<br>
 * Replaces whatever start requests the spider currently holds; {@code checkIfRunning()}
 * is invoked before the replacement.
 *
 * @param startRequests the requests to start crawling with
 * @return this
 */
public Spider startRequest(List<Request> startRequests) {
    checkIfRunning();
    this.startRequests = startRequests;
    return this;
}
...
...
@@ -231,11 +245,11 @@ public class Spider implements Runnable, Task {
}
downloader
.
setThread
(
threadNum
);
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
if
(
start
Url
s
!=
null
)
{
for
(
String
startUrl
:
startUrl
s
)
{
scheduler
.
push
(
new
Request
(
startUrl
)
,
this
);
if
(
start
Request
s
!=
null
)
{
for
(
Request
request
:
startRequest
s
)
{
scheduler
.
push
(
request
,
this
);
}
start
Url
s
.
clear
();
start
Request
s
.
clear
();
}
}
...
...
@@ -390,6 +404,20 @@ public class Spider implements Runnable, Task {
return
this
;
}
/**
 * Add requests (urls with additional information) to crawl.<br/>
 * Each request is handed to the single-request {@code addRequest}, then
 * {@code signalNewUrl()} is called once after all of them were added.
 *
 * @param requests the requests to add
 * @return this
 */
public Spider addRequest(Request... requests) {
    for (int index = 0; index < requests.length; index++) {
        addRequest(requests[index]);
    }
    signalNewUrl();
    return this;
}
private
void
waitNewUrl
()
{
try
{
newUrlLock
.
lock
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java
View file @
6fa82a41
...
...
@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
/**
 * Example entry point: crawl with {@link OschinaBlogPageProcesser} using two threads.
 *
 * @param args command line arguments (unused)
 */
public static void main(String[] args) {
    // Only the replacement line of the diff is kept, so the example crawl runs once.
    Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
View file @
6fa82a41
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.Request
;
import
java.net.MalformedURLException
;
import
java.net.URL
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
...
...
@@ -18,7 +21,7 @@ public class UrlUtils {
/**
* canonicalizeUrl
*
*
<p/>
* Borrowed from Jsoup.
*
* @param url
...
...
@@ -85,6 +88,22 @@ public class UrlUtils {
return
stringBuilder
.
toString
();
}
/**
 * Wrap every url into a {@code Request}, keeping the iteration order of the input.
 *
 * @param urls urls to convert
 * @return a new list holding one request per url
 */
public static List<Request> convertToRequests(List<String> urls) {
    final List<Request> wrapped = new ArrayList<Request>(urls.size());
    for (final String address : urls) {
        final Request seed = new Request(address);
        wrapped.add(seed);
    }
    return wrapped;
}
/**
 * Extract the url of every request, keeping the iteration order of the input.
 *
 * @param requests requests to convert
 * @return a new list holding {@code getUrl()} of each request
 */
public static List<String> convertToUrls(List<Request> requests) {
    final List<String> addresses = new ArrayList<String>(requests.size());
    for (final Request seed : requests) {
        final String address = seed.getUrl();
        addresses.add(address);
    }
    return addresses;
}
private
static
final
Pattern
patternForCharset
=
Pattern
.
compile
(
"charset\\s*=\\s*['\"]*([^\\s;'\"]*)"
);
public
static
String
getCharset
(
String
contentType
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment