Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
d141541e
Commit
d141541e
authored
Sep 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add retry
parent
a1ef2523
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
54 additions
and
7 deletions
+54
-7
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+2
-0
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+22
-1
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+16
-1
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+9
-2
RedisScheduler.java
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+5
-3
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
d141541e
...
@@ -17,6 +17,8 @@ public class Request implements Serializable {
...
@@ -17,6 +17,8 @@ public class Request implements Serializable {
private
static
final
long
serialVersionUID
=
2062192774891352043L
;
private
static
final
long
serialVersionUID
=
2062192774891352043L
;
public
static
final
String
CYCLE_TRIED_TIMES
=
"_cycle_tried_times"
;
private
String
url
;
private
String
url
;
/**
/**
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
d141541e
...
@@ -30,6 +30,8 @@ public class Site {
...
@@ -30,6 +30,8 @@ public class Site {
private
int
retryTimes
=
0
;
private
int
retryTimes
=
0
;
private
int
cycleRetryTimes
=
0
;
private
static
final
Set
<
Integer
>
DEFAULT_STATUS_CODE_SET
=
new
HashSet
<
Integer
>();
private
static
final
Set
<
Integer
>
DEFAULT_STATUS_CODE_SET
=
new
HashSet
<
Integer
>();
private
Set
<
Integer
>
acceptStatCode
=
DEFAULT_STATUS_CODE_SET
;
private
Set
<
Integer
>
acceptStatCode
=
DEFAULT_STATUS_CODE_SET
;
...
@@ -200,7 +202,7 @@ public class Site {
...
@@ -200,7 +202,7 @@ public class Site {
}
}
/**
/**
* Get retry times when download fail, 0 by default.<br>
* Get retry times for immediate retry when download fails, 0 by default.<br>
*
*
* @return retry times when download fail
* @return retry times when download fail
*/
*/
...
@@ -218,6 +220,25 @@ public class Site {
...
@@ -218,6 +220,25 @@ public class Site {
return
this
;
return
this
;
}
}
/**
 * Get the cycle retry times of this site, 0 by default. <br>
 * When cycleRetryTimes is more than 0, a request that failed to download is added back to the scheduler
 * and downloaded again.
 *
 * @return cycle retry times when download fails
 */
public int getCycleRetryTimes() {
    return this.cycleRetryTimes;
}
/**
 * Set the cycle retry times used when a download fails, 0 by default. Only works in RedisScheduler. <br>
 *
 * @param cycleRetryTimes cycle retry times to store on this site
 * @return this
 */
public Site setCycleRetryTimes(int cycleRetryTimes) {
    this.cycleRetryTimes = cycleRetryTimes;
    return this;
}
@Override
@Override
public
boolean
equals
(
Object
o
)
{
public
boolean
equals
(
Object
o
)
{
if
(
this
==
o
)
return
true
;
if
(
this
==
o
)
return
true
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
d141541e
...
@@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader {
...
@@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader {
* @param url
* @param url
* @return html
* @return html
*/
*/
public
Html
download
(
String
url
,
String
charset
)
{
public
Html
download
(
String
url
,
String
charset
)
{
Page
page
=
download
(
new
Request
(
url
),
Site
.
me
().
setCharset
(
charset
).
toTask
());
Page
page
=
download
(
new
Request
(
url
),
Site
.
me
().
setCharset
(
charset
).
toTask
());
return
(
Html
)
page
.
getHtml
();
return
(
Html
)
page
.
getHtml
();
}
}
...
@@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
...
@@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
if
(
tried
>
retryTimes
)
{
if
(
tried
>
retryTimes
)
{
logger
.
warn
(
"download page "
+
request
.
getUrl
()
+
" error"
,
e
);
logger
.
warn
(
"download page "
+
request
.
getUrl
()
+
" error"
,
e
);
// Immediate retries are used up. If cycle retry is enabled, return a page whose only
// content is the failed request, so that it is added back as a target request.
if (site.getCycleRetryTimes() > 0) {
    Page page = new Page();
    Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
    // The first cycle retry starts the counter at 1; later ones continue from the stored value.
    int cycleTriedTimes = 1;
    if (cycleTriedTimesObject != null) {
        cycleTriedTimes = ((Integer) cycleTriedTimesObject) + 1;
        if (cycleTriedTimes >= site.getCycleRetryTimes()) {
            // Cycle retry budget is exhausted: give up on this request.
            return null;
        }
    }
    // Store the incremented counter rather than a constant 1. Otherwise the counter never
    // advances and the give-up check above can never be true when cycleRetryTimes > 2.
    page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
    return page;
}
return
null
;
return
null
;
}
}
logger
.
info
(
"download page "
+
request
.
getUrl
()
+
" error, retry the "
+
tried
+
" time!"
);
logger
.
info
(
"download page "
+
request
.
getUrl
()
+
" error, retry the "
+
tried
+
" time!"
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
d141541e
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
...
@@ -15,6 +16,8 @@ import java.util.List;
...
@@ -15,6 +16,8 @@ import java.util.List;
*/
*/
public
class
Html
extends
PlainText
{
public
class
Html
extends
PlainText
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
/**
/**
* Store parsed document for better performance when only one text exist.
* Store parsed document for better performance when only one text exist.
*/
*/
...
@@ -26,7 +29,11 @@ public class Html extends PlainText {
...
@@ -26,7 +29,11 @@ public class Html extends PlainText {
public
Html
(
String
text
)
{
public
Html
(
String
text
)
{
super
(
text
);
super
(
text
);
try
{
this
.
document
=
Jsoup
.
parse
(
text
);
this
.
document
=
Jsoup
.
parse
(
text
);
}
catch
(
Exception
e
)
{
logger
.
warn
(
"parse document error "
,
e
);
}
}
}
public
Html
(
Document
document
)
{
public
Html
(
Document
document
)
{
...
@@ -108,7 +115,7 @@ public class Html extends PlainText {
...
@@ -108,7 +115,7 @@ public class Html extends PlainText {
}
}
public
String
getText
()
{
public
String
getText
()
{
if
(
strings
!=
null
&&
strings
.
size
()>
0
)
{
if
(
strings
!=
null
&&
strings
.
size
()
>
0
)
{
return
strings
.
get
(
0
);
return
strings
.
get
(
0
);
}
}
return
document
.
html
();
return
document
.
html
();
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
View file @
d141541e
...
@@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
...
@@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
Jedis
jedis
=
pool
.
getResource
();
Jedis
jedis
=
pool
.
getResource
();
try
{
try
{
//使用Set进行url去重
// if cycleRetriedTimes is set, allow duplicated.
if
(!
jedis
.
sismember
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
()))
{
Object
cycleRetriedTimes
=
request
.
getExtra
(
Request
.
CYCLE_TRIED_TIMES
);
//使用List保存队列
// use set to remove duplicate url
if
(
cycleRetriedTimes
!=
null
||
!
jedis
.
sismember
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
()))
{
// use list to store queue
jedis
.
rpush
(
QUEUE_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
jedis
.
rpush
(
QUEUE_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
jedis
.
sadd
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
jedis
.
sadd
(
SET_PREFIX
+
task
.
getUUID
(),
request
.
getUrl
());
if
(
request
.
getExtras
()
!=
null
)
{
if
(
request
.
getExtras
()
!=
null
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment