Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
474b7c9d
Commit
474b7c9d
authored
Mar 20, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor
parent
25c81013
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
94 deletions
+15
-94
BannedChecker.java
.../main/java/us/codecraft/webmagic/proxy/BannedChecker.java
+13
-0
ProxyPool.java
.../src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+1
-1
TimerReuseProxyPool.java
...java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
+1
-93
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java
0 → 100644
View file @
474b7c9d
package
us
.
codecraft
.
webmagic
.
proxy
;
import
org.apache.http.HttpResponse
;
/**
 * Callback that classifies an HTTP response as "banned" or not.
 *
 * <p>NOTE(review): what counts as a ban is left entirely to implementations; presumably the
 * result feeds the {@code banned} flag of {@code ProxyPool#returnProxy} - confirm against callers.
 *
 * @author code4crafter@gmail.com
 * Date: 17/3/20
 * Time: 10:52 PM
 */
public interface BannedChecker {

    /**
     * Inspects a response and reports whether it should be treated as banned.
     *
     * @param httpResponse the response to inspect
     * @return {@code true} if the response is considered banned, {@code false} otherwise
     */
    boolean isBanned(HttpResponse httpResponse);
}
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
View file @
474b7c9d
...
@@ -7,7 +7,7 @@ import us.codecraft.webmagic.Task;
...
@@ -7,7 +7,7 @@ import us.codecraft.webmagic.Task;
*/
*/
public
interface
ProxyPool
{
public
interface
ProxyPool
{
void
returnProxy
(
Proxy
proxy
,
int
statusCode
,
Task
task
);
void
returnProxy
(
Proxy
proxy
,
boolean
banned
,
Task
task
);
Proxy
getProxy
(
Task
task
);
Proxy
getProxy
(
Task
task
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
View file @
474b7c9d
...
@@ -34,102 +34,11 @@ public class TimerReuseProxyPool implements ProxyPool {
...
@@ -34,102 +34,11 @@ public class TimerReuseProxyPool implements ProxyPool {
private
boolean
isEnable
=
false
;
private
boolean
isEnable
=
false
;
private
boolean
validateWhenInit
=
false
;
private
boolean
validateWhenInit
=
false
;
// private boolean isUseLastProxy = true;
// private boolean isUseLastProxy = true;
/** Location of the serialized proxy map; this initial value is replaced by setFilePath(). */
private String proxyFilePath = "/data/webmagic/lastUse.proxy";

/** Helper used to resolve {@link #proxyFilePath} to a {@link File}. */
private FilePersistentBase fBase = new FilePersistentBase();

/** Timer that drives the periodic save; created with {@code isDaemon = true}. */
private Timer timer = new Timer(true);

/** Periodic job: persists the proxy list, then logs the status of every proxy. */
private TimerTask saveProxyTask = new TimerTask() {

    @Override
    public void run() {
        saveProxyList();
        logger.info(allProxyStatus());
    }
};
/**
 * Creates a pool without an initial proxy list.
 *
 * <p>Delegates to the two-argument constructor with a {@code null} list and {@code true} as the
 * second argument.
 */
public TimerReuseProxyPool() {
    this(null, true);
}
public
TimerReuseProxyPool
(
List
<
String
[]>
httpProxyList
)
{
public
TimerReuseProxyPool
(
List
<
String
[]>
httpProxyList
)
{
this
(
httpProxyList
,
true
);
this
(
httpProxyList
,
true
);
}
}
/**
 * Creates a pool from an optional list of proxies and, on request, from the persisted list.
 *
 * @param httpProxyList  proxies to register; {@code null} registers none
 * @param isUseLastProxy when {@code true}, the persisted proxy list is read back and
 *                       {@code saveProxyTask} is scheduled to run right away and then
 *                       repeatedly with period {@code saveProxyInterval}
 */
public TimerReuseProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
    if (httpProxyList != null) {
        String[][] proxyArray = new String[httpProxyList.size()][];
        addProxy(httpProxyList.toArray(proxyArray));
    }
    if (!isUseLastProxy) {
        return;
    }

    // Relocate the persistence file only when the currently configured one is missing.
    File lastUseFile = new File(proxyFilePath);
    if (!lastUseFile.exists()) {
        setFilePath();
    }
    readProxyList();
    timer.schedule(saveProxyTask, 0, saveProxyInterval);
}
/**
 * Redirects the persistence file to {@code <java.io.tmpdir>/webmagic/lastUse.proxy} and creates
 * that file when it does not exist yet.
 *
 * <p>If the {@code java.io.tmpdir} property is unset or does not name a directory, an error is
 * logged and {@code proxyFilePath} is left untouched. Previously the path was assembled before
 * the check and stored regardless, so an unset property produced the relative path
 * {@code null/webmagic/lastUse.proxy} (string concatenation renders {@code null} as "null").
 */
private void setFilePath() {
    String tmpDir = System.getProperty("java.io.tmpdir");
    if (tmpDir == null || !new File(tmpDir).isDirectory()) {
        logger.error("java tmp dir not exists");
        return;
    }

    String dir = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic";
    String path = dir + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";

    fBase.setPath(dir);
    File f = fBase.getFile(path);
    if (!f.exists()) {
        try {
            f.createNewFile();
        } catch (IOException e) {
            // Best effort: the path is still recorded below, as before.
            logger.error("proxy file create error", e);
        }
    }
    this.proxyFilePath = path;
}
/**
 * Serializes the map returned by {@code prepareForSaving()} to {@code proxyFilePath}.
 *
 * <p>Does nothing when {@code allProxy} is empty. I/O failures are logged, not propagated.
 * The underlying file stream is closed on every path, including when the
 * {@link ObjectOutputStream} constructor or {@code writeObject} throws.
 */
private void saveProxyList() {
    if (allProxy.size() == 0) {
        return;
    }
    FileOutputStream fileOut = null;
    try {
        fileOut = new FileOutputStream(fBase.getFile(proxyFilePath));
        ObjectOutputStream os = new ObjectOutputStream(fileOut);
        os.writeObject(prepareForSaving());
        // Flushes buffered object data and closes fileOut; the finally block is then a no-op.
        os.close();
        logger.info("save proxy");
    } catch (FileNotFoundException e) {
        logger.error("proxy file not found", e);
    } catch (IOException e) {
        logger.error("proxy file save error", e);
    } finally {
        if (fileOut != null) {
            try {
                fileOut.close();
            } catch (IOException e) {
                logger.error("proxy file close error", e);
            }
        }
    }
}
/**
 * Builds the map that gets serialized by {@code saveProxyList()}.
 *
 * <p>The result is a new {@link HashMap} with the same keys and the same proxy instances as
 * {@code allProxy}. The local map is typed {@code Map<String, Proxy>} to match the declared
 * return type; a {@code Map<String, TimerReuseProxy>} cannot be returned in its place because
 * generic types are invariant.
 *
 * <p>NOTE(review): {@code setFailedNum(0)} is applied to the live instances held in
 * {@code allProxy}, not to copies, so every save also resets the in-memory failure counters.
 * Confirm whether that is intended.
 *
 * @return a fresh map from proxy key to proxy, each with its failed count set to 0
 */
private Map<String, Proxy> prepareForSaving() {
    // Assumes TimerReuseProxy is a subtype of Proxy, as the declared return type implies.
    Map<String, Proxy> tmp = new HashMap<String, Proxy>();
    for (Entry<String, TimerReuseProxy> e : allProxy.entrySet()) {
        TimerReuseProxy p = e.getValue();
        p.setFailedNum(0);
        tmp.put(e.getKey(), p);
    }
    return tmp;
}
/**
 * Reads the serialized proxy map from {@code proxyFilePath} and registers it via
 * {@code addProxy(Map)}.
 *
 * <p>Loading stays best effort: every failure is logged and the method returns normally.
 * The file stream is closed on every path, including when the {@link ObjectInputStream}
 * constructor throws (for example on an empty file).
 *
 * <p>NOTE(review): this uses Java native deserialization on a file that may live under the
 * shared temp directory. Consider an {@code ObjectInputFilter} or a plain-text format if that
 * location cannot be trusted.
 */
private void readProxyList() {
    FileInputStream fileIn = null;
    try {
        fileIn = new FileInputStream(fBase.getFile(proxyFilePath));
        ObjectInputStream is = new ObjectInputStream(fileIn);
        // The file is only ever written by saveProxyList(), which stores a Map<String, Proxy>.
        @SuppressWarnings("unchecked")
        Map<String, Proxy> saved = (Map<String, Proxy>) is.readObject();
        addProxy(saved);
        is.close();
    } catch (FileNotFoundException e) {
        logger.info("last use proxy file not found", e);
    } catch (IOException e) {
        logger.info("last use proxy file read error", e);
    } catch (ClassNotFoundException e) {
        logger.info("last use proxy file class not found", e);
    } finally {
        if (fileIn != null) {
            try {
                fileIn.close();
            } catch (IOException e) {
                logger.info("last use proxy file close error", e);
            }
        }
    }
}
private
void
addProxy
(
Map
<
String
,
Proxy
>
httpProxyMap
)
{
private
void
addProxy
(
Map
<
String
,
Proxy
>
httpProxyMap
)
{
isEnable
=
true
;
isEnable
=
true
;
for
(
Entry
<
String
,
Proxy
>
entry
:
httpProxyMap
.
entrySet
())
{
for
(
Entry
<
String
,
Proxy
>
entry
:
httpProxyMap
.
entrySet
())
{
...
@@ -205,7 +114,6 @@ public class TimerReuseProxyPool implements ProxyPool {
...
@@ -205,7 +114,6 @@ public class TimerReuseProxyPool implements ProxyPool {
case
TimerReuseProxy
.
ERROR_BANNED
:
case
TimerReuseProxy
.
ERROR_BANNED
:
p
.
fail
(
TimerReuseProxy
.
ERROR_BANNED
);
p
.
fail
(
TimerReuseProxy
.
ERROR_BANNED
);
p
.
setReuseTimeInterval
(
10
*
60
*
1000
*
p
.
getFailedNum
());
p
.
setReuseTimeInterval
(
10
*
60
*
1000
*
p
.
getFailedNum
());
logger
.
warn
(
"this proxy is banned >>>> "
+
p
.
getHttpHost
());
logger
.
info
(
proxy
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
logger
.
info
(
proxy
+
" >>>> reuseTimeInterval is >>>> "
+
p
.
getReuseTimeInterval
()
/
1000.0
);
break
;
break
;
case
TimerReuseProxy
.
ERROR_404
:
case
TimerReuseProxy
.
ERROR_404
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment