Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
fd6d2fd6
Commit
fd6d2fd6
authored
Nov 06, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
try to keepalive TCP connection
parent
425df085
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
45 additions
and
18 deletions
+45
-18
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+2
-2
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+4
-1
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+5
-5
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+9
-2
BaiduBaikePageProcesser.java
...t/webmagic/processor/example/BaiduBaikePageProcesser.java
+6
-2
log4j.xml
webmagic-core/src/main/resources/log4j.xml
+5
-0
BaiduBaike.java
...c/main/java/us/codecraft/webmagic/example/BaiduBaike.java
+13
-5
OschinaBlog.java
.../main/java/us/codecraft/webmagic/example/OschinaBlog.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
fd6d2fd6
...
...
@@ -27,13 +27,13 @@ public class Site {
*/
private
List
<
Request
>
startRequests
=
new
ArrayList
<
Request
>();
private
int
sleepTime
=
3
000
;
private
int
sleepTime
=
5
000
;
private
int
retryTimes
=
0
;
private
int
cycleRetryTimes
=
0
;
private
int
timeOut
=
2
000
;
private
int
timeOut
=
5
000
;
private
static
final
Set
<
Integer
>
DEFAULT_STATUS_CODE_SET
=
new
HashSet
<
Integer
>();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
fd6d2fd6
...
...
@@ -428,7 +428,10 @@ public class Spider implements Runnable, Task {
public
<
T
>
List
<
T
>
getAll
(
Collection
<
String
>
urls
)
{
destroyWhenExit
=
false
;
spawnUrl
=
false
;
startRequests
=
UrlUtils
.
convertToRequests
(
urls
);
startRequests
.
clear
();
for
(
Request
request
:
UrlUtils
.
convertToRequests
(
urls
))
{
addRequest
(
request
);
}
CollectorPipeline
collectorPipeline
=
getCollectorPipeline
();
pipelines
.
add
(
collectorPipeline
);
run
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
fd6d2fd6
...
...
@@ -37,7 +37,7 @@ public class HttpClientDownloader implements Downloader {
private
final
Map
<
String
,
CloseableHttpClient
>
httpClients
=
new
HashMap
<
String
,
CloseableHttpClient
>();
private
int
poolSize
=
1
;
private
HttpClientGenerator
httpClientGenerator
=
new
HttpClientGenerator
()
;
/**
* A simple method to download a url.
...
...
@@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader {
private
CloseableHttpClient
getHttpClient
(
Site
site
)
{
if
(
site
==
null
)
{
return
new
HttpClientGenerator
(
poolSize
)
.
getClient
(
null
);
return
httpClientGenerator
.
getClient
(
null
);
}
String
domain
=
site
.
getDomain
();
CloseableHttpClient
httpClient
=
httpClients
.
get
(
domain
);
if
(
httpClient
==
null
)
{
synchronized
(
this
)
{
if
(
httpClient
==
null
)
{
httpClient
=
new
HttpClientGenerator
(
poolSize
)
.
getClient
(
site
);
httpClient
=
httpClientGenerator
.
getClient
(
site
);
httpClients
.
put
(
domain
,
httpClient
);
}
}
...
...
@@ -105,7 +105,7 @@ public class HttpClientDownloader implements Downloader {
.
setConnectionRequestTimeout
(
site
.
getTimeOut
())
.
setConnectTimeout
(
site
.
getTimeOut
())
.
setCookieSpec
(
CookieSpecs
.
BEST_MATCH
);
if
(
site
.
getHttpProxy
()
!=
null
)
{
if
(
site
!=
null
&&
site
.
getHttpProxy
()
!=
null
)
{
requestConfigBuilder
.
setProxy
(
site
.
getHttpProxy
());
}
requestBuilder
.
setConfig
(
requestConfigBuilder
.
build
());
...
...
@@ -168,6 +168,6 @@ public class HttpClientDownloader implements Downloader {
@Override
public
void
setThread
(
int
thread
)
{
poolSize
=
thread
;
httpClientGenerator
.
setPoolSize
(
thread
)
;
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
fd6d2fd6
...
...
@@ -5,6 +5,7 @@ import org.apache.http.client.CookieStore;
import
org.apache.http.client.protocol.ResponseContentEncoding
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.config.SocketConfig
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
import
org.apache.http.conn.socket.PlainConnectionSocketFactory
;
import
org.apache.http.conn.ssl.SSLConnectionSocketFactory
;
...
...
@@ -25,16 +26,20 @@ public class HttpClientGenerator {
private
PoolingHttpClientConnectionManager
connectionManager
;
public
HttpClientGenerator
(
int
poolSize
)
{
public
HttpClientGenerator
()
{
Registry
<
ConnectionSocketFactory
>
reg
=
RegistryBuilder
.<
ConnectionSocketFactory
>
create
()
.
register
(
"http"
,
PlainConnectionSocketFactory
.
INSTANCE
)
.
register
(
"https"
,
SSLConnectionSocketFactory
.
getSocketFactory
())
.
build
();
connectionManager
=
new
PoolingHttpClientConnectionManager
(
reg
);
connectionManager
.
setMaxTotal
(
poolSize
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
}
public
HttpClientGenerator
setPoolSize
(
int
poolSize
){
connectionManager
.
setMaxTotal
(
poolSize
);
return
this
;
}
public
CloseableHttpClient
getClient
(
Site
site
)
{
return
generateClient
(
site
);
}
...
...
@@ -59,6 +64,8 @@ public class HttpClientGenerator {
}
});
}
SocketConfig
socketConfig
=
SocketConfig
.
custom
().
setSoKeepAlive
(
true
).
setTcpNoDelay
(
true
).
build
();
httpClientBuilder
.
setDefaultSocketConfig
(
socketConfig
);
// Http client has some problem handling compressing entity for redirect
// So I disable it and do it manually
// https://issues.apache.org/jira/browse/HTTPCLIENT-1432
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java
View file @
fd6d2fd6
...
...
@@ -30,10 +30,14 @@ public class BaiduBaikePageProcesser implements PageProcessor {
}
public
static
void
main
(
String
[]
args
)
{
//single download
Spider
spider
=
Spider
.
create
(
new
BaiduBaikePageProcesser
()).
thread
(
2
);
List
<
String
>
list
=
new
ArrayList
<
String
>();
String
urlTemplate
=
"http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"
;
list
.
add
(
String
.
format
(
urlTemplate
,
"水力发电"
));
ResultItems
resultItems
=
spider
.<
ResultItems
>
get
(
String
.
format
(
urlTemplate
,
"水力发电"
));
System
.
out
.
println
(
resultItems
);
//multidownload
List
<
String
>
list
=
new
ArrayList
<
String
>();
list
.
add
(
String
.
format
(
urlTemplate
,
"风力发电"
));
list
.
add
(
String
.
format
(
urlTemplate
,
"太阳能"
));
list
.
add
(
String
.
format
(
urlTemplate
,
"地热发电"
));
...
...
webmagic-core/src/main/resources/log4j.xml
View file @
fd6d2fd6
...
...
@@ -13,6 +13,11 @@
<appender-ref
ref=
"stdout"
/>
</logger>
<logger
name=
"org.apache"
additivity=
"false"
>
<level
value=
"warn"
/>
<appender-ref
ref=
"stdout"
/>
</logger>
<logger
name=
"net.sf.ehcache"
additivity=
"false"
>
<level
value=
"warn"
/>
<appender-ref
ref=
"stdout"
/>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
View file @
fd6d2fd6
...
...
@@ -28,14 +28,22 @@ public class BaiduBaike{
}
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
list
=
new
ArrayList
<
String
>();
OOSpider
ooSpider
=
OOSpider
.
create
(
Site
.
me
().
setSleepTime
(
0
),
BaiduBaike
.
class
);
//single download
String
urlTemplate
=
"http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"
;
list
.
add
(
String
.
format
(
urlTemplate
,
"水力发电"
));
BaiduBaike
baike
=
ooSpider
.<
BaiduBaike
>
get
(
"http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"
);
System
.
out
.
println
(
baike
);
//multidownload
List
<
String
>
list
=
new
ArrayList
<
String
>();
list
.
add
(
String
.
format
(
urlTemplate
,
"风力发电"
));
list
.
add
(
String
.
format
(
urlTemplate
,
"太阳能"
));
list
.
add
(
String
.
format
(
urlTemplate
,
"地热发电"
));
list
.
add
(
String
.
format
(
urlTemplate
,
"地热发电"
));
List
<
BaiduBaike
>
baiduBaikes
=
OOSpider
.
create
(
Site
.
me
().
setSleepTime
(
100
),
BaiduBaike
.
class
).<
BaiduBaike
>
getAll
(
list
);
System
.
out
.
println
(
baiduBaikes
);
list
.
add
(
String
.
format
(
urlTemplate
,
"地热发电"
));
List
<
BaiduBaike
>
resultItemses
=
ooSpider
.<
BaiduBaike
>
getAll
(
list
);
for
(
BaiduBaike
resultItemse
:
resultItemses
)
{
System
.
out
.
println
(
resultItemse
);
}
ooSpider
.
close
();
}
}
webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
View file @
fd6d2fd6
...
...
@@ -31,7 +31,7 @@ public class OschinaBlog {
private
Date
date
;
public
static
void
main
(
String
[]
args
)
{
OOSpider
.
create
(
Site
.
me
()
OOSpider
.
create
(
Site
.
me
()
.
setSleepTime
(
0
)
,
new
JsonFilePageModelPipeline
(
"/data/webmagic/"
),
OschinaBlog
.
class
)
.
addUrl
(
"http://my.oschina.net/flashsword/blog"
).
run
();
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment