Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
0ae7adf3
Commit
0ae7adf3
authored
Jun 18, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add cookie support & add docs
parent
8cef8774
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
97 additions
and
23 deletions
+97
-23
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+8
-1
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+29
-1
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+13
-13
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+3
-1
Downloader.java
...ain/java/us/codecraft/webmagic/downloader/Downloader.java
+3
-2
HttpClientPool.java
...java/us/codecraft/webmagic/downloader/HttpClientPool.java
+17
-4
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+23
-0
KaichibaProcessor.java
...java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
0ae7adf3
...
@@ -10,6 +10,7 @@ import java.util.Map;
...
@@ -10,6 +10,7 @@ import java.util.Map;
import
java.util.concurrent.ConcurrentHashMap
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
/**
* Page保存了抓取的结果,并可定义下一次抓取的链接内容。
* Author: code4crafter@gmail.com
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Date: 13-4-21
* Time: 上午11:22
* Time: 上午11:22
...
@@ -65,7 +66,7 @@ public class Page {
...
@@ -65,7 +66,7 @@ public class Page {
}
}
}
}
public
void
addTargetRequest
s
(
String
requestString
)
{
public
void
addTargetRequest
(
String
requestString
)
{
if
(
StringUtils
.
isBlank
(
requestString
)
||
requestString
.
equals
(
"#"
))
{
if
(
StringUtils
.
isBlank
(
requestString
)
||
requestString
.
equals
(
"#"
))
{
return
;
return
;
}
}
...
@@ -75,6 +76,12 @@ public class Page {
...
@@ -75,6 +76,12 @@ public class Page {
}
}
}
}
public
void
addTargetRequest
(
Request
request
)
{
synchronized
(
targetRequests
)
{
targetRequests
.
add
(
request
);
}
}
public
Selectable
getUrl
()
{
public
Selectable
getUrl
()
{
return
url
;
return
url
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
0ae7adf3
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
/**
/**
* Request对象是
* Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
* <pre>
* Example:
* 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
* 在上一个页面:
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request);
* }
* 在下一个页面:
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* Author: code4crafter@gmail.com
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Date: 13-4-21
* Time: 上午11:37
* Time: 上午11:37
...
@@ -12,15 +27,28 @@ public class Request {
...
@@ -12,15 +27,28 @@ public class Request {
private
Object
[]
extra
;
private
Object
[]
extra
;
/**
* 构建一个request对象
* @param url 必须参数,待抓取的url
* @param extra 额外参数,可以保存一些需要的上下文信息
*/
public
Request
(
String
url
,
Object
...
extra
)
{
public
Request
(
String
url
,
Object
...
extra
)
{
this
.
url
=
url
;
this
.
url
=
url
;
this
.
extra
=
extra
;
this
.
extra
=
extra
;
}
}
/**
* 获取预存的对象
* @return object[] 预存的对象数组
*/
public
Object
[]
getExtra
()
{
public
Object
[]
getExtra
()
{
return
extra
;
return
extra
;
}
}
/**
* 获取待抓取的url
* @return url 待抓取的url
*/
public
String
getUrl
()
{
public
String
getUrl
()
{
return
url
;
return
url
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
0ae7adf3
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
java.util.HashSet
;
import
java.util.*
;
import
java.util.Set
;
/**
/**
* Site定义一个待抓取的站点的各种信息。
* Author: code4crafter@gmail.com
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Date: 13-4-21
* Time: 下午12:13
* Time: 下午12:13
...
@@ -14,11 +14,11 @@ public class Site {
...
@@ -14,11 +14,11 @@ public class Site {
private
String
userAgent
;
private
String
userAgent
;
private
String
cookie
;
private
Map
<
String
,
String
>
cookies
=
new
LinkedHashMap
<
String
,
String
>()
;
private
String
encoding
;
private
String
encoding
;
private
String
startUrl
;
private
List
<
String
>
startUrls
;
private
int
sleepTime
=
3000
;
private
int
sleepTime
=
3000
;
...
@@ -34,8 +34,8 @@ public class Site {
...
@@ -34,8 +34,8 @@ public class Site {
return
new
Site
();
return
new
Site
();
}
}
public
Site
setCookie
(
String
cooki
e
)
{
public
Site
setCookie
(
String
name
,
String
valu
e
)
{
this
.
cookie
=
cookie
;
cookies
.
put
(
name
,
value
)
;
return
this
;
return
this
;
}
}
...
@@ -44,8 +44,8 @@ public class Site {
...
@@ -44,8 +44,8 @@ public class Site {
return
this
;
return
this
;
}
}
public
String
getCookie
()
{
public
Map
<
String
,
String
>
getCookies
()
{
return
cookie
;
return
cookie
s
;
}
}
public
String
getUserAgent
()
{
public
String
getUserAgent
()
{
...
@@ -79,12 +79,12 @@ public class Site {
...
@@ -79,12 +79,12 @@ public class Site {
return
this
;
return
this
;
}
}
public
String
getStartUrl
()
{
public
List
<
String
>
getStartUrls
()
{
return
startUrl
;
return
startUrl
s
;
}
}
public
Site
setStartUrl
(
String
startUrl
)
{
public
Site
setStartUrl
(
String
startUrl
)
{
this
.
startUrl
=
startUrl
;
this
.
startUrl
s
.
add
(
startUrl
)
;
return
this
;
return
this
;
}
}
...
@@ -106,8 +106,8 @@ public class Site {
...
@@ -106,8 +106,8 @@ public class Site {
if
(
acceptStatCode
!=
null
?
!
acceptStatCode
.
equals
(
site
.
acceptStatCode
)
:
site
.
acceptStatCode
!=
null
)
if
(
acceptStatCode
!=
null
?
!
acceptStatCode
.
equals
(
site
.
acceptStatCode
)
:
site
.
acceptStatCode
!=
null
)
return
false
;
return
false
;
if
(
cookie
!=
null
?
!
cookie
.
equals
(
site
.
cookie
)
:
site
.
cookie
!=
null
)
return
false
;
if
(!
domain
.
equals
(
site
.
domain
))
return
false
;
if
(!
domain
.
equals
(
site
.
domain
))
return
false
;
if
(!
startUrls
.
equals
(
site
.
startUrls
))
return
false
;
if
(
encoding
!=
null
?
!
encoding
.
equals
(
site
.
encoding
)
:
site
.
encoding
!=
null
)
return
false
;
if
(
encoding
!=
null
?
!
encoding
.
equals
(
site
.
encoding
)
:
site
.
encoding
!=
null
)
return
false
;
if
(
userAgent
!=
null
?
!
userAgent
.
equals
(
site
.
userAgent
)
:
site
.
userAgent
!=
null
)
return
false
;
if
(
userAgent
!=
null
?
!
userAgent
.
equals
(
site
.
userAgent
)
:
site
.
userAgent
!=
null
)
return
false
;
...
@@ -117,8 +117,8 @@ public class Site {
...
@@ -117,8 +117,8 @@ public class Site {
@Override
@Override
public
int
hashCode
()
{
public
int
hashCode
()
{
int
result
=
domain
.
hashCode
();
int
result
=
domain
.
hashCode
();
result
=
31
*
result
+
(
startUrls
!=
null
?
startUrls
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
userAgent
!=
null
?
userAgent
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
userAgent
!=
null
?
userAgent
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
cookie
!=
null
?
cookie
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
encoding
!=
null
?
encoding
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
encoding
!=
null
?
encoding
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
acceptStatCode
!=
null
?
acceptStatCode
.
hashCode
()
:
0
);
result
=
31
*
result
+
(
acceptStatCode
!=
null
?
acceptStatCode
.
hashCode
()
:
0
);
return
result
;
return
result
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
0ae7adf3
...
@@ -36,7 +36,9 @@ public class Spider implements Runnable {
...
@@ -36,7 +36,9 @@ public class Spider implements Runnable {
public
Spider
processor
(
PageProcessor
pageProcessor
)
{
public
Spider
processor
(
PageProcessor
pageProcessor
)
{
this
.
pageProcessor
=
pageProcessor
;
this
.
pageProcessor
=
pageProcessor
;
schedular
.
push
(
new
Request
(
pageProcessor
.
getSite
().
getStartUrl
()),
pageProcessor
.
getSite
());
for
(
String
startUrl
:
pageProcessor
.
getSite
().
getStartUrls
())
{
schedular
.
push
(
new
Request
(
startUrl
),
pageProcessor
.
getSite
());
}
return
this
;
return
this
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
View file @
0ae7adf3
...
@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
...
@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
/**
/**
* Downloader是webmagic
抓取页面的核心
接口。
* Downloader是webmagic
下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个
接口。
* Author: code4crafter@gmail.com
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Date: 13-4-21
* Time: 下午12:14
* Time: 下午12:14
...
@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
...
@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
public
interface
Downloader
{
public
interface
Downloader
{
/**
/**
* 下载页面,并保存信息到Page对象中。
*
*
* @param request
* @param request
* @param site
* @param site
* @return
* @return
*/
*/
public
Page
download
(
Request
request
,
Site
site
);
public
Page
download
(
Request
request
,
Site
site
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
View file @
0ae7adf3
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.HttpVersion
;
import
org.apache.http.HttpVersion
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.params.ClientPNames
;
import
org.apache.http.client.params.ClientPNames
;
import
org.apache.http.client.params.CookiePolicy
;
import
org.apache.http.client.params.CookiePolicy
;
import
org.apache.http.conn.scheme.PlainSocketFactory
;
import
org.apache.http.conn.scheme.PlainSocketFactory
;
import
org.apache.http.conn.scheme.Scheme
;
import
org.apache.http.conn.scheme.Scheme
;
import
org.apache.http.conn.scheme.SchemeRegistry
;
import
org.apache.http.conn.scheme.SchemeRegistry
;
import
org.apache.http.impl.client.BasicCookieStore
;
import
org.apache.http.impl.client.DefaultHttpClient
;
import
org.apache.http.impl.client.DefaultHttpClient
;
import
org.apache.http.impl.conn.PoolingClientConnectionManager
;
import
org.apache.http.impl.conn.PoolingClientConnectionManager
;
import
org.apache.http.impl.cookie.BasicClientCookie
;
import
org.apache.http.params.*
;
import
org.apache.http.params.*
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
java.util.Map
;
/**
/**
* Author: code4crafter@gmail.com
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Date: 13-4-21
...
@@ -50,15 +55,23 @@ public class HttpClientPool {
...
@@ -50,15 +55,23 @@ public class HttpClientPool {
schemeRegistry
.
register
(
new
Scheme
(
"http"
,
80
,
PlainSocketFactory
.
getSocketFactory
()));
schemeRegistry
.
register
(
new
Scheme
(
"http"
,
80
,
PlainSocketFactory
.
getSocketFactory
()));
PoolingClientConnectionManager
connectionManager
=
new
PoolingClientConnectionManager
(
schemeRegistry
);
PoolingClientConnectionManager
connectionManager
=
new
PoolingClientConnectionManager
(
schemeRegistry
);
connectionManager
.
setMaxTotal
(
100
);
connectionManager
.
setMaxTotal
(
poolSize
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
connectionManager
.
setDefaultMaxPerRoute
(
100
);
HttpClient
httpClient
=
new
DefaultHttpClient
(
connectionManager
,
params
);
DefaultHttpClient
httpClient
=
new
DefaultHttpClient
(
connectionManager
,
params
);
generateCookie
(
httpClient
,
site
);
httpClient
.
getParams
().
setIntParameter
(
"http.socket.timeout"
,
60000
);
httpClient
.
getParams
().
setIntParameter
(
"http.socket.timeout"
,
60000
);
httpClient
.
getParams
().
setParameter
(
ClientPNames
.
COOKIE_POLICY
,
CookiePolicy
.
BEST_MATCH
);
httpClient
.
getParams
().
setParameter
(
ClientPNames
.
COOKIE_POLICY
,
CookiePolicy
.
BEST_MATCH
);
return
httpClient
;
return
httpClient
;
}
}
public
void
pushBack
(
HttpClient
httpClient
)
{
private
void
generateCookie
(
DefaultHttpClient
httpClient
,
Site
site
)
{
CookieStore
cookieStore
=
new
BasicCookieStore
();
for
(
Map
.
Entry
<
String
,
String
>
cookieEntry
:
site
.
getCookies
().
entrySet
())
{
BasicClientCookie
cookie
=
new
BasicClientCookie
(
cookieEntry
.
getKey
(),
cookieEntry
.
getValue
());
cookie
.
setDomain
(
site
.
getDomain
());
cookieStore
.
addCookie
(
cookie
);
}
httpClient
.
setCookieStore
(
cookieStore
);
}
}
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
0 → 100644
View file @
0ae7adf3
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.junit.Assert
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
/**
* Author: code4crafter@gmail.com
* Date: 13-6-18
* Time: 上午8:22
*/
public
class
HttpClientDownloaderTest
{
@Test
public
void
testCookie
()
{
Site
site
=
Site
.
me
().
setDomain
(
"www.diandian.com"
).
setCookie
(
"t"
,
"yct7q7e6v319wpg4cpxqduu5m77lcgix"
);
HttpClientDownloader
httpClientDownloader
=
new
HttpClientDownloader
();
Page
download
=
httpClientDownloader
.
download
(
new
Request
(
"http://www.diandian.com"
),
site
);
Assert
.
assertTrue
(
download
.
getHtml
().
toString
().
contains
(
"flashsword30"
));
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
View file @
0ae7adf3
...
@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
...
@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int
i
=
Integer
.
valueOf
(
page
.
getUrl
().
r
(
"shop/(\\d+)"
).
toString
())
+
1
;
int
i
=
Integer
.
valueOf
(
page
.
getUrl
().
r
(
"shop/(\\d+)"
).
toString
())
+
1
;
page
.
addTargetRequest
s
(
"http://kaichiba.com/shop/"
+
i
);
page
.
addTargetRequest
(
"http://kaichiba.com/shop/"
+
i
);
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//Title"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
x
(
"//Title"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
xs
(
"//li[@class=\"foodTitle\"]"
).
rp
(
"^\\s+"
,
""
).
rp
(
"\\s+$"
,
""
).
rp
(
"<span>.*?</span>"
,
""
));
page
.
putField
(
"items"
,
page
.
getHtml
().
xs
(
"//li[@class=\"foodTitle\"]"
).
rp
(
"^\\s+"
,
""
).
rp
(
"\\s+$"
,
""
).
rp
(
"<span>.*?</span>"
,
""
));
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment