Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
21cae2ff
Commit
21cae2ff
authored
Aug 03, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update package
parent
cfb89904
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
45 additions
and
29 deletions
+45
-29
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+17
-2
AfterExtractor.java
...main/java/us/codecraft/webmagic/model/AfterExtractor.java
+1
-1
ConsolePageModelPipeline.java
...us/codecraft/webmagic/model/ConsolePageModelPipeline.java
+1
-1
ExtractBy.java
.../src/main/java/us/codecraft/webmagic/model/ExtractBy.java
+1
-1
ExtractByUrl.java
...c/main/java/us/codecraft/webmagic/model/ExtractByUrl.java
+1
-1
Extractor.java
.../src/main/java/us/codecraft/webmagic/model/Extractor.java
+1
-1
FieldExtractor.java
...main/java/us/codecraft/webmagic/model/FieldExtractor.java
+1
-1
HelpUrl.java
...re/src/main/java/us/codecraft/webmagic/model/HelpUrl.java
+1
-1
OOSpider.java
...e/src/main/java/us/codecraft/webmagic/model/OOSpider.java
+1
-1
ObjectPageProcessor.java
...java/us/codecraft/webmagic/model/ObjectPageProcessor.java
+1
-1
ObjectPipeline.java
...main/java/us/codecraft/webmagic/model/ObjectPipeline.java
+1
-1
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+1
-1
PageModelPipeline.java
...n/java/us/codecraft/webmagic/model/PageModelPipeline.java
+1
-1
TargetUrl.java
.../src/main/java/us/codecraft/webmagic/model/TargetUrl.java
+1
-1
package.html
...re/src/main/java/us/codecraft/webmagic/model/package.html
+0
-0
OschinaBlog.java
...rc/test/java/us/codecraft/webmagic/model/OschinaBlog.java
+3
-2
TestFetcher.java
...rc/test/java/us/codecraft/webmagic/model/TestFetcher.java
+1
-1
Blog.java
...c/main/java/us/codecraft/webmagic/model/samples/Blog.java
+1
-1
IteyeBlog.java
...n/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
+4
-4
OschinaAnswer.java
...va/us/codecraft/webmagic/model/samples/OschinaAnswer.java
+2
-2
OschinaBlog.java
...java/us/codecraft/webmagic/model/samples/OschinaBlog.java
+4
-4
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
21cae2ff
...
@@ -9,7 +9,7 @@ import java.util.List;
...
@@ -9,7 +9,7 @@ import java.util.List;
/**
/**
* <pre>
* <pre>
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
*
* 主要方法:
* 主要方法:
* {@link #getUrl()} 获取页面的Url
* {@link #getUrl()} 获取页面的Url
...
@@ -19,6 +19,7 @@ import java.util.List;
...
@@ -19,6 +19,7 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
*
*
* </pre>
* </pre>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
*/
*/
public
class
Page
{
public
class
Page
{
...
@@ -36,9 +37,16 @@ public class Page {
...
@@ -36,9 +37,16 @@ public class Page {
public
Page
()
{
public
Page
()
{
}
}
public
Page
setSkip
(
boolean
skip
)
{
resultItems
.
setSkip
(
skip
);
return
this
;
}
/**
/**
* 保存抽取的结果
* 保存抽取的结果
* @param key 结果的key
*
* @param key 结果的key
* @param field 结果的value
* @param field 结果的value
*/
*/
public
void
putField
(
String
key
,
Object
field
)
{
public
void
putField
(
String
key
,
Object
field
)
{
...
@@ -47,6 +55,7 @@ public class Page {
...
@@ -47,6 +55,7 @@ public class Page {
/**
/**
* 获取页面的html内容
* 获取页面的html内容
*
* @return html 页面的html内容
* @return html 页面的html内容
*/
*/
public
Selectable
getHtml
()
{
public
Selectable
getHtml
()
{
...
@@ -63,6 +72,7 @@ public class Page {
...
@@ -63,6 +72,7 @@ public class Page {
/**
/**
* 添加待抓取的链接
* 添加待抓取的链接
*
* @param requests 待抓取的链接
* @param requests 待抓取的链接
*/
*/
public
void
addTargetRequests
(
List
<
String
>
requests
)
{
public
void
addTargetRequests
(
List
<
String
>
requests
)
{
...
@@ -79,6 +89,7 @@ public class Page {
...
@@ -79,6 +89,7 @@ public class Page {
/**
/**
* 添加待抓取的链接
* 添加待抓取的链接
*
* @param requestString 待抓取的链接
* @param requestString 待抓取的链接
*/
*/
public
void
addTargetRequest
(
String
requestString
)
{
public
void
addTargetRequest
(
String
requestString
)
{
...
@@ -93,6 +104,7 @@ public class Page {
...
@@ -93,6 +104,7 @@ public class Page {
/**
/**
* 添加待抓取的页面,在需要传递附加信息时使用
* 添加待抓取的页面,在需要传递附加信息时使用
*
* @param request 待抓取的页面
* @param request 待抓取的页面
*/
*/
public
void
addTargetRequest
(
Request
request
)
{
public
void
addTargetRequest
(
Request
request
)
{
...
@@ -103,6 +115,7 @@ public class Page {
...
@@ -103,6 +115,7 @@ public class Page {
/**
/**
* 获取页面的Url
* 获取页面的Url
*
* @return url 当前页面的url,可用于抽取
* @return url 当前页面的url,可用于抽取
*/
*/
public
Selectable
getUrl
()
{
public
Selectable
getUrl
()
{
...
@@ -111,6 +124,7 @@ public class Page {
...
@@ -111,6 +124,7 @@ public class Page {
/**
/**
* 设置url
* 设置url
*
* @param url
* @param url
*/
*/
public
void
setUrl
(
Selectable
url
)
{
public
void
setUrl
(
Selectable
url
)
{
...
@@ -119,6 +133,7 @@ public class Page {
...
@@ -119,6 +133,7 @@ public class Page {
/**
/**
* 获取抓取请求
* 获取抓取请求
*
* @return request 抓取请求
* @return request 抓取请求
*/
*/
public
Request
getRequest
()
{
public
Request
getRequest
()
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/AfterExtractor.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/AfterExtractor.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/ConsolePageModelPipeline.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/ConsolePageModelPipeline.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.commons.lang3.builder.ToStringBuilder
;
import
org.apache.commons.lang3.builder.ToStringBuilder
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/ExtractBy.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/ExtractBy.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Retention
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/ExtractByUrl.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/ExtractByUrl.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Retention
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/Extractor.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/Extractor.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.Selector
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/FieldExtractor.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/FieldExtractor.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.Selector
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/HelpUrl.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/HelpUrl.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Retention
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/OOSpider.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/OOSpider.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/ObjectPageProcessor.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/ObjectPageProcessor.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/ObjectPipeline.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/ObjectPipeline.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/PageModelExtractor.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/PageModelExtractor.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/PageModelPipeline.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/PageModelPipeline.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/TargetUrl.java
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/TargetUrl.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.ElementType
;
import
java.lang.annotation.Retention
;
import
java.lang.annotation.Retention
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/
oo
/package.html
→
webmagic-core/src/main/java/us/codecraft/webmagic/
model
/package.html
View file @
21cae2ff
File moved
webmagic-core/src/test/java/us/codecraft/webmagic/
oo
/OschinaBlog.java
→
webmagic-core/src/test/java/us/codecraft/webmagic/
model
/OschinaBlog.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
...
@@ -10,7 +10,7 @@ import java.util.List;
...
@@ -10,7 +10,7 @@ import java.util.List;
* @date: 13-8-1 <br>
* @date: 13-8-1 <br>
* Time: 下午10:18 <br>
* Time: 下午10:18 <br>
*/
*/
@TargetUrl
(
value
=
"http://my.oschina.net/flashsword/blog/*"
,
sourceRegion
=
"//div[@class='BlogLinks']//a/@href
"
)
@TargetUrl
(
"http://my.oschina.net/flashsword/blog/*
"
)
public
class
OschinaBlog
implements
AfterExtractor
{
public
class
OschinaBlog
implements
AfterExtractor
{
@ExtractBy
(
"//title"
)
@ExtractBy
(
"//title"
)
...
@@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor {
...
@@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor {
System
.
out
.
println
(
"title:\t"
+
title
);
System
.
out
.
println
(
"title:\t"
+
title
);
System
.
out
.
println
(
"content:\t"
+
content
);
System
.
out
.
println
(
"content:\t"
+
content
);
System
.
out
.
println
(
"tags:\t"
+
tags
);
System
.
out
.
println
(
"tags:\t"
+
tags
);
page
.
setSkip
(
true
);
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/
oo
/TestFetcher.java
→
webmagic-core/src/test/java/us/codecraft/webmagic/
model
/TestFetcher.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
;
package
us
.
codecraft
.
webmagic
.
model
;
import
org.junit.Ignore
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/
oo
/samples/Blog.java
→
webmagic-samples/src/main/java/us/codecraft/webmagic/
model
/samples/Blog.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
.
samples
;
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/
oo
/samples/IteyeBlog.java
→
webmagic-samples/src/main/java/us/codecraft/webmagic/
model
/samples/IteyeBlog.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
.
samples
;
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.
oo
.ExtractBy
;
import
us.codecraft.webmagic.
model
.ExtractBy
;
import
us.codecraft.webmagic.
oo
.OOSpider
;
import
us.codecraft.webmagic.
model
.OOSpider
;
import
us.codecraft.webmagic.
oo
.TargetUrl
;
import
us.codecraft.webmagic.
model
.TargetUrl
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/
oo
/samples/OschinaAnswer.java
→
webmagic-samples/src/main/java/us/codecraft/webmagic/
model
/samples/OschinaAnswer.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
.
samples
;
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.
oo
.*
;
import
us.codecraft.webmagic.
model
.*
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/
oo
/samples/OschinaBlog.java
→
webmagic-samples/src/main/java/us/codecraft/webmagic/
model
/samples/OschinaBlog.java
View file @
21cae2ff
package
us
.
codecraft
.
webmagic
.
oo
.
samples
;
package
us
.
codecraft
.
webmagic
.
model
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.
oo
.ExtractBy
;
import
us.codecraft.webmagic.
model
.ExtractBy
;
import
us.codecraft.webmagic.
oo
.OOSpider
;
import
us.codecraft.webmagic.
model
.OOSpider
;
import
us.codecraft.webmagic.
oo
.TargetUrl
;
import
us.codecraft.webmagic.
model
.TargetUrl
;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment