Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
90bbe9b9
Commit
90bbe9b9
authored
Aug 17, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
webmagic-core
parent
17f8ead2
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
104 additions
and
75 deletions
+104
-75
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+5
-5
Downloader.java
...ain/java/us/codecraft/webmagic/downloader/Downloader.java
+8
-9
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+6
-5
HttpClientPool.java
...java/us/codecraft/webmagic/downloader/HttpClientPool.java
+1
-2
package.html
...c/main/java/us/codecraft/webmagic/downloader/package.html
+1
-1
ConsolePipeline.java
.../java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+3
-3
FilePipeline.java
...ain/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+6
-10
Pipeline.java
...rc/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
+13
-4
package.html
...src/main/java/us/codecraft/webmagic/pipeline/package.html
+1
-1
PageProcessor.java
...n/java/us/codecraft/webmagic/processor/PageProcessor.java
+16
-6
SimplePageProcessor.java
.../us/codecraft/webmagic/processor/SimplePageProcessor.java
+8
-8
package.html
...rc/main/java/us/codecraft/webmagic/processor/package.html
+1
-1
QueueScheduler.java
.../java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+10
-7
Scheduler.java
.../main/java/us/codecraft/webmagic/scheduler/Scheduler.java
+12
-8
package.html
...rc/main/java/us/codecraft/webmagic/scheduler/package.html
+1
-1
AndSelector.java
...main/java/us/codecraft/webmagic/selector/AndSelector.java
+2
-0
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+3
-3
OrSelector.java
.../main/java/us/codecraft/webmagic/selector/OrSelector.java
+2
-0
Selector.java
...rc/main/java/us/codecraft/webmagic/selector/Selector.java
+4
-0
Page-cmnt.xml
zh_docs/us/codecraft/webmagic/Page-cmnt.xml
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
90bbe9b9
...
@@ -9,13 +9,13 @@ import java.util.List;
...
@@ -9,13 +9,13 @@ import java.util.List;
/**
/**
*
*
* Object storing extracted result and urls to
be crawled
.<br>
* Object storing extracted result and urls to
fetch
.<br>
* Main method: <br>
* Main method: <br>
* {@link #getUrl()} get url of current page <br>
* {@link #getUrl()} get url of current page <br>
* {@link #getHtml()} get content of current page <br>
* {@link #getHtml()} get content of current page <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to
crawl
<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to
fetch
<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @since 0.1.0
...
@@ -71,7 +71,7 @@ public class Page {
...
@@ -71,7 +71,7 @@ public class Page {
}
}
/**
/**
* add urls to
crawl
* add urls to
fetch
*
*
* @param requests
* @param requests
*/
*/
...
@@ -88,7 +88,7 @@ public class Page {
...
@@ -88,7 +88,7 @@ public class Page {
}
}
/**
/**
* add url to
crawl
* add url to
fetch
*
*
* @param requestString
* @param requestString
*/
*/
...
@@ -103,7 +103,7 @@ public class Page {
...
@@ -103,7 +103,7 @@ public class Page {
}
}
/**
/**
* add requests to
crawl
* add requests to
fetch
*
*
* @param request
* @param request
*/
*/
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
View file @
90bbe9b9
...
@@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request;
...
@@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
* Downloader is the part that downloads web pages and stores them in a Page object. <br>
* Downloader has a {@link #setThread(int)} method because the downloader is always the bottleneck of a crawler;
* there are always some mechanisms such as pooling in the downloader, and the pool size is related to the number of threads.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午12:14
*/
*/
public
interface
Downloader
{
public
interface
Downloader
{
/**
/**
*
下载页面,并保存信息到Page对象中。
*
Downloads web pages and stores them in a Page object.
*
*
* @param request
* @param request
* @param task
* @param task
...
@@ -23,10 +24,8 @@ public interface Downloader {
...
@@ -23,10 +24,8 @@ public interface Downloader {
public
Page
download
(
Request
request
,
Task
task
);
public
Page
download
(
Request
request
,
Task
task
);
/**
/**
* 设置线程数,多线程程序一般需要Downloader支持<br>
* Tells the downloader how many threads the spider uses.
* 如果不考虑多线程的可以不实现这个方法<br>
* @param threadNum number of threads
*
* @param thread 线程数量
*/
*/
public
void
setThread
(
int
thread
);
public
void
setThread
(
int
thread
Num
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
90bbe9b9
...
@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
...
@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
import
org.apache.http.Header
;
import
org.apache.http.Header
;
import
org.apache.http.HeaderElement
;
import
org.apache.http.HeaderElement
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.HttpClient
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.client.methods.HttpGet
;
...
@@ -22,12 +23,12 @@ import java.util.Set;
...
@@ -22,12 +23,12 @@ import java.util.Set;
/**
/**
*
封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
*
The http downloader based on HttpClient.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午12:15
*/
*/
@ThreadSafe
public
class
HttpClientDownloader
implements
Downloader
{
public
class
HttpClientDownloader
implements
Downloader
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
...
@@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader {
...
@@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader {
private
int
poolSize
=
1
;
private
int
poolSize
=
1
;
/**
/**
*
直接下载页面的简便方法
*
A simple method to download a url.
*
*
* @param url
* @param url
* @return html
* @return html
*/
*/
public
Html
download
(
String
url
)
{
public
Html
download
(
String
url
)
{
Page
page
=
download
(
new
Request
(
url
),
null
);
Page
page
=
download
(
new
Request
(
url
),
null
);
return
(
Html
)
page
.
getHtml
();
return
(
Html
)
page
.
getHtml
();
}
}
@Override
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
View file @
90bbe9b9
...
@@ -20,8 +20,7 @@ import java.util.Map;
...
@@ -20,8 +20,7 @@ import java.util.Map;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午12:29
*/
*/
public
class
HttpClientPool
{
public
class
HttpClientPool
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html
View file @
90bbe9b9
<html>
<html>
<body>
<body>
包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。
Downloader is the part that downloads web pages and stores them in a Page object.
</body>
</body>
</html>
</html>
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
View file @
90bbe9b9
...
@@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task;
...
@@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
* 命令行输出抽取结果。可用于测试。<br>
* Write results in console.<br>
* Usually used in test.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午1:45
*/
*/
public
class
ConsolePipeline
implements
Pipeline
{
public
class
ConsolePipeline
implements
Pipeline
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
View file @
90bbe9b9
package
us
.
codecraft
.
webmagic
.
pipeline
;
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.commons.codec.digest.DigestUtils
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
...
@@ -12,28 +13,23 @@ import java.io.PrintWriter;
...
@@ -12,28 +13,23 @@ import java.io.PrintWriter;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
*
持久化到文件的接口。
*
Store results in files.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午6:28
*/
*/
public
class
FilePipeline
extends
FilePersistentBase
implements
Pipeline
{
@ThreadSafe
public
class
FilePipeline
extends
FilePersistentBase
implements
Pipeline
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
/**
/**
*
新建一个FilePipeline,使用默认保存路径
"/data/webmagic/"
*
create a FilePipeline with default path
"/data/webmagic/"
*/
*/
public
FilePipeline
()
{
public
FilePipeline
()
{
setPath
(
"/data/webmagic/"
);
setPath
(
"/data/webmagic/"
);
}
}
/**
* 新建一个FilePipeline
*
* @param path 文件保存路径
*/
public
FilePipeline
(
String
path
)
{
public
FilePipeline
(
String
path
)
{
setPath
(
path
);
setPath
(
path
);
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
View file @
90bbe9b9
...
@@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems;
...
@@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
* Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
* Pipeline is the part of the crawler responsible for persistence and offline processing.<br>
* Implement the Pipeline interface to customize how results are persisted.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午1:39
* @see ConsolePipeline
* @see FilePipeline
*/
*/
public
interface
Pipeline
{
public
interface
Pipeline
{
public
void
process
(
ResultItems
resultItems
,
Task
task
);
/**
* Process extracted results.
*
* @param resultItems
* @param task
*/
public
void
process
(
ResultItems
resultItems
,
Task
task
);
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html
View file @
90bbe9b9
<html>
<html>
<body>
<body>
包含了处理页面抽取结果的接口Pipeline和它的几个实现类。
Pipeline is the part of the crawler responsible for persistence and offline processing.
</body>
</body>
</html>
</html>
webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
View file @
90bbe9b9
...
@@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page;
...
@@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
/**
/**
* 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
* Interface to be implemented to customize a crawler.<br>
* Implement this interface to build various spiders.<br>
* <br>
* In PageProcessor, you can customize:
* <p/>
* start urls and other settings in {@link Site}<br>
* how the urls to fetch are detected <br>
* how the data are extracted and stored <br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @see Site
* Time: 上午11:42
* @see Page
* @since 0.1.0
*/
*/
public
interface
PageProcessor
{
public
interface
PageProcessor
{
/**
/**
* 定义如何处理页面,包括链接提取、内容抽取等。
* process the page, extract urls to fetch, extract the data and store
*
* @param page
* @param page
*/
*/
public
void
process
(
Page
page
);
public
void
process
(
Page
page
);
/**
/**
* 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
* get the site settings
*
* @return site
* @return site
* @see Site
*/
*/
public
Site
getSite
();
public
Site
getSite
();
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
View file @
90bbe9b9
...
@@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
...
@@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
import
java.util.List
;
import
java.util.List
;
/**
/**
* 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
* A simple PageProcessor.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-22
* @since 0.1.0
* Time: 下午9:15
*/
*/
public
class
SimplePageProcessor
implements
PageProcessor
{
public
class
SimplePageProcessor
implements
PageProcessor
{
...
@@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor {
...
@@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor {
this
.
site
=
Site
.
me
().
addStartUrl
(
startUrl
).
this
.
site
=
Site
.
me
().
addStartUrl
(
startUrl
).
setDomain
(
UrlUtils
.
getDomain
(
startUrl
));
setDomain
(
UrlUtils
.
getDomain
(
startUrl
));
//compile "*" expression to regex
//compile "*" expression to regex
this
.
urlPattern
=
"("
+
urlPattern
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
;
this
.
urlPattern
=
"("
+
urlPattern
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)
+
")"
;
}
}
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
urlPattern
).
all
();
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
urlPattern
).
all
();
//
调用page.addTargetRequests()方法添加待抓取链接
//
add urls to fetch
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
//
xpath方式抽取
//
extract by XPath
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//title"
));
//sc表示使用Readability技术抽取正文
page
.
putField
(
"html"
,
page
.
getHtml
().
toString
());
page
.
putField
(
"html"
,
page
.
getHtml
().
toString
());
//extract by Readability
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
}
}
@Override
@Override
public
Site
getSite
()
{
public
Site
getSite
()
{
//
定义抽取站点的相关参数
//
settings
return
site
;
return
site
;
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html
View file @
90bbe9b9
<html>
<html>
<body>
<body>
包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。
PageProcessor is the customizable part of a crawler for a specific site.
</body>
</body>
</html>
</html>
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
View file @
90bbe9b9
package
us
.
codecraft
.
webmagic
.
scheduler
;
package
us
.
codecraft
.
webmagic
.
scheduler
;
import
org.apache.http.annotation.ThreadSafe
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
...
@@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue;
...
@@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue;
import
java.util.concurrent.LinkedBlockingQueue
;
import
java.util.concurrent.LinkedBlockingQueue
;
/**
/**
* 内存队列实现的线程安全Scheduler。<br>
* Basic Scheduler implementation.<br>
* Stores urls to fetch in a LinkedBlockingQueue and removes duplicate urls using a HashSet.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午1:13
*/
*/
@ThreadSafe
public
class
QueueScheduler
implements
Scheduler
{
public
class
QueueScheduler
implements
Scheduler
{
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
...
@@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler {
...
@@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler {
private
Set
<
String
>
urls
=
new
HashSet
<
String
>();
private
Set
<
String
>
urls
=
new
HashSet
<
String
>();
@Override
@Override
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
public
synchronized
void
push
(
Request
request
,
Task
task
)
{
if
(
logger
.
isDebugEnabled
()){
if
(
logger
.
isDebugEnabled
())
{
logger
.
debug
(
"push to queue "
+
request
.
getUrl
());
logger
.
debug
(
"push to queue "
+
request
.
getUrl
());
}
}
if
(
urls
.
add
(
request
.
getUrl
())){
if
(
urls
.
add
(
request
.
getUrl
()))
{
queue
.
add
(
request
);
queue
.
add
(
request
);
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java
View file @
90bbe9b9
...
@@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request;
...
@@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
* 包含url管理和调度的接口。包括url抓取队列,url去重等功能。<br>
* Scheduler is the part of url management.<br>
* Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。<br>
* You can implement interface Scheduler to do:
* manage urls to fetch
* remove duplicate urls
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 下午1:12
*/
*/
public
interface
Scheduler
{
public
interface
Scheduler
{
/**
/**
* 加入一个待抓取的链接
* add a url to fetch
* @param request 待抓取的链接
*
* @param task 定义的任务,以满足单Scheduler多Task的情况
* @param request
* @param task
*/
*/
public
void
push
(
Request
request
,
Task
task
);
public
void
push
(
Request
request
,
Task
task
);
/**
/**
* 返回下一个要抓取的链接
* 返回下一个要抓取的链接
*
* @param task 定义的任务,以满足单Scheduler多Task的情况
* @param task 定义的任务,以满足单Scheduler多Task的情况
* @return 下一个要抓取的链接
* @return 下一个要抓取的链接
*/
*/
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html
View file @
90bbe9b9
<html>
<html>
<body>
<body>
包含url管理和调度的接口Scheduler及它的几个实现类。
Scheduler is the part of url management.
</body>
</body>
</html>
</html>
webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
View file @
90bbe9b9
...
@@ -4,6 +4,8 @@ import java.util.ArrayList;
...
@@ -4,6 +4,8 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* All selectors will be arranged as a pipeline. <br>
* The next selector uses the result of the previous as source.
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.0
* @since 0.2.0
*/
*/
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
90bbe9b9
...
@@ -10,10 +10,10 @@ import java.util.ArrayList;
...
@@ -10,10 +10,10 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* css风格的选择器。包装了Jsoup。<br>
* CSS selector. Based on Jsoup.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* @since 0.1.0
* Time: 上午9:39
*/
*/
public
class
CssSelector
implements
Selector
{
public
class
CssSelector
implements
Selector
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
View file @
90bbe9b9
...
@@ -4,6 +4,8 @@ import java.util.ArrayList;
...
@@ -4,6 +4,8 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* All extractors will do extracting separately, <br>
* and the results of the extractors will be combined as the final result.
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.0
* @since 0.2.0
*/
*/
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
View file @
90bbe9b9
...
@@ -4,13 +4,16 @@ import java.util.List;
...
@@ -4,13 +4,16 @@ import java.util.List;
/**
/**
* Selector(extractor) for text.<br>
* Selector(extractor) for text.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
*/
public
interface
Selector
{
public
interface
Selector
{
/**
/**
* Extract single result in text.<br>
* Extract single result in text.<br>
* If there is more than one result, only the first will be chosen.
* If there is more than one result, only the first will be chosen.
*
* @param text
* @param text
* @return result
* @return result
*/
*/
...
@@ -18,6 +21,7 @@ public interface Selector {
...
@@ -18,6 +21,7 @@ public interface Selector {
/**
/**
* Extract all results in text.<br>
* Extract all results in text.<br>
*
* @param text
* @param text
* @return results
* @return results
*/
*/
...
...
zh_docs/us/codecraft/webmagic/Page-cmnt.xml
View file @
90bbe9b9
...
@@ -24,7 +24,7 @@
...
@@ -24,7 +24,7 @@
{@link #getHtml()} get content of current page
{@link #getHtml()} get content of current page
{@link #putField(String, Object)} save extracted result
{@link #putField(String, Object)} save extracted result
{@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
{@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to
crawl
{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to
fetch
</pre>
</pre>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment