Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
a994b1c9
Commit
a994b1c9
authored
Aug 17, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
complete extension comments in en
parent
c59c1fe8
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
38 additions
and
33 deletions
+38
-33
MultiPageModel.java
...n/src/main/java/us/codecraft/webmagic/MultiPageModel.java
+1
-0
AfterExtractor.java
...main/java/us/codecraft/webmagic/model/AfterExtractor.java
+2
-1
ConsolePageModelPipeline.java
...us/codecraft/webmagic/model/ConsolePageModelPipeline.java
+3
-2
Extractor.java
.../src/main/java/us/codecraft/webmagic/model/Extractor.java
+2
-2
FieldExtractor.java
...main/java/us/codecraft/webmagic/model/FieldExtractor.java
+5
-5
HasKey.java
...ion/src/main/java/us/codecraft/webmagic/model/HasKey.java
+7
-7
ModelPageProcessor.java
.../java/us/codecraft/webmagic/model/ModelPageProcessor.java
+3
-3
ModelPipeline.java
.../main/java/us/codecraft/webmagic/model/ModelPipeline.java
+3
-4
OOSpider.java
...n/src/main/java/us/codecraft/webmagic/model/OOSpider.java
+1
-1
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+2
-3
PageModelPipeline.java
...n/java/us/codecraft/webmagic/model/PageModelPipeline.java
+3
-2
package.html
.../java/us/codecraft/webmagic/model/annotation/package.html
+1
-1
JsonPathSelector.java
...java/us/codecraft/webmagic/selector/JsonPathSelector.java
+5
-2
No files found.
webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java
View file @
a994b1c9
...
@@ -8,6 +8,7 @@ import java.util.Collection;
...
@@ -8,6 +8,7 @@ import java.util.Collection;
* Extract an object that spans more than one page, such as news and articles.<br>
* Extract an object that spans more than one page, such as news and articles.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
*/
@Experimental
@Experimental
public
interface
MultiPageModel
{
public
interface
MultiPageModel
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
View file @
a994b1c9
...
@@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
...
@@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
/**
/**
*
实现这个接口即可在抽取后进行后处理
。<br>
*
Interface to be implemented by page models that need to do something after fields are extracted
.<br>
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
*/
public
interface
AfterExtractor
{
public
interface
AfterExtractor
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java
View file @
a994b1c9
...
@@ -4,9 +4,10 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
...
@@ -4,9 +4,10 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
* Print page model in console.<br>
* Usually used in test.<br>
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* @since 0.2.0
* Time: 下午3:41 <br>
*/
*/
public
class
ConsolePageModelPipeline
implements
PageModelPipeline
{
public
class
ConsolePageModelPipeline
implements
PageModelPipeline
{
@Override
@Override
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
View file @
a994b1c9
...
@@ -3,9 +3,9 @@ package us.codecraft.webmagic.model;
...
@@ -3,9 +3,9 @@ package us.codecraft.webmagic.model;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.Selector
;
/**
/**
* The object contains 'ExtractBy' information.
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* @since 0.2.0
* Time: 下午9:48 <br>
*/
*/
class
Extractor
{
class
Extractor
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
View file @
a994b1c9
...
@@ -6,18 +6,18 @@ import java.lang.reflect.Field;
...
@@ -6,18 +6,18 @@ import java.lang.reflect.Field;
import
java.lang.reflect.Method
;
import
java.lang.reflect.Method
;
/**
/**
* Wrapper of field and extractor.
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* @since 0.2.0
* Time: 下午9:48 <br>
*/
*/
class
FieldExtractor
extends
Extractor
{
class
FieldExtractor
extends
Extractor
{
private
final
Field
field
;
private
final
Field
field
;
private
Method
setterMethod
;
private
Method
setterMethod
;
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
,
boolean
multi
)
{
public
FieldExtractor
(
Field
field
,
Selector
selector
,
Source
source
,
boolean
notNull
,
boolean
multi
)
{
super
(
selector
,
source
,
notNull
,
multi
);
super
(
selector
,
source
,
notNull
,
multi
);
this
.
field
=
field
;
this
.
field
=
field
;
}
}
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java
View file @
a994b1c9
package
us
.
codecraft
.
webmagic
.
model
;
package
us
.
codecraft
.
webmagic
.
model
;
import
us.codecraft.webmagic.model.annotation.Experimental
;
/**
/**
* 标志一个Model的key。<br>
* Interface to be implemented by page models.<br>
* 实现了这个接口的Model在输出时会使用getKey()作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。<br>
* Can be used to identify a page model, or be used as name of file storing the object.<br>
* 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8 。<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-10 <br>
* @since 0.2.0
* Time: 上午7:39 <br>
*/
*/
@Experimental
public
interface
HasKey
{
public
interface
HasKey
{
/**
/**
*
在输出时会使用key作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。
*
*
*
* @return key
* @return key
*/
*/
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
View file @
a994b1c9
...
@@ -14,10 +14,10 @@ import java.util.regex.Matcher;
...
@@ -14,10 +14,10 @@ import java.util.regex.Matcher;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
/**
/**
* 基于PageProcessor的扩展点。<br>
* The extension to PageProcessor for page model extractor.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* @since 0.2.0
* Time: 下午8:46 <br>
*/
*/
class
ModelPageProcessor
implements
PageProcessor
{
class
ModelPageProcessor
implements
PageProcessor
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java
View file @
a994b1c9
...
@@ -11,11 +11,10 @@ import java.util.Map;
...
@@ -11,11 +11,10 @@ import java.util.Map;
import
java.util.concurrent.ConcurrentHashMap
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
/**
*
基于Pipeline的扩展点,用于实现注解格式的Pipeline。<br>
*
The extension to Pipeline for page model extractor.
*
与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-2 <br>
* @since 0.2.0
* Time: 上午10:47 <br>
*/
*/
class
ModelPipeline
implements
Pipeline
{
class
ModelPipeline
implements
Pipeline
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
View file @
a994b1c9
...
@@ -42,7 +42,7 @@ public class OOSpider extends Spider {
...
@@ -42,7 +42,7 @@ public class OOSpider extends Spider {
}
}
/**
/**
*
创建一个爬虫。<br>
*
create a spider
* @param site
* @param site
* @param pageModelPipeline
* @param pageModelPipeline
* @param pageModels
* @param pageModels
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
a994b1c9
...
@@ -15,11 +15,10 @@ import java.util.List;
...
@@ -15,11 +15,10 @@ import java.util.List;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
/**
/**
*
Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br>
*
The main internal logic of page model extractor.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* @since 0.2.0
* Time: 下午9:33 <br>
*/
*/
class
PageModelExtractor
{
class
PageModelExtractor
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java
View file @
a994b1c9
...
@@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
...
@@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
/**
/**
* Implement PageModelPipeline to persist your page model.
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* @since 0.2.0
* Time: 上午9:34 <br>
*/
*/
public
interface
PageModelPipeline
<
T
>
{
public
interface
PageModelPipeline
<
T
>
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
View file @
a994b1c9
<html>
<html>
<body>
<body>
Annotations for defin
e a class
.
Annotations for defin
ing an extractor
.
</body>
</body>
</html>
</html>
webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
View file @
a994b1c9
...
@@ -6,8 +6,11 @@ import java.util.ArrayList;
...
@@ -6,8 +6,11 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* JsonPath
* JsonPath selector.<br>
* Used to extract content from JSON.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
*/
public
class
JsonPathSelector
implements
Selector
{
public
class
JsonPathSelector
implements
Selector
{
...
@@ -43,7 +46,7 @@ public class JsonPathSelector implements Selector {
...
@@ -43,7 +46,7 @@ public class JsonPathSelector implements Selector {
return
list
;
return
list
;
}
}
if
(
object
instanceof
List
)
{
if
(
object
instanceof
List
)
{
return
(
List
<
String
>)
object
;
return
(
List
<
String
>)
object
;
}
else
{
}
else
{
list
.
add
(
object
.
toString
());
list
.
add
(
object
.
toString
());
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment