Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
96454fd7
Commit
96454fd7
authored
Jul 24, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update java doc
parent
81e7f798
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
62 additions
and
25 deletions
+62
-25
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+2
-2
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+3
-3
ConsolePipeline.java
.../java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+1
-0
FilePipeline.java
...ain/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+8
-0
Pipeline.java
...rc/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
+1
-0
PageProcessor.java
...n/java/us/codecraft/webmagic/processor/PageProcessor.java
+4
-2
SimplePageProcessor.java
.../us/codecraft/webmagic/processor/SimplePageProcessor.java
+2
-0
FileCacheQueueScheduler.java
...codecraft/webmagic/schedular/FileCacheQueueScheduler.java
+3
-1
QueueScheduler.java
.../java/us/codecraft/webmagic/schedular/QueueScheduler.java
+1
-0
Scheduler.java
.../main/java/us/codecraft/webmagic/schedular/Scheduler.java
+12
-0
package.html
...rc/main/java/us/codecraft/webmagic/schedular/package.html
+1
-1
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+1
-0
RegexResult.java
...main/java/us/codecraft/webmagic/selector/RegexResult.java
+1
-0
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+1
-1
XpathSelector.java
...in/java/us/codecraft/webmagic/selector/XpathSelector.java
+5
-4
UrlUtils.java
...e/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+10
-4
UrlUtilsTest.java
...c/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+5
-6
FreemarkerPipelineTest.java
...st/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
96454fd7
...
...
@@ -101,7 +101,7 @@ public class Page {
if
(
StringUtils
.
isBlank
(
s
)
||
s
.
equals
(
"#"
)
||
s
.
startsWith
(
"javascript:"
))
{
break
;
}
s
=
UrlUtils
.
fixRelativ
eUrl
(
s
,
url
.
toString
());
s
=
UrlUtils
.
canonicaliz
eUrl
(
s
,
url
.
toString
());
targetRequests
.
add
(
new
Request
(
s
));
}
}
...
...
@@ -116,7 +116,7 @@ public class Page {
return
;
}
synchronized
(
targetRequests
)
{
requestString
=
UrlUtils
.
fixRelativ
eUrl
(
requestString
,
url
.
toString
());
requestString
=
UrlUtils
.
canonicaliz
eUrl
(
requestString
,
url
.
toString
());
targetRequests
.
add
(
new
Request
(
requestString
));
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
96454fd7
...
...
@@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader {
//charset
if
(
charset
==
null
)
{
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
charset
=
new
PlainText
(
value
).
regex
(
"charset=([^\\s]+)"
).
toString
(
);
charset
=
UrlUtils
.
getCharset
(
value
);
}
//
handleGzip
(
httpResponse
);
...
...
@@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader {
Header
ceheader
=
httpResponse
.
getEntity
().
getContentEncoding
();
if
(
ceheader
!=
null
)
{
HeaderElement
[]
codecs
=
ceheader
.
getElements
();
for
(
int
i
=
0
;
i
<
codecs
.
length
;
i
++
)
{
if
(
codec
s
[
i
]
.
getName
().
equalsIgnoreCase
(
"gzip"
))
{
for
(
HeaderElement
codec
:
codecs
)
{
if
(
codec
.
getName
().
equalsIgnoreCase
(
"gzip"
))
{
httpResponse
.
setEntity
(
new
GzipDecompressingEntity
(
httpResponse
.
getEntity
()));
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
View file @
96454fd7
...
...
@@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable;
import
java.util.Map
;
/**
* 命令行输出抽取结果。可用于测试。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:45
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
View file @
96454fd7
...
...
@@ -11,6 +11,7 @@ import java.io.IOException;
import
java.io.PrintWriter
;
/**
* 持久化到文件的接口。
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午6:28
...
...
@@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline {
private
Logger
logger
=
Logger
.
getLogger
(
getClass
());
/**
* Creates a FilePipeline that uses the default save path "/data/temp/webmagic/".
*/
public
FilePipeline
()
{
}
/**
* Creates a FilePipeline.
* @param path the path where files are saved
*/
public
FilePipeline
(
String
path
)
{
this
.
path
=
path
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
View file @
96454fd7
...
...
@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Task
;
/**
* Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:39
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
View file @
96454fd7
...
...
@@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Site
;
/**
* 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
* Implement this interface to build various spiders.<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午11:42
...
...
@@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site;
public
interface
PageProcessor
{
/**
*
extends the class to implements variaty spiders
*
定义如何处理页面,包括链接提取、内容抽取等。
* @param page
*/
public
void
process
(
Page
page
);
/**
*
the site the processor for
*
定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
* @return site
*/
public
Site
getSite
();
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
View file @
96454fd7
...
...
@@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import
java.util.List
;
/**
* 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-22
* Time: 下午9:15
...
...
@@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor {
/**
* Creates a processor whose site is configured from the start URL and whose
* link pattern is compiled from a wildcard expression.
* @param startUrl start URL added to the site; the site's domain is derived from it via UrlUtils.getDomain
* @param urlPattern wildcard expression for links, where "*" is the wildcard
*/
public
SimplePageProcessor
(
String
startUrl
,
String
urlPattern
)
{
this
.
site
=
Site
.
me
().
addStartUrl
(
startUrl
).
setDomain
(
UrlUtils
.
getDomain
(
startUrl
)).
setUserAgent
(
UA
);
// Compile the "*" wildcard expression to a regex: escape literal dots, turn each "*"
// into a run of characters other than quotes and '#', and wrap the result in a capturing group.
this
.
urlPattern
=
"("
+
urlPattern
.
replace
(
"."
,
"\\."
).
replace
(
"*"
,
"[^\"'#]*"
)+
")"
;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
View file @
96454fd7
...
...
@@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import
java.util.concurrent.atomic.AtomicInteger
;
/**
* 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:13
...
...
@@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile
();
readUrlFile
();
}
catch
(
IOException
e
)
{
logger
.
error
(
"init file error"
,
e
);
}
}
...
...
@@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler {
private
void
readCursorFile
()
throws
IOException
{
BufferedReader
fileCursorReader
=
new
BufferedReader
(
new
FileReader
(
getFileName
(
fileCursor
)));
String
line
=
null
;
String
line
;
//read the last number
while
((
line
=
fileCursorReader
.
readLine
())
!=
null
)
{
cursor
=
new
AtomicInteger
(
NumberUtils
.
toInt
(
line
));
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
View file @
96454fd7
...
...
@@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue;
import
java.util.concurrent.LinkedBlockingQueue
;
/**
* 内存队列实现的线程安全Scheduler。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:13
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
View file @
96454fd7
...
...
@@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request;
import
us.codecraft.webmagic.Task
;
/**
* Interface for URL management and scheduling, including the crawl queue and URL de-duplication.<br>
* Its methods take a Task parameter, which is reserved for sharing one Scheduler among multiple Tasks (a Spider is a Task).<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:12 PM
*/
public
interface
Scheduler
{
/**
* Adds a request to be crawled.
* @param request the request to be crawled
* @param task the task definition, to support one Scheduler serving multiple Tasks
*/
public
void
push
(
Request
request
,
Task
task
);
/**
* Returns the next request to crawl.
* @param task the task definition, to support one Scheduler serving multiple Tasks
* @return the next request to crawl
*/
public
Request
poll
(
Task
task
);
}
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html
View file @
96454fd7
<html>
<body>
包含url管理和调度的接口Schedul
a
r及它的几个实现类。
包含url管理和调度的接口Schedul
e
r及它的几个实现类。
</body>
</html>
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
96454fd7
...
...
@@ -10,6 +10,7 @@ import java.util.ArrayList;
import
java.util.List
;
/**
* css风格的选择器。包装了Jsoup。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午9:39
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
View file @
96454fd7
package
us
.
codecraft
.
webmagic
.
selector
;
/**
* 封装正则表达式抽取接口的类。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午7:39
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
96454fd7
...
...
@@ -18,7 +18,7 @@ public interface Selectable {
public
Selectable
xpath
(
String
xpath
);
/**
* select list with
jquery
selector
* select list with
css
selector
*
* @param
* @return
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
View file @
96454fd7
...
...
@@ -6,6 +6,7 @@ import java.util.ArrayList;
import
java.util.List
;
/**
* xpath的选择器。包装了HtmlCleaner。<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午9:39
...
...
@@ -52,12 +53,12 @@ public class XpathSelector implements Selector {
try
{
Object
[]
objects
=
tagNode
.
evaluateXPath
(
xpathStr
);
if
(
objects
!=
null
&&
objects
.
length
>=
1
)
{
for
(
int
i
=
0
;
i
<
objects
.
length
;
i
++
)
{
if
(
object
s
[
i
]
instanceof
TagNode
)
{
TagNode
tagNode1
=
(
TagNode
)
object
s
[
i
]
;
for
(
Object
object
:
objects
)
{
if
(
object
instanceof
TagNode
)
{
TagNode
tagNode1
=
(
TagNode
)
object
;
results
.
add
(
htmlCleaner
.
getInnerHtml
(
tagNode1
));
}
else
{
results
.
add
(
object
s
[
i
]
.
toString
());
results
.
add
(
object
.
toString
());
}
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
View file @
96454fd7
...
...
@@ -14,7 +14,13 @@ public class UrlUtils {
private
static
Pattern
relativePathPattern
=
Pattern
.
compile
(
"^([\\.]+)/"
);
public
static
String
fixRelativeUrl
(
String
url
,
String
refer
)
{
/**
* 将url相对地址转化为绝对地址
* @param url url地址
* @param refer url地址来自哪个页面
* @return
*/
public
static
String
canonicalizeUrl
(
String
url
,
String
refer
)
{
if
(
StringUtils
.
isBlank
(
url
)
||
StringUtils
.
isBlank
(
refer
))
{
return
url
;
}
...
...
@@ -62,12 +68,12 @@ public class UrlUtils {
private
static
Pattern
patternForProtocal
=
Pattern
.
compile
(
"[\\w]+://"
);
public
static
String
removeProtoc
a
l
(
String
url
)
{
public
static
String
removeProtoc
o
l
(
String
url
)
{
return
patternForProtocal
.
matcher
(
url
).
replaceAll
(
""
);
}
public
static
String
getDomain
(
String
url
)
{
String
domain
=
removeProtoc
a
l
(
url
);
String
domain
=
removeProtoc
o
l
(
url
);
int
i
=
StringUtils
.
indexOf
(
domain
,
"/"
,
1
);
if
(
i
>
0
)
{
domain
=
StringUtils
.
substring
(
domain
,
0
,
i
);
...
...
@@ -84,7 +90,7 @@ public class UrlUtils {
while
(
matcher
.
find
())
{
stringBuilder
.
append
(
StringUtils
.
substring
(
html
,
lastEnd
,
matcher
.
start
()));
stringBuilder
.
append
(
matcher
.
group
(
1
));
stringBuilder
.
append
(
"\""
+
fixRelativeUrl
(
matcher
.
group
(
2
),
url
)
+
"\""
);
stringBuilder
.
append
(
"\""
).
append
(
canonicalizeUrl
(
matcher
.
group
(
2
),
url
)).
append
(
"\""
);
lastEnd
=
matcher
.
end
();
}
stringBuilder
.
append
(
StringUtils
.
substring
(
html
,
lastEnd
));
...
...
webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
View file @
96454fd7
...
...
@@ -12,18 +12,18 @@ public class UrlUtilsTest {
@Test
public
void
testFixRelativeUrl
()
{
String
fixrelativeurl
=
UrlUtils
.
fixRelativ
eUrl
(
"aa"
,
"http://www.dianping.com/sh/ss/com"
);
String
fixrelativeurl
=
UrlUtils
.
canonicaliz
eUrl
(
"aa"
,
"http://www.dianping.com/sh/ss/com"
);
System
.
out
.
println
(
"fix: "
+
fixrelativeurl
);
Assert
.
assertEquals
(
"http://www.dianping.com/sh/ss/aa"
,
fixrelativeurl
);
fixrelativeurl
=
UrlUtils
.
fixRelativ
eUrl
(
"../aa"
,
"http://www.dianping.com/sh/ss/com"
);
fixrelativeurl
=
UrlUtils
.
canonicaliz
eUrl
(
"../aa"
,
"http://www.dianping.com/sh/ss/com"
);
Assert
.
assertEquals
(
"http://www.dianping.com/sh/aa"
,
fixrelativeurl
);
fixrelativeurl
=
UrlUtils
.
fixRelativ
eUrl
(
"..../aa"
,
"http://www.dianping.com/sh/ss/com"
);
fixrelativeurl
=
UrlUtils
.
canonicaliz
eUrl
(
"..../aa"
,
"http://www.dianping.com/sh/ss/com"
);
Assert
.
assertEquals
(
"http://www.dianping.com/aa"
,
fixrelativeurl
);
fixrelativeurl
=
UrlUtils
.
fixRelativ
eUrl
(
".../aa"
,
"http://www.dianping.com/sh/ss/com"
);
fixrelativeurl
=
UrlUtils
.
canonicaliz
eUrl
(
".../aa"
,
"http://www.dianping.com/sh/ss/com"
);
Assert
.
assertEquals
(
"http://www.dianping.com/aa"
,
fixrelativeurl
);
fixrelativeurl
=
UrlUtils
.
fixRelativ
eUrl
(
"..aa"
,
"http://www.dianping.com/sh/ss/com"
);
fixrelativeurl
=
UrlUtils
.
canonicaliz
eUrl
(
"..aa"
,
"http://www.dianping.com/sh/ss/com"
);
Assert
.
assertEquals
(
"http://www.dianping.com/sh/ss/..aa"
,
fixrelativeurl
);
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com");
// System.out.println("fix: " + fixrelativeurl);
...
...
@@ -628,7 +628,6 @@ public class UrlUtilsTest {
"\t\t\t<script src=\"http://discuz.gtimg.cn/cloud/scripts/discuz_tips.js?v=1\" type=\"text/javascript\" charset=\"UTF-8\"></script></body>\n"
+
"</html>\n"
;
String
newHtml
=
UrlUtils
.
fixAllRelativeHrefs
(
html
,
"http://www.huxiu.com/"
);
String
text
=
"<a class=\"xu_subscribe\" href=\"home.php?mod=spacecp&ac=profile&op=info\" >订阅<span >虎嗅</span></a>"
;
Assert
.
assertTrue
(
html
.
contains
(
"<a href=\"article"
));
Assert
.
assertFalse
(
newHtml
.
contains
(
"<a href=\"article"
));
}
...
...
webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
View file @
96454fd7
...
...
@@ -14,6 +14,6 @@ public class FreemarkerPipelineTest {
@Test
public
void
testTemplateLoad
()
throws
IOException
{
FreemarkerPipeline
freemarkerPipeline
=
new
FreemarkerPipeline
(
"wordpress.ftl"
);
new
FreemarkerPipeline
(
"wordpress.ftl"
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment