Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
00dfebbc
Commit
00dfebbc
authored
Dec 18, 2016
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#424 remove guava dep and add fix docs
parent
c2531c68
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
23 changed files
with
75 additions
and
52 deletions
+75
-52
pom.xml
pom.xml
+5
-5
pom.xml
webmagic-core/pom.xml
+0
-11
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+9
-8
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+3
-4
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+2
-2
HashSetDuplicateRemover.java
...webmagic/scheduler/component/HashSetDuplicateRemover.java
+2
-2
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+1
-1
WMCollections.java
.../main/java/us/codecraft/webmagic/utils/WMCollections.java
+30
-0
pom.xml
webmagic-extension/pom.xml
+6
-0
ExpressionType.java
...va/us/codecraft/webmagic/configurable/ExpressionType.java
+0
-1
ExtractRule.java
.../java/us/codecraft/webmagic/configurable/ExtractRule.java
+0
-1
PhantomJSDownloader.java
...us/codecraft/webmagic/downloader/PhantomJSDownloader.java
+1
-1
CompositePageProcessor.java
...us/codecraft/webmagic/handler/CompositePageProcessor.java
+0
-1
SubPageProcessor.java
.../java/us/codecraft/webmagic/handler/SubPageProcessor.java
+0
-1
SpiderMonitor.java
...ain/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
+1
-0
BloomFilterDuplicateRemover.java
...craft/webmagic/scheduler/BloomFilterDuplicateRemover.java
+9
-2
BloomFilterDuplicateRemoverTest.java
...t/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
+0
-1
BaiduNews.java
...n/java/us/codecraft/webmagic/model/samples/BaiduNews.java
+0
-1
QQMeishi.java
...in/java/us/codecraft/webmagic/model/samples/QQMeishi.java
+0
-1
OschinaBlogPageProcesser.java
.../codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+1
-1
PhantomJSPageProcessor.java
...us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
+1
-1
OneFilePipeline.java
.../codecraft/webmagic/samples/pipeline/OneFilePipeline.java
+0
-3
ScriptConsole.java
...ain/java/us/codecraft/webmagic/scripts/ScriptConsole.java
+4
-4
No files found.
pom.xml
View file @
00dfebbc
...
...
@@ -70,16 +70,16 @@
<artifactId>
httpclient
</artifactId>
<version>
4.5.2
</version>
</dependency>
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
<artifactId>
json-path
</artifactId>
<version>
0.8.1
</version>
</dependency>
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
<version>
15.0
</version>
</dependency>
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
<artifactId>
json-path
</artifactId>
<version>
0.8.1
</version>
</dependency>
<dependency>
<groupId>
org.slf4j
</groupId>
<artifactId>
slf4j-api
</artifactId>
...
...
webmagic-core/pom.xml
View file @
00dfebbc
...
...
@@ -20,11 +20,6 @@
<artifactId>
junit
</artifactId>
</dependency>
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
</dependency>
<dependency>
<groupId>
org.apache.commons
</groupId>
<artifactId>
commons-lang3
</artifactId>
...
...
@@ -73,12 +68,6 @@
<dependency>
<groupId>
com.jayway.jsonpath
</groupId>
<artifactId>
json-path
</artifactId>
<exclusions>
<exclusion>
<groupId>
commons-lang
</groupId>
<artifactId>
commons-lang
</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
;
import
com.google.common.collect.HashBasedTable
;
import
com.google.common.collect.Table
;
import
org.apache.http.HttpHost
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.SimpleProxyPool
;
import
org.apache.http.auth.UsernamePasswordCredentials
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.proxy.ProxyPool
;
import
us.codecraft.webmagic.proxy.SimpleProxyPool
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.util.*
;
...
...
@@ -27,7 +24,7 @@ public class Site {
private
Map
<
String
,
String
>
defaultCookies
=
new
LinkedHashMap
<
String
,
String
>();
private
Table
<
String
,
String
,
String
>
cookies
=
HashBasedTable
.
create
();
private
Map
<
String
,
Map
<
String
,
String
>>
cookies
=
new
HashMap
<
String
,
Map
<
String
,
String
>>
();
private
String
charset
;
...
...
@@ -104,7 +101,10 @@ public class Site {
* @return this
*/
public
Site
addCookie
(
String
domain
,
String
name
,
String
value
)
{
cookies
.
put
(
domain
,
name
,
value
);
if
(!
cookies
.
containsKey
(
domain
)){
cookies
.
put
(
domain
,
new
HashMap
<
String
,
String
>());
}
cookies
.
get
(
domain
).
put
(
name
,
value
);
return
this
;
}
...
...
@@ -134,7 +134,7 @@ public class Site {
* @return get cookies
*/
public
Map
<
String
,
Map
<
String
,
String
>>
getAllCookies
()
{
return
cookies
.
rowMap
()
;
return
cookies
;
}
/**
...
...
@@ -483,6 +483,7 @@ public class Site {
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
*
* @param httpProxyList httpProxyList
* @param isUseLastProxy isUseLastProxy
* @return this
*/
public
Site
setHttpProxyPool
(
List
<
String
[]>
httpProxyList
,
boolean
isUseLastProxy
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
;
import
com.google.common.collect.Lists
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.http.HttpHost
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.downloader.Downloader
;
...
...
@@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.thread.CountableThreadPool
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.Closeable
;
import
java.io.IOException
;
...
...
@@ -173,9 +172,9 @@ public class Spider implements Runnable, Task {
*
* @param scheduler scheduler
* @return this
* @Deprecated
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/
@Deprecated
public Spider scheduler(Scheduler scheduler) {
    // Deprecated alias: forwards the argument to setScheduler and hands back
    // whatever that call returns.
    return this.setScheduler(scheduler);
}
...
...
@@ -499,7 +498,7 @@ public class Spider implements Runnable, Task {
}
public
<
T
>
T
get
(
String
url
)
{
List
<
String
>
urls
=
List
s
.
newArrayList
(
url
);
List
<
String
>
urls
=
WMCollection
s
.
newArrayList
(
url
);
List
<
T
>
resultItemses
=
getAll
(
urls
);
if
(
resultItemses
!=
null
&&
resultItemses
.
size
()
>
0
)
{
return
resultItemses
.
get
(
0
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
.
downloader
;
import
com.google.common.collect.Sets
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpHost
;
...
...
@@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
...
...
@@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader {
charset
=
site
.
getCharset
();
headers
=
site
.
getHeaders
();
}
else
{
acceptStatCode
=
Set
s
.
newHashSet
(
200
);
acceptStatCode
=
WMCollection
s
.
newHashSet
(
200
);
}
logger
.
info
(
"downloading page {}"
,
request
.
getUrl
());
CloseableHttpResponse
httpResponse
=
null
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
.
scheduler
.
component
;
import
com.google.common.collect.Sets
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
java.util.Collections
;
import
java.util.Set
;
import
java.util.concurrent.ConcurrentHashMap
;
...
...
@@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap;
*/
public
class
HashSetDuplicateRemover
implements
DuplicateRemover
{
private
Set
<
String
>
urls
=
Set
s
.
newSetFromMap
(
new
ConcurrentHashMap
<
String
,
Boolean
>());
private
Set
<
String
>
urls
=
Collection
s
.
newSetFromMap
(
new
ConcurrentHashMap
<
String
,
Boolean
>());
@Override
public
boolean
isDuplicate
(
Request
request
,
Task
task
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
View file @
00dfebbc
...
...
@@ -33,11 +33,11 @@ public abstract class Selectors {
}
/**
* @Deprecated
* @see #xpath(String)
* @param expr expr
* @return new selector
*/
@Deprecated
public static XpathSelector xsoup(String expr) {
    // Deprecated factory: wraps the given expression in a fresh XpathSelector.
    final XpathSelector selector = new XpathSelector(expr);
    return selector;
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java
0 → 100644
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
.
utils
;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* @author code4crafter@gmail.com
* Date: 16/12/18
* Time: 上午10:16
*/
public class WMCollections {

    /**
     * Creates a new, mutable {@link HashSet} holding the given elements.
     * Duplicate arguments collapse into a single entry; the iteration order
     * of the returned set is unspecified.
     *
     * @param t   the elements to put into the set (may be empty)
     * @param <T> the element type
     * @return a new mutable set containing every distinct element of {@code t}
     * @throws NullPointerException if the varargs array itself is {@code null}
     */
    public static <T> Set<T> newHashSet(T... t) {
        // The copy constructor sizes the backing table from the element count
        // and the load factor, so the table is not resized while it is filled
        // (a capacity of exactly t.length would be outgrown at 75% occupancy).
        return new HashSet<T>(Arrays.asList(t));
    }

    /**
     * Creates a new, mutable {@link ArrayList} holding the given elements in
     * argument order, duplicates included.
     *
     * @param t   the elements to put into the list (may be empty)
     * @param <T> the element type
     * @return a new mutable list containing the elements of {@code t} in order
     * @throws NullPointerException if the varargs array itself is {@code null}
     */
    public static <T> List<T> newArrayList(T... t) {
        // Arrays.asList is only a fixed-size view; copying it yields a list
        // that callers can grow and shrink freely.
        return new ArrayList<T>(Arrays.asList(t));
    }
}
webmagic-extension/pom.xml
View file @
00dfebbc
...
...
@@ -15,6 +15,12 @@
<artifactId>
jedis
</artifactId>
<version>
2.9.0
</version>
</dependency>
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
<version>
15.0
</version>
<optional>
true
</optional>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-core
</artifactId>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java
View file @
00dfebbc
...
...
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public
enum
ExpressionType
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
View file @
00dfebbc
...
...
@@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public
class
ExtractRule
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
View file @
00dfebbc
...
...
@@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
*
* @param phantomJsCommand
* @param phantomJsCommand
phantomJsCommand
*/
public
PhantomJSDownloader
(
String
phantomJsCommand
)
{
this
.
initPhantomjsCrawlPath
();
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
View file @
00dfebbc
...
...
@@ -9,7 +9,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public
class
CompositePageProcessor
implements
PageProcessor
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
View file @
00dfebbc
...
...
@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public
interface
SubPageProcessor
extends
RequestMatcher
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
View file @
00dfebbc
...
...
@@ -45,6 +45,7 @@ public class SpiderMonitor {
*
* @param spiders spiders
* @return this
* @throws JMException
*/
public
synchronized
SpiderMonitor
register
(
Spider
...
spiders
)
throws
JMException
{
for
(
Spider
spider
:
spiders
)
{
...
...
webmagic-
core/src/main/java/us/codecraft/webmagic/scheduler/component
/BloomFilterDuplicateRemover.java
→
webmagic-
extension/src/main/java/us/codecraft/webmagic/scheduler
/BloomFilterDuplicateRemover.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
.
scheduler
.
component
;
package
us
.
codecraft
.
webmagic
.
scheduler
;
/**
* @author code4crafter@gmail.com
* Date: 16/12/18
* Time: 上午10:23
*/
import
com.google.common.hash.BloomFilter
;
import
com.google.common.hash.Funnels
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.scheduler.component.DuplicateRemover
;
import
java.nio.charset.Charset
;
import
java.util.concurrent.atomic.AtomicInteger
;
...
...
@@ -67,4 +74,4 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
public int getTotalRequestsCount(Task task) {
    // The task argument is not consulted here; the value is read straight
    // from this remover's counter.
    final int total = counter.get();
    return total;
}
}
}
\ No newline at end of file
webmagic-
core
/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
→
webmagic-
extension
/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
View file @
00dfebbc
...
...
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover
;
import
us.codecraft.webmagic.scheduler.component.DuplicateRemover
;
import
us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover
;
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java
View file @
00dfebbc
...
...
@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
/**
* @author code4crafter@gmail.com
* @date 14-4-9
*/
public
class
BaiduNews
{
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java
View file @
00dfebbc
...
...
@@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com
* @date 14-4-11
*/
@TargetUrl
(
"http://meishi.qq.com/beijing/c/all[\\-p2]*"
)
@ExtractBy
(
value
=
"//ul[@id=\"promos_list2\"]/li"
,
multi
=
true
)
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
View file @
00dfebbc
...
...
@@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.monitor.SpiderMonitor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover
;
import
javax.management.JMException
;
import
java.util.List
;
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
View file @
00dfebbc
...
...
@@ -13,7 +13,7 @@ import java.util.List;
/**
* Created by dolphineor on 2014-11-21.
* <p
/
>
* <p>
* 以淘宝为例, 搜索冬装的相关结果
*/
public
class
PhantomJSPageProcessor
implements
PageProcessor
{
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java
View file @
00dfebbc
...
...
@@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline {
private
PrintWriter
printWriter
;
/**
* create a FilePipeline with default path"/data/webmagic/"
*/
public
OneFilePipeline
()
throws
FileNotFoundException
,
UnsupportedEncodingException
{
this
(
"/data/webmagic/"
);
}
...
...
webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
View file @
00dfebbc
package
us
.
codecraft
.
webmagic
.
scripts
;
import
com.google.common.collect.Sets
;
import
org.apache.commons.cli.*
;
import
org.apache.log4j.Level
;
import
org.apache.log4j.Logger
;
...
...
@@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.utils.WMCollections
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
@@ -29,8 +29,8 @@ public class ScriptConsole {
private
static
Map
<
Language
,
Set
<
String
>>
alias
=
new
HashMap
<
Language
,
Set
<
String
>>();
static
{
alias
.
put
(
Language
.
JavaScript
,
Set
s
.<
String
>
newHashSet
(
"js"
,
"javascript"
,
"JavaScript"
,
"JS"
));
alias
.
put
(
Language
.
JRuby
,
Set
s
.<
String
>
newHashSet
(
"ruby"
,
"jruby"
,
"Ruby"
,
"JRuby"
));
alias
.
put
(
Language
.
JavaScript
,
WMCollection
s
.<
String
>
newHashSet
(
"js"
,
"javascript"
,
"JavaScript"
,
"JS"
));
alias
.
put
(
Language
.
JRuby
,
WMCollection
s
.<
String
>
newHashSet
(
"ruby"
,
"jruby"
,
"Ruby"
,
"JRuby"
));
}
public
void
setLanguagefromArg
(
String
arg
)
{
...
...
@@ -93,7 +93,7 @@ public class ScriptConsole {
.
language
(
params
.
getLanguage
()).
scriptFromFile
(
params
.
getScriptFileName
()).
thread
(
params
.
getThread
()).
build
();
pageProcessor
.
getSite
().
setSleepTime
(
params
.
getSleepTime
());
pageProcessor
.
getSite
().
setRetryTimes
(
3
);
pageProcessor
.
getSite
().
setAcceptStatCode
(
Set
s
.<
Integer
>
newHashSet
(
200
,
404
,
403
,
500
,
502
));
pageProcessor
.
getSite
().
setAcceptStatCode
(
WMCollection
s
.<
Integer
>
newHashSet
(
200
,
404
,
403
,
500
,
502
));
Spider
spider
=
Spider
.
create
(
pageProcessor
).
thread
(
params
.
getThread
());
spider
.
clearPipeline
().
addPipeline
(
new
Pipeline
()
{
@Override
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment