Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
8f774afc
Commit
8f774afc
authored
Nov 05, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add direct download
parent
86cfefb5
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
225 additions
and
50 deletions
+225
-50
ResultItems.java
...core/src/main/java/us/codecraft/webmagic/ResultItems.java
+9
-0
Site.java
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+24
-0
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+69
-8
HttpClientGenerator.java
...us/codecraft/webmagic/downloader/HttpClientGenerator.java
+42
-35
CollectorPipeline.java
...ava/us/codecraft/webmagic/pipeline/CollectorPipeline.java
+25
-0
BaiduBaikePageProcesser.java
...t/webmagic/processor/example/BaiduBaikePageProcesser.java
+48
-0
GithubRepoPageProcesser.java
...t/webmagic/processor/example/GithubRepoPageProcesser.java
+2
-2
OschinaBlogPageProcesser.java
.../webmagic/processor/example/OschinaBlogPageProcesser.java
+2
-2
UrlUtils.java
...e/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+3
-2
BaiduBaike.java
...c/main/java/us/codecraft/webmagic/example/BaiduBaike.java
+1
-1
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
View file @
8f774afc
...
@@ -68,4 +68,13 @@ public class ResultItems {
...
@@ -68,4 +68,13 @@ public class ResultItems {
this
.
skip
=
skip
;
this
.
skip
=
skip
;
return
this
;
return
this
;
}
}
/**
 * Renders the fields, the originating request and the skip flag for debugging.
 *
 * @return a string of the form {@code ResultItems{fields=..., request=..., skip=...}}
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("ResultItems{");
    text.append("fields=").append(fields);
    text.append(", request=").append(request);
    text.append(", skip=").append(skip);
    text.append('}');
    return text.toString();
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
View file @
8f774afc
...
@@ -43,6 +43,8 @@ public class Site {
...
@@ -43,6 +43,8 @@ public class Site {
private
HttpHost
httpProxy
;
private
HttpHost
httpProxy
;
private
boolean
useGzip
=
true
;
public
static
interface
HeaderConst
{
public
static
interface
HeaderConst
{
public
static
final
String
REFERER
=
"Referer"
;
public
static
final
String
REFERER
=
"Referer"
;
...
@@ -199,7 +201,10 @@ public class Site {
...
@@ -199,7 +201,10 @@ public class Site {
/**
/**
* Add a url to start url.<br>
* Add a url to start url.<br>
* Because start urls belong to the Spider rather than to the Site, this has moved to {@link Spider#addUrl(String...)}.
*
*
* @deprecated
* @see Spider#addUrl(String...)
* @param startUrl
* @param startUrl
* @return this
* @return this
*/
*/
...
@@ -209,7 +214,10 @@ public class Site {
...
@@ -209,7 +214,10 @@ public class Site {
/**
/**
* Add a url to start url.<br>
* Add a url to start url.<br>
* Because start urls belong to the Spider rather than to the Site, this has moved to {@link Spider#addRequest(Request...)}.
*
*
* @deprecated
* @see Spider#addRequest(Request...)
* @param startUrl
* @param startUrl
* @return this
* @return this
*/
*/
...
@@ -312,6 +320,22 @@ public class Site {
...
@@ -312,6 +320,22 @@ public class Site {
return
this
;
return
this
;
}
}
/**
 * Tells whether gzip is enabled for this site.
 *
 * @return the current value of the {@code useGzip} flag
 */
public boolean isUseGzip() {
    return this.useGzip;
}
/**
 * Sets whether gzip should be used. <br>
 * The flag is true by default; pass false to disable gzip.
 *
 * @param useGzip true to use gzip, false to disable it
 * @return this
 */
public Site setUseGzip(boolean useGzip) {
    this.useGzip = useGzip;
    return this;
}
public
Task
toTask
()
{
public
Task
toTask
()
{
return
new
Task
()
{
return
new
Task
()
{
@Override
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
8f774afc
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
com.google.common.collect.Lists
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.log4j.Logger
;
import
org.apache.log4j.Logger
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.pipeline.CollectorPipeline
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
...
@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
import
java.io.Closeable
;
import
java.io.Closeable
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.List
;
import
java.util.List
;
import
java.util.UUID
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.concurrent.locks.Condition
;
import
java.util.concurrent.locks.Condition
;
...
@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {
...
@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {
protected
final
static
int
STAT_STOPPED
=
2
;
protected
final
static
int
STAT_STOPPED
=
2
;
protected
boolean
spawnUrl
=
true
;
protected
boolean
destroyWhenExit
=
true
;
private
ReentrantLock
newUrlLock
=
new
ReentrantLock
();
private
ReentrantLock
newUrlLock
=
new
ReentrantLock
();
private
Condition
newUrlCondition
=
newUrlLock
.
newCondition
();
private
Condition
newUrlCondition
=
newUrlLock
.
newCondition
();
...
@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
...
@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
pipelines
.
add
(
new
ConsolePipeline
());
pipelines
.
add
(
new
ConsolePipeline
());
}
}
downloader
.
setThread
(
threadNum
);
downloader
.
setThread
(
threadNum
);
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
if
(
executorService
==
null
||
executorService
.
isShutdown
())
{
executorService
=
ThreadUtils
.
newFixedThreadPool
(
threadNum
);
}
if
(
startRequests
!=
null
)
{
if
(
startRequests
!=
null
)
{
for
(
Request
request
:
startRequests
)
{
for
(
Request
request
:
startRequests
)
{
scheduler
.
push
(
request
,
this
);
scheduler
.
push
(
request
,
this
);
...
@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
...
@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
});
});
}
}
}
}
executorService
.
shutdown
();
stat
.
set
(
STAT_STOPPED
);
stat
.
set
(
STAT_STOPPED
);
// release some resources
// release some resources
destroy
();
if
(
destroyWhenExit
)
{
close
();
}
}
}
private
void
checkRunningStat
()
{
private
void
checkRunningStat
()
{
...
@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
...
@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
}
}
}
}
p
rotected
void
destroy
()
{
p
ublic
void
close
()
{
destroyEach
(
downloader
);
destroyEach
(
downloader
);
destroyEach
(
pageProcessor
);
destroyEach
(
pageProcessor
);
for
(
Pipeline
pipeline
:
pipelines
)
{
for
(
Pipeline
pipeline
:
pipelines
)
{
destroyEach
(
pipeline
);
destroyEach
(
pipeline
);
}
}
executorService
.
shutdown
();
}
}
private
void
destroyEach
(
Object
object
)
{
private
void
destroyEach
(
Object
object
)
{
...
@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
...
@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
}
}
protected
void
extractAndAddRequests
(
Page
page
)
{
protected
void
extractAndAddRequests
(
Page
page
)
{
if
(
CollectionUtils
.
isNotEmpty
(
page
.
getTargetRequests
()))
{
if
(
spawnUrl
&&
CollectionUtils
.
isNotEmpty
(
page
.
getTargetRequests
()))
{
for
(
Request
request
:
page
.
getTargetRequests
())
{
for
(
Request
request
:
page
.
getTargetRequests
())
{
addRequest
(
request
);
addRequest
(
request
);
}
}
...
@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
...
@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
}
}
private
void
addRequest
(
Request
request
)
{
private
void
addRequest
(
Request
request
)
{
if
(
site
.
getDomain
()
==
null
&&
request
!=
null
&&
request
.
getUrl
()
!=
null
)
{
site
.
setDomain
(
UrlUtils
.
getDomain
(
request
.
getUrl
()));
}
scheduler
.
push
(
request
,
this
);
scheduler
.
push
(
request
,
this
);
}
}
protected
void
checkIfRunning
()
{
protected
void
checkIfRunning
()
{
...
@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
...
@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
}
}
/**
/**
* Add urls to crawl.<br/>
* Add urls to crawl.
<br/>
*
*
* @param urls
* @param urls
* @return
* @return
...
@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
...
@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
return
this
;
return
this
;
}
}
/**
 * Downloads the given urls synchronously and returns what was collected.<br>
 * While this call runs, urls extracted from the pages are not followed and the spider
 * is not closed when the run ends, so the same instance can be used again.
 * Call {@link #close()} once the spider is no longer needed.
 *
 * @param urls urls to download
 * @return the result items gathered by a temporary {@link CollectorPipeline} during this call
 */
public List<ResultItems> getAll(Collection<String> urls) {
    // Remember the caller's settings so they are restored rather than forced back to true.
    final boolean previousDestroyWhenExit = destroyWhenExit;
    final boolean previousSpawnUrl = spawnUrl;
    final CollectorPipeline collectorPipeline = new CollectorPipeline();
    try {
        destroyWhenExit = false;
        spawnUrl = false;
        startRequests = UrlUtils.convertToRequests(urls);
        pipelines.add(collectorPipeline);
        run();
    } finally {
        // Detach the temporary pipeline; otherwise every call would leave another
        // collector registered that keeps receiving results of later runs.
        pipelines.remove(collectorPipeline);
        spawnUrl = previousSpawnUrl;
        destroyWhenExit = previousDestroyWhenExit;
    }
    return collectorPipeline.getCollector();
}
/**
 * Downloads a single url synchronously by delegating to {@link #getAll(Collection)}.
 *
 * @param url url to download
 * @return the first collected result, or null when nothing was collected
 */
public ResultItems get(String url) {
    List<ResultItems> collected = getAll(Lists.newArrayList(url));
    boolean hasResult = collected != null && !collected.isEmpty();
    return hasResult ? collected.get(0) : null;
}
/**
/**
* Add urls with information to crawl.<br/>
* Add urls with information to crawl.<br/>
*
*
...
@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
...
@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
return
this
;
return
this
;
}
}
/**
 * Tells whether urls extracted from downloaded pages are added for download.
 *
 * @return the current value of the {@code spawnUrl} flag
 */
public boolean isSpawnUrl() {
    return this.spawnUrl;
}
/**
 * Sets whether urls extracted from pages are added for download.<br>
 * When true, extracted urls are downloaded as well; when false, only the seed urls are downloaded. <br>
 * DO NOT change it unless you know what it means!
 *
 * @param spawnUrl true to follow extracted urls, false to download seed urls only
 * @return this
 * @since 0.4.0
 */
public Spider setSpawnUrl(boolean spawnUrl) {
    this.spawnUrl = spawnUrl;
    return this;
}
@Override
@Override
public
String
getUUID
()
{
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
if
(
uuid
!=
null
)
{
...
@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
...
@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
if
(
site
!=
null
)
{
if
(
site
!=
null
)
{
return
site
.
getDomain
();
return
site
.
getDomain
();
}
}
return
null
;
uuid
=
UUID
.
randomUUID
().
toString
();
return
uuid
;
}
}
@Override
@Override
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
View file @
8f774afc
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.*
;
import
org.apache.http.HttpException
;
import
org.apache.http.HttpRequest
;
import
org.apache.http.HttpRequestInterceptor
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
...
@@ -19,7 +20,7 @@ import java.util.Map;
...
@@ -19,7 +20,7 @@ import java.util.Map;
/**
/**
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.
3.3
* @since 0.
4.0
*/
*/
public
class
HttpClientGenerator
{
public
class
HttpClientGenerator
{
...
@@ -46,42 +47,48 @@ public class HttpClientGenerator {
...
@@ -46,42 +47,48 @@ public class HttpClientGenerator {
}
else
{
}
else
{
httpClientBuilder
.
setUserAgent
(
""
);
httpClientBuilder
.
setUserAgent
(
""
);
}
}
httpClientBuilder
.
addInterceptorFirst
(
new
HttpRequestInterceptor
()
{
if
(
site
==
null
||
site
.
isUseGzip
())
{
httpClientBuilder
.
addInterceptorFirst
(
new
HttpRequestInterceptor
()
{
public
void
process
(
public
void
process
(
final
HttpRequest
request
,
final
HttpRequest
request
,
final
HttpContext
context
)
throws
HttpException
,
IOException
{
final
HttpContext
context
)
throws
HttpException
,
IOException
{
if
(!
request
.
containsHeader
(
"Accept-Encoding"
))
{
if
(!
request
.
containsHeader
(
"Accept-Encoding"
))
{
request
.
addHeader
(
"Accept-Encoding"
,
"gzip"
);
request
.
addHeader
(
"Accept-Encoding"
,
"gzip"
);
}
}
}).
addInterceptorFirst
(
new
HttpResponseInterceptor
()
{
public
void
process
(
final
HttpResponse
response
,
final
HttpContext
context
)
throws
HttpException
,
IOException
{
HttpEntity
entity
=
response
.
getEntity
();
if
(
entity
!=
null
)
{
Header
ceheader
=
entity
.
getContentEncoding
();
if
(
ceheader
!=
null
)
{
HeaderElement
[]
codecs
=
ceheader
.
getElements
();
for
(
int
i
=
0
;
i
<
codecs
.
length
;
i
++)
{
if
(
codecs
[
i
].
getName
().
equalsIgnoreCase
(
"gzip"
))
{
response
.
setEntity
(
new
GzipDecompressingEntity
(
response
.
getEntity
()));
return
;
}
}
}
}
}
}
});
}
if
(
site
!=
null
){
});
httpClientBuilder
.
setRetryHandler
(
new
DefaultHttpRequestRetryHandler
(
site
.
getRetryTimes
(),
true
));
}
// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() {
//
// public void process(
// final HttpResponse response,
// final HttpContext context) throws HttpException, IOException {
// if (response.getStatusLine().getStatusCode() != 200) {
// return;
// }
// HttpEntity entity = response.getEntity();
// if (entity != null) {
// Header ceheader = entity.getContentEncoding();
// if (ceheader != null) {
// HeaderElement[] codecs = ceheader.getElements();
// for (int i = 0; i < codecs.length; i++) {
// if (codecs[i].getName().equalsIgnoreCase("gzip")) {
// response.setEntity(
// new GzipDecompressingEntity(response.getEntity()));
// return;
// }
// }
// }
// }
// }
//
// });
if
(
site
!=
null
)
{
httpClientBuilder
.
setRetryHandler
(
new
DefaultHttpRequestRetryHandler
(
site
.
getRetryTimes
(),
true
));
}
}
generateCookie
(
httpClientBuilder
,
site
);
generateCookie
(
httpClientBuilder
,
site
);
return
httpClientBuilder
.
build
();
return
httpClientBuilder
.
build
();
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
0 → 100644
View file @
8f774afc
package
us
.
codecraft
.
webmagic
.
pipeline
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
 * Pipeline that keeps every processed {@link ResultItems} in an in-memory list.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class CollectorPipeline implements Pipeline {

    /** Results in the order they were passed to {@link #process(ResultItems, Task)}. */
    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the given result to the collected list.
     *
     * @param resultItems result to store
     * @param task        task the result belongs to; not used here
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        this.collector.add(resultItems);
    }

    /**
     * Returns the collected results.
     *
     * @return the internal list itself, not a copy
     */
    public List<ResultItems> getCollector() {
        return this.collector;
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java
0 → 100644
View file @
8f774afc
package
us
.
codecraft
.
webmagic
.
processor
.
example
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
 * Example processor that extracts the name and description of Baidu Baike entries.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcesser implements PageProcessor {

    private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888))
            .setCharset("utf-8")
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setUseGzip(true);

    /**
     * Stores the entry title as field "name" and the paragraph text as field "description".
     *
     * @param page downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        String name = page.getHtml().$("h1.title div.lemmaTitleH1", "text").toString();
        page.putField("name", name);
        page.putField("description",
                page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads a fixed set of entries synchronously, prints their fields and closes the spider.
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        // NOTE(review): "地热发电" is listed twice, as in the original example - confirm whether that is intended.
        String[] keywords = {"水力发电", "风力发电", "太阳能", "地热发电", "众数", "地热发电"};
        List<String> list = new ArrayList<String>();
        for (String keyword : keywords) {
            list.add(String.format(urlTemplate, keyword));
        }
        List<ResultItems> resultItemses = spider.getAll(list);
        for (ResultItems items : resultItemses) {
            System.out.println(items.getAll());
        }
        spider.close();
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java
View file @
8f774afc
...
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
...
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
*/
public
class
GithubRepoPageProcesser
implements
PageProcessor
{
public
class
GithubRepoPageProcesser
implements
PageProcessor
{
private
Site
site
=
Site
.
me
().
addStartUrl
(
"https://github.com/code4craft"
).
setRetryTimes
(
3
).
setSleepTime
(
100
);
private
Site
site
=
Site
.
me
().
setRetryTimes
(
3
).
setSleepTime
(
100
);
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
...
@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
...
@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
GithubRepoPageProcesser
()).
thread
(
5
).
run
();
Spider
.
create
(
new
GithubRepoPageProcesser
()).
addUrl
(
"https://github.com/code4craft"
).
thread
(
5
).
run
();
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java
View file @
8f774afc
...
@@ -12,7 +12,7 @@ import java.util.List;
...
@@ -12,7 +12,7 @@ import java.util.List;
*/
*/
public
class
OschinaBlogPageProcesser
implements
PageProcessor
{
public
class
OschinaBlogPageProcesser
implements
PageProcessor
{
private
Site
site
=
Site
.
me
().
setDomain
(
"my.oschina.net"
)
.
addStartUrl
(
"http://my.oschina.net/flashsword/blog"
)
;
private
Site
site
=
Site
.
me
().
setDomain
(
"my.oschina.net"
);
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
...
@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
thread
(
2
).
run
();
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
addUrl
(
"http://my.oschina.net/flashsword/blog"
).
thread
(
2
).
run
();
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
View file @
8f774afc
...
@@ -7,6 +7,7 @@ import java.net.MalformedURLException;
...
@@ -7,6 +7,7 @@ import java.net.MalformedURLException;
import
java.net.URL
;
import
java.net.URL
;
import
java.nio.charset.Charset
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.List
;
import
java.util.List
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
...
@@ -88,7 +89,7 @@ public class UrlUtils {
...
@@ -88,7 +89,7 @@ public class UrlUtils {
return
stringBuilder
.
toString
();
return
stringBuilder
.
toString
();
}
}
public
static
List
<
Request
>
convertToRequests
(
List
<
String
>
urls
)
{
public
static
List
<
Request
>
convertToRequests
(
Collection
<
String
>
urls
)
{
List
<
Request
>
requestList
=
new
ArrayList
<
Request
>(
urls
.
size
());
List
<
Request
>
requestList
=
new
ArrayList
<
Request
>(
urls
.
size
());
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
requestList
.
add
(
new
Request
(
url
));
requestList
.
add
(
new
Request
(
url
));
...
@@ -96,7 +97,7 @@ public class UrlUtils {
...
@@ -96,7 +97,7 @@ public class UrlUtils {
return
requestList
;
return
requestList
;
}
}
public
static
List
<
String
>
convertToUrls
(
List
<
Request
>
requests
)
{
public
static
List
<
String
>
convertToUrls
(
Collection
<
Request
>
requests
)
{
List
<
String
>
urlList
=
new
ArrayList
<
String
>(
requests
.
size
());
List
<
String
>
urlList
=
new
ArrayList
<
String
>(
requests
.
size
());
for
(
Request
request
:
requests
)
{
for
(
Request
request
:
requests
)
{
urlList
.
add
(
request
.
getUrl
());
urlList
.
add
(
request
.
getUrl
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
View file @
8f774afc
...
@@ -11,7 +11,7 @@ import java.util.ArrayList;
...
@@ -11,7 +11,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
/**
/**
* @since 0.
3.3
* @since 0.
4.0
* NO implement yet!!!!!!!!!!!!
* NO implement yet!!!!!!!!!!!!
* @author code4crafter@gmail.com
* @author code4crafter@gmail.com
*/
*/
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment