Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
2183ba9b
Commit
2183ba9b
authored
Jul 22, 2017
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#571 add getBytes to Page
parent
c3bdb204
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
10 deletions
+55
-10
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+10
-0
Request.java
...gic-core/src/main/java/us/codecraft/webmagic/Request.java
+14
-0
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+13
-10
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+18
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
2183ba9b
...
...
@@ -46,6 +46,8 @@ public class Page {
private
boolean
downloadSuccess
=
true
;
private
byte
[]
bytes
;
private
List
<
Request
>
targetRequests
=
new
ArrayList
<
Request
>();
public
Page
()
{
...
...
@@ -228,6 +230,14 @@ public class Page {
this
.
downloadSuccess
=
downloadSuccess
;
}
/**
 * Returns the byte array currently stored on this page.
 *
 * <p>The internal array reference is handed back directly; no copy is made.
 *
 * @return the stored bytes, or {@code null} if none have been set
 */
public byte[] getBytes() {
    return this.bytes;
}
/**
 * Stores the given byte array on this page.
 *
 * <p>The reference is kept as-is; the array is not copied.
 *
 * @param bytes the bytes to associate with this page
 */
public void setBytes(byte[] bytes) {
    this.bytes = bytes;
}
@Override
public
String
toString
()
{
return
"Page{"
+
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
View file @
2183ba9b
...
...
@@ -45,6 +45,12 @@ public class Request implements Serializable {
*/
private
long
priority
;
/**
* When set to {@code true}, the downloader does not try to decode the response body into text.
*
*/
private
boolean
binarayContent
=
false
;
/**
 * Creates an empty request. The constructor body sets nothing, so every
 * field keeps its declared initial value.
 */
public
Request
()
{
}
...
...
@@ -162,6 +168,14 @@ public class Request implements Serializable {
this
.
requestBody
=
requestBody
;
}
/**
 * Reports whether this request is flagged as carrying binary content.
 *
 * <p>The spelling "Binaray" is part of the existing public method name.
 *
 * @return the current value of the binary-content flag
 */
public boolean isBinarayContent() {
    return this.binarayContent;
}
/**
 * Sets the binary-content flag of this request.
 *
 * @param binarayContent {@code true} to mark the response as binary content,
 *                       {@code false} otherwise
 */
public void setBinarayContent(boolean binarayContent) {
    this.binarayContent = binarayContent;
}
@Override
public
String
toString
()
{
return
"Request{"
+
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
2183ba9b
...
...
@@ -108,9 +108,13 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected
Page
handleResponse
(
Request
request
,
String
charset
,
HttpResponse
httpResponse
,
Task
task
)
throws
IOException
{
String
content
=
getResponseContent
(
charset
,
httpResponse
);
byte
[]
bytes
=
IOUtils
.
toByteArray
(
httpResponse
.
getEntity
().
getContent
());
String
contentType
=
httpResponse
.
getEntity
().
getContentType
()
==
null
?
""
:
httpResponse
.
getEntity
().
getContentType
().
getValue
();
Page
page
=
new
Page
();
page
.
setRawText
(
content
);
page
.
setBytes
(
bytes
);
if
(!
request
.
isBinarayContent
()){
page
.
setRawText
(
getResponseContent
(
charset
,
contentType
,
bytes
));
}
page
.
setUrl
(
new
PlainText
(
request
.
getUrl
()));
page
.
setRequest
(
request
);
page
.
setStatusCode
(
httpResponse
.
getStatusLine
().
getStatusCode
());
...
...
@@ -121,22 +125,21 @@ public class HttpClientDownloader extends AbstractDownloader {
return
page
;
}
private
String
getResponseContent
(
String
charset
,
HttpResponse
httpResponse
)
throws
IOException
{
private
String
getResponseContent
(
String
charset
,
String
contentType
,
byte
[]
bytes
)
throws
IOException
{
if
(
charset
==
null
)
{
byte
[]
contentBytes
=
IOUtils
.
toByteArray
(
httpResponse
.
getEntity
().
getContent
());
String
htmlCharset
=
getHtmlCharset
(
httpResponse
,
contentBytes
);
String
htmlCharset
=
getHtmlCharset
(
contentType
,
bytes
);
if
(
htmlCharset
!=
null
)
{
return
new
String
(
contentBytes
,
htmlCharset
);
return
new
String
(
bytes
,
htmlCharset
);
}
else
{
logger
.
warn
(
"Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()"
,
Charset
.
defaultCharset
());
return
new
String
(
contentBytes
);
return
new
String
(
bytes
);
}
}
else
{
return
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
()
,
charset
);
return
new
String
(
bytes
,
charset
);
}
}
private
String
getHtmlCharset
(
HttpResponse
httpResponse
,
byte
[]
contentBytes
)
throws
IOException
{
return
CharsetUtils
.
detectCharset
(
httpResponse
.
getEntity
().
getContentType
()
==
null
?
""
:
httpResponse
.
getEntity
().
getContentType
().
getValue
()
,
contentBytes
);
private
String
getHtmlCharset
(
String
contentType
,
byte
[]
contentBytes
)
throws
IOException
{
return
CharsetUtils
.
detectCharset
(
contentType
,
contentBytes
);
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
View file @
2183ba9b
...
...
@@ -271,4 +271,22 @@ public class HttpClientDownloaderTest {
});
}
/**
 * Downloads from a local stub server with the request flagged as binary
 * content, then checks that the page carries no raw text and that its bytes
 * equal the served body.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer binaryServer = httpServer(13423);
    binaryServer.response("binary");
    Runner.running(binaryServer, new Runnable() {
        @Override
        public void run() throws Exception {
            final HttpClientDownloader downloader = new HttpClientDownloader();

            // Build the request: flag first, then the URL.
            Request binaryRequest = new Request();
            binaryRequest.setBinarayContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            // A binary request must leave the text form unset...
            assertThat(downloaded.getRawText()).isNull();
            // ...while the body stays available as bytes.
            byte[] expectedBody = "binary".getBytes();
            assertThat(downloaded.getBytes()).isEqualTo(expectedBody);
        }
    });
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment