Commit 2183ba9b authored by yihua.huang's avatar yihua.huang

#571 add getBytes to Page

parent c3bdb204
......@@ -46,6 +46,8 @@ public class Page {
private boolean downloadSuccess = true;
private byte[] bytes;
private List<Request> targetRequests = new ArrayList<Request>();
public Page() {
......@@ -228,6 +230,14 @@ public class Page {
this.downloadSuccess = downloadSuccess;
}
/**
 * Returns the raw byte content held by this page.
 *
 * <p>The stored array reference itself is handed back; no copy is made.
 *
 * @return the byte array most recently stored via {@link #setBytes(byte[])};
 *         may be {@code null} when nothing has been stored
 */
public byte[] getBytes() {
    return this.bytes;
}
/**
 * Stores the raw byte content for this page.
 *
 * <p>The given array reference is kept as-is; no copy is made.
 *
 * @param bytes the byte array to store on this page
 */
public void setBytes(byte[] bytes) {
this.bytes = bytes;
}
@Override
public String toString() {
return "Page{" +
......
......@@ -45,6 +45,12 @@ public class Request implements Serializable {
*/
private long priority;
/**
* When set to {@code true}, the downloader does not decode the response body into text; only the raw bytes are stored on the page.
*
*/
private boolean binarayContent = false;
public Request() {
}
......@@ -162,6 +168,14 @@ public class Request implements Serializable {
this.requestBody = requestBody;
}
/**
 * Tells whether this request is flagged as binary content.
 *
 * <p>When the flag is {@code true}, the downloader does not try to parse the
 * response body to text.
 *
 * @return the current value of the binary-content flag
 */
public boolean isBinarayContent() {
    return this.binarayContent;
}
/**
 * Sets the binary-content flag for this request.
 *
 * <p>When set to {@code true}, the downloader does not try to parse the
 * response body to text.
 *
 * @param binarayContent {@code true} to mark the expected response as binary
 */
public void setBinarayContent(boolean binarayContent) {
this.binarayContent = binarayContent;
}
@Override
public String toString() {
return "Request{" +
......
......@@ -108,9 +108,13 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getResponseContent(charset, httpResponse);
byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setRawText(content);
page.setBytes(bytes);
if (!request.isBinarayContent()){
page.setRawText(getResponseContent(charset, contentType, bytes));
}
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
......@@ -121,22 +125,21 @@ public class HttpClientDownloader extends AbstractDownloader {
return page;
}
private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException {
private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
String htmlCharset = getHtmlCharset(contentType, bytes);
if (htmlCharset != null) {
return new String(contentBytes, htmlCharset);
return new String(bytes, htmlCharset);
} else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
return new String(contentBytes);
return new String(bytes);
}
} else {
return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
return new String(bytes, charset);
}
}
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(), contentBytes);
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(contentType, contentBytes);
}
}
......@@ -271,4 +271,22 @@ public class HttpClientDownloaderTest {
});
}
/**
 * Verifies that a request flagged as binary content yields a page whose raw
 * text is left unset while the response body is available as bytes.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer server = httpServer(13423);
    server.response("binary");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            // Build the request first; the downloader is independent of it.
            Request binaryRequest = new Request();
            binaryRequest.setBinarayContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            final HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            // No text decoding is expected for a binary request.
            assertThat(downloaded.getRawText()).isNull();
            // The body bytes must match what the stub server sent.
            assertThat(downloaded.getBytes()).isEqualTo("binary".getBytes());
        }
    });
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment