Commit 2183ba9b authored by yihua.huang

#571 add getBytes to Page

parent c3bdb204
...@@ -46,6 +46,8 @@ public class Page { ...@@ -46,6 +46,8 @@ public class Page {
private boolean downloadSuccess = true; private boolean downloadSuccess = true;
private byte[] bytes;
private List<Request> targetRequests = new ArrayList<Request>(); private List<Request> targetRequests = new ArrayList<Request>();
public Page() { public Page() {
...@@ -228,6 +230,14 @@ public class Page { ...@@ -228,6 +230,14 @@ public class Page {
this.downloadSuccess = downloadSuccess; this.downloadSuccess = downloadSuccess;
} }
/**
 * Returns the byte array most recently stored through {@link #setBytes(byte[])}.
 * The stored array itself is returned (no copy is made), so changes made to it
 * by the caller are visible through this page. May be {@code null} when nothing
 * has been stored (the backing field is declared without an initializer).
 *
 * @return the stored bytes, possibly {@code null}
 */
public byte[] getBytes() {
return bytes;
}
/**
 * Stores the given byte array on this page. The array is kept by reference;
 * no defensive copy is made.
 *
 * @param bytes the bytes to store
 */
public void setBytes(byte[] bytes) {
this.bytes = bytes;
}
@Override @Override
public String toString() { public String toString() {
return "Page{" + return "Page{" +
......
...@@ -45,6 +45,12 @@ public class Request implements Serializable { ...@@ -45,6 +45,12 @@ public class Request implements Serializable {
*/ */
private long priority; private long priority;
/**
* When it is set to TRUE, the downloader will not try to parse response body to text.
*
*/
private boolean binarayContent = false;
public Request() { public Request() {
} }
...@@ -162,6 +168,14 @@ public class Request implements Serializable { ...@@ -162,6 +168,14 @@ public class Request implements Serializable {
this.requestBody = requestBody; this.requestBody = requestBody;
} }
/**
 * Returns the binary-content flag of this request. When the flag is
 * {@code true}, the downloader does not try to parse the response body to text.
 *
 * @return {@code true} if the response should be treated as binary content
 */
public boolean isBinarayContent() {
return binarayContent;
}
/**
 * Sets the binary-content flag of this request. Set it to {@code true} to tell
 * the downloader not to try to parse the response body to text.
 *
 * @param binarayContent {@code true} to treat the response as binary content
 */
public void setBinarayContent(boolean binarayContent) {
this.binarayContent = binarayContent;
}
@Override @Override
public String toString() { public String toString() {
return "Request{" + return "Request{" +
......
...@@ -108,9 +108,13 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -108,9 +108,13 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getResponseContent(charset, httpResponse); byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page(); Page page = new Page();
page.setRawText(content); page.setBytes(bytes);
if (!request.isBinarayContent()){
page.setRawText(getResponseContent(charset, contentType, bytes));
}
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
...@@ -121,22 +125,21 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -121,22 +125,21 @@ public class HttpClientDownloader extends AbstractDownloader {
return page; return page;
} }
private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException { private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException {
if (charset == null) { if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(contentType, bytes);
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
if (htmlCharset != null) { if (htmlCharset != null) {
return new String(contentBytes, htmlCharset); return new String(bytes, htmlCharset);
} else { } else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
return new String(contentBytes); return new String(bytes);
} }
} else { } else {
return IOUtils.toString(httpResponse.getEntity().getContent(), charset); return new String(bytes, charset);
} }
} }
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(), contentBytes); return CharsetUtils.detectCharset(contentType, contentBytes);
} }
} }
...@@ -271,4 +271,22 @@ public class HttpClientDownloaderTest { ...@@ -271,4 +271,22 @@ public class HttpClientDownloaderTest {
}); });
} }
/**
 * Downloads from a local stub server that answers with the body "binary",
 * using a request flagged as binary content, and checks that the resulting
 * page carries the body as bytes while its raw text is left unset.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.response("binary");
    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            final HttpClientDownloader downloader = new HttpClientDownloader();
            Request binaryRequest = new Request();
            // Flag the request so the downloader skips the body-to-text step.
            binaryRequest.setBinarayContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");
            Page downloadedPage = downloader.download(binaryRequest, Site.me().toTask());
            assertThat(downloadedPage.getRawText()).isNull();
            assertThat(downloadedPage.getBytes()).isEqualTo("binary".getBytes());
        }
    });
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment