Commit 32f1f2cf authored by yihua.huang

#613 add charset to page

parent 65049bac
......@@ -113,7 +113,11 @@ public class HttpClientDownloader extends AbstractDownloader {
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()){
page.setRawText(getResponseContent(charset, contentType, bytes));
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
......@@ -125,21 +129,12 @@ public class HttpClientDownloader extends AbstractDownloader {
return page;
}
private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException {
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
String htmlCharset = getHtmlCharset(contentType, bytes);
if (htmlCharset != null) {
return new String(bytes, htmlCharset);
} else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
return new String(bytes);
}
} else {
return new String(bytes, charset);
charset = Charset.defaultCharset().name();
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
}
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(contentType, contentBytes);
return charset;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment