Commit 5a6a68a3 authored by yihua.huang

add gzip support

parent adeed3bc
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
......@@ -26,15 +29,19 @@ public class HttpClientDownloader implements Downloader {
public Page download(Request request, Site site) {
logger.info("downloading page " + request.getUrl());
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
String encoding = site.getEncoding();
try {
HttpGet httpGet = new HttpGet(request.getUrl());
HttpResponse httpResponse = httpClient.execute(httpGet);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (site.getAcceptStatCode().contains(statusCode)) {
if (site.getEncoding() == null){
//charset
if (encoding == null){
String value = httpResponse.getEntity().getContentType().getValue();
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
}
//
handleGzip(httpResponse);
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
site.getEncoding());
Page page = new Page();
......@@ -50,4 +57,17 @@ public class HttpClientDownloader implements Downloader {
}
return null;
}
/**
 * Replaces the response entity with a {@link GzipDecompressingEntity} when the
 * entity's Content-Encoding header lists a gzip coding, so that reading the
 * entity content afterwards yields the decompressed bytes.
 *
 * <p>The response is left untouched when it carries no entity, when the entity
 * has no Content-Encoding header, or when no listed coding is gzip.
 *
 * @param httpResponse the response whose entity may be wrapped in place
 */
private void handleGzip(HttpResponse httpResponse) {
    // A response may legitimately carry no entity; there is nothing to decompress then.
    if (httpResponse.getEntity() == null) {
        return;
    }
    Header contentEncoding = httpResponse.getEntity().getContentEncoding();
    if (contentEncoding == null) {
        return;
    }
    for (HeaderElement codec : contentEncoding.getElements()) {
        String codecName = codec.getName();
        // "x-gzip" is the legacy alias of "gzip" and denotes the same coding.
        if ("gzip".equalsIgnoreCase(codecName) || "x-gzip".equalsIgnoreCase(codecName)) {
            httpResponse.setEntity(new GzipDecompressingEntity(httpResponse.getEntity()));
            // Wrap at most once: a second wrapper would try to gunzip already-decompressed data.
            return;
        }
    }
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment