Commit 21982d34 authored by yihua.huang

remove cpdetector temporary #126

parent fcbfb756
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
...@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page; ...@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException; import java.io.IOException;
import java.net.URL;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
...@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
} }
private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException { protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
// 1、head头部包含编码集 // 1、encoding in http header Content-Type
String value = httpResponse.getEntity().getContentType().getValue(); String value = httpResponse.getEntity().getContentType().getValue();
String charset = UrlUtils.getCharset(value); String charset = UrlUtils.getCharset(value);
if(StringUtils.isEmpty(charset)) { if (StringUtils.isEmpty(charset)) {
// 2、meta元素中包含编码集 // 2、charset in meta
String content = IOUtils.toString(httpResponse.getEntity().getContent()); String content = IOUtils.toString(httpResponse.getEntity().getContent());
if(StringUtils.isNotEmpty(content)) { if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content); Document document = Jsoup.parse(content);
Elements links = document.select("meta"); Elements links = document.select("meta");
for(Element link : links) { for (Element link : links) {
// 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> // 2.1、 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content"); String metaContent = link.attr("content");
String metaCharset = link.attr("charset"); String metaCharset = link.attr("charset");
if(metaContent.indexOf("charset") != -1) { if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1]; charset = metaContent.split("=")[1];
break; break;
} }
// 2.2、处理场景: <meta charset="UTF-8" /> // 2.2、 <meta charset="UTF-8" />
else if(StringUtils.isNotEmpty(metaCharset)) { else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset; charset = metaCharset;
break; break;
} }
} }
// 3、todo use tools as cpdetector for content decode
// 3、以上两种都不包含的场景
if(StringUtils.isEmpty(charset)) {
java.nio.charset.Charset nioCharset = null;
try {
nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length());
charset = nioCharset.name();
} catch (IOException e) {
// ignore
}
}
} }
} }
return charset; return charset;
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
...@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site; ...@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
...@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest { ...@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest {
} }
// Side-by-side diff rendering of testGetHtmlCharset: the removed version is on the left of
// each line and the added version on the right, so a single line may mix both versions.
// The test executes two HTTP requests through a client built by HttpClientGenerator and checks
// the value returned by downloader.getHtmlCharset(...): "GBK" for the sports.163.com page and
// "utf-8" for http://preshing.com/.
// The removed version wrapped the body in try/catch (Exception) and only printed the stack
// trace; the added version declares `throws IOException` on the method instead.
// NOTE(review): both URLs are live external sites, so the result depends on network access and
// on what those servers return at run time.
// NOTE(review): the calls pass (charset, "GBK") / (charset, "utf-8"); if assertEquals is JUnit's
// (expected, actual) overload the arguments are in reversed order — its import is not visible
// here, confirm.
// NOTE(review): no close() call on httpResponse is visible in either version.
@Test @Test
public void testGetHtmlCharset() { public void testGetHtmlCharset() throws IOException {
HttpClientDownloader downloader = new HttpClientDownloader(); HttpClientDownloader downloader = new HttpClientDownloader();
Site site = Site.me(); Site site = Site.me();
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
try { // encoding in http header Content-Type
// encoding in the header Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); String charset = downloader.getHtmlCharset(httpResponse);
String charset = downloader.getHtmlCharset(httpResponse); assertEquals(charset, "GBK");
assertEquals(charset, "GBK");
// encoding in the meta element // encoding in meta
Request requestUTF_8 = new Request("http://preshing.com/"); Request requestUTF_8 = new Request("http://preshing.com/");
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null)); httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
charset = downloader.getHtmlCharset(httpResponse); charset = downloader.getHtmlCharset(httpResponse);
assertEquals(charset, "utf-8"); assertEquals(charset, "utf-8");
// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
// charset = downloader.getHtmlCharset(httpResponse);
// assertEquals(charset, "GBK");
} catch (Exception e) {
e.printStackTrace();
}
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment