Remove cpdetector temporarily #126

21982d34 · yihua.huang · fcbfb756 · 21982d34 · fcbfb756 · fcbfb756
Commit 21982d34 authored May 14, 2014 by yihua.huang
4 changed files
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
 package us.codecraft.webmagic.downloader;

 import com.google.common.collect.Sets;
-import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.http.HttpResponse;
@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.utils.HttpConstant;
 import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
 import us.codecraft.webmagic.utils.UrlUtils;

 import java.io.IOException;
-import java.net.URL;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader {
        }
    }

-    private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
    protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
-        // 1、head头部包含编码集
+        // 1、encoding in http header Content-Type
        String value = httpResponse.getEntity().getContentType().getValue();
        String charset = UrlUtils.getCharset(value);

-        if(StringUtils.isEmpty(charset)) {
-            // 2、meta元素中包含编码集
+        if (StringUtils.isEmpty(charset)) {
+            // 2、charset in meta
            String content = IOUtils.toString(httpResponse.getEntity().getContent());
-            if(StringUtils.isNotEmpty(content)) {
+            if (StringUtils.isNotEmpty(content)) {
                Document document = Jsoup.parse(content);
                Elements links = document.select("meta");
-                for(Element link : links) {
-                    // 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+                for (Element link : links) {
+                    // 2.1、 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                    String metaContent = link.attr("content");
                    String metaCharset = link.attr("charset");
-                    if(metaContent.indexOf("charset") != -1) {
+                    if (metaContent.indexOf("charset") != -1) {
                        metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
                        charset = metaContent.split("=")[1];
                        break;
                    }
-                    // 2.2、处理场景: <meta charset="UTF-8" />
-                    else if(StringUtils.isNotEmpty(metaCharset)) {
+                    // 2.2、 <meta charset="UTF-8" />
+                    else if (StringUtils.isNotEmpty(metaCharset)) {
                        charset = metaCharset;
                        break;
                    }
                }
-
-                // 3、以上两种都不包含的场景
-                if(StringUtils.isEmpty(charset)) {
-                    java.nio.charset.Charset nioCharset = null;
-                    try {
-                        nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length());
-                        charset = nioCharset.name();
-                    } catch (IOException e) {
-                        // ignore
-                    }
-                }
+                // 3、TODO: use a tool such as cpdetector to detect the charset when neither the header nor a meta tag declares one
            }
        }
        return charset;

--- a/webmagic-core/src/main/lib/antlr-2.7.4.jar
+++ b/webmagic-core/src/main/lib/antlr-2.7.4.jar
--- a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar
+++ b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
 package us.codecraft.webmagic.downloader;

 import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Task;
 import us.codecraft.webmagic.selector.Html;

+import java.io.IOException;
 import java.io.UnsupportedEncodingException;

 import static org.assertj.core.api.Assertions.assertThat;
@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest {
    }

    @Test
-    public void testGetHtmlCharset() {
+    public void testGetHtmlCharset() throws IOException {
        HttpClientDownloader downloader = new HttpClientDownloader();
        Site site = Site.me();
        CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
-        try {
-            // 头部包含编码
+        // encoding in http header Content-Type
        Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
        CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
        String charset = downloader.getHtmlCharset(httpResponse);
        assertEquals(charset, "GBK");

-            // meta包含编码
+        // encoding in meta
        Request requestUTF_8 = new Request("http://preshing.com/");
        httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
        charset = downloader.getHtmlCharset(httpResponse);
        assertEquals(charset, "utf-8");
-
-//            Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
-//            httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
-//            charset = downloader.getHtmlCharset(httpResponse);
-//            assertEquals(charset, "GBK");
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
    }
 }