Commit 692de76f authored by yihua.huang

fix issue #21 charset detect error

parent e1b6b540
...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils; ...@@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.nio.charset.Charset;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -98,15 +99,17 @@ public class UrlUtils { ...@@ -98,15 +99,17 @@ public class UrlUtils {
return stringBuilder.toString(); return stringBuilder.toString();
} }
private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)"); private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
public static String getCharset(String contentType) { public static String getCharset(String contentType) {
Matcher matcher = patternForCharset.matcher(contentType); Matcher matcher = patternForCharset.matcher(contentType);
if (matcher.find()) { if (matcher.find()) {
return matcher.group(1); String charset = matcher.group(1);
} else { if (Charset.isSupported(charset)) {
return null; return charset;
}
} }
return null;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment