Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
21982d34
Commit
21982d34
authored
May 14, 2014
by
yihua.huang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove cpdetector temporary #126
parent
fcbfb756
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
23 additions
and
45 deletions
+23
-45
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+11
-24
antlr-2.7.4.jar
webmagic-core/src/main/lib/antlr-2.7.4.jar
+0
-0
cpdetector_1.0.10.jar
webmagic-core/src/main/lib/cpdetector_1.0.10.jar
+0
-0
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+12
-21
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
21982d34
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
com.google.common.collect.Sets
;
import
com.google.common.collect.Sets
;
import
info.monitorenter.cpdetector.io.CodepageDetectorProxy
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang.StringUtils
;
import
org.apache.commons.lang.StringUtils
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpResponse
;
...
@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page;
...
@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.net.URL
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Set
;
import
java.util.Set
;
...
@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader {
...
@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
}
}
private
static
CodepageDetectorProxy
detector
=
CodepageDetectorProxy
.
getInstance
();
protected
String
getHtmlCharset
(
CloseableHttpResponse
httpResponse
)
throws
IOException
{
protected
String
getHtmlCharset
(
CloseableHttpResponse
httpResponse
)
throws
IOException
{
// 1、
head头部包含编码集
// 1、
encoding in http header Content-Type
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
String
charset
=
UrlUtils
.
getCharset
(
value
);
String
charset
=
UrlUtils
.
getCharset
(
value
);
if
(
StringUtils
.
isEmpty
(
charset
))
{
if
(
StringUtils
.
isEmpty
(
charset
))
{
// 2、
meta元素中包含编码集
// 2、
charset in meta
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
());
String
content
=
IOUtils
.
toString
(
httpResponse
.
getEntity
().
getContent
());
if
(
StringUtils
.
isNotEmpty
(
content
))
{
if
(
StringUtils
.
isNotEmpty
(
content
))
{
Document
document
=
Jsoup
.
parse
(
content
);
Document
document
=
Jsoup
.
parse
(
content
);
Elements
links
=
document
.
select
(
"meta"
);
Elements
links
=
document
.
select
(
"meta"
);
for
(
Element
link
:
links
)
{
for
(
Element
link
:
links
)
{
// 2.1、
处理场景:
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
// 2.1、 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String
metaContent
=
link
.
attr
(
"content"
);
String
metaContent
=
link
.
attr
(
"content"
);
String
metaCharset
=
link
.
attr
(
"charset"
);
String
metaCharset
=
link
.
attr
(
"charset"
);
if
(
metaContent
.
indexOf
(
"charset"
)
!=
-
1
)
{
if
(
metaContent
.
indexOf
(
"charset"
)
!=
-
1
)
{
metaContent
=
metaContent
.
substring
(
metaContent
.
indexOf
(
"charset"
),
metaContent
.
length
());
metaContent
=
metaContent
.
substring
(
metaContent
.
indexOf
(
"charset"
),
metaContent
.
length
());
charset
=
metaContent
.
split
(
"="
)[
1
];
charset
=
metaContent
.
split
(
"="
)[
1
];
break
;
break
;
}
}
// 2.2、
处理场景:
<meta charset="UTF-8" />
// 2.2、 <meta charset="UTF-8" />
else
if
(
StringUtils
.
isNotEmpty
(
metaCharset
))
{
else
if
(
StringUtils
.
isNotEmpty
(
metaCharset
))
{
charset
=
metaCharset
;
charset
=
metaCharset
;
break
;
break
;
}
}
}
}
// 3、todo use tools as cpdetector for content decode
// 3、以上两种都不包含的场景
if
(
StringUtils
.
isEmpty
(
charset
))
{
java
.
nio
.
charset
.
Charset
nioCharset
=
null
;
try
{
nioCharset
=
detector
.
detectCodepage
(
httpResponse
.
getEntity
().
getContent
(),
content
.
length
());
charset
=
nioCharset
.
name
();
}
catch
(
IOException
e
)
{
// ignore
}
}
}
}
}
}
return
charset
;
return
charset
;
...
...
webmagic-core/src/main/lib/antlr-2.7.4.jar
deleted
100644 → 0
View file @
fcbfb756
File deleted
webmagic-core/src/main/lib/cpdetector_1.0.10.jar
deleted
100644 → 0
View file @
fcbfb756
File deleted
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
View file @
21982d34
package
us
.
codecraft
.
webmagic
.
downloader
;
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.junit.Ignore
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
...
@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site;
...
@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Html
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
...
@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest {
...
@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest {
}
}
@Test
@Test
public
void
testGetHtmlCharset
()
{
public
void
testGetHtmlCharset
()
throws
IOException
{
HttpClientDownloader
downloader
=
new
HttpClientDownloader
();
HttpClientDownloader
downloader
=
new
HttpClientDownloader
();
Site
site
=
Site
.
me
();
Site
site
=
Site
.
me
();
CloseableHttpClient
httpClient
=
new
HttpClientGenerator
().
getClient
(
site
);
CloseableHttpClient
httpClient
=
new
HttpClientGenerator
().
getClient
(
site
);
try
{
// encoding in http header Content-Type
// 头部包含编码
Request
requestGBK
=
new
Request
(
"http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"
);
Request
requestGBK
=
new
Request
(
"http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"
);
CloseableHttpResponse
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestGBK
,
site
,
null
));
CloseableHttpResponse
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestGBK
,
site
,
null
));
String
charset
=
downloader
.
getHtmlCharset
(
httpResponse
);
String
charset
=
downloader
.
getHtmlCharset
(
httpResponse
);
assertEquals
(
charset
,
"GBK"
);
assertEquals
(
charset
,
"GBK"
);
// meta包含编码
// encoding in meta
Request
requestUTF_8
=
new
Request
(
"http://preshing.com/"
);
Request
requestUTF_8
=
new
Request
(
"http://preshing.com/"
);
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestUTF_8
,
site
,
null
));
httpResponse
=
httpClient
.
execute
(
downloader
.
getHttpUriRequest
(
requestUTF_8
,
site
,
null
));
charset
=
downloader
.
getHtmlCharset
(
httpResponse
);
charset
=
downloader
.
getHtmlCharset
(
httpResponse
);
assertEquals
(
charset
,
"utf-8"
);
assertEquals
(
charset
,
"utf-8"
);
// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
// charset = downloader.getHtmlCharset(httpResponse);
// assertEquals(charset, "GBK");
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment