Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
fcbfb756
Commit
fcbfb756
authored
May 14, 2014
by
fengwuze
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改自动从网页中获取字符的代码块,抽取出来成为单独的方法。
parent
95494d3c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
44 additions
and
5 deletions
+44
-5
HttpClientDownloader.java
...s/codecraft/webmagic/downloader/HttpClientDownloader.java
+14
-5
antlr-2.7.4.jar
webmagic-core/src/main/lib/antlr-2.7.4.jar
+0
-0
cpdetector_1.0.10.jar
webmagic-core/src/main/lib/cpdetector_1.0.10.jar
+0
-0
HttpClientDownloaderTest.java
...decraft/webmagic/downloader/HttpClientDownloaderTest.java
+30
-0
No files found.
webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
View file @
fcbfb756
package
us
.
codecraft
.
webmagic
.
downloader
;
import
com.google.common.collect.Sets
;
import
info.monitorenter.cpdetector.io.CodepageDetectorProxy
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang.StringUtils
;
import
org.apache.http.HttpResponse
;
...
...
@@ -28,6 +29,7 @@ import us.codecraft.webmagic.selector.PlainText;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
java.io.IOException
;
import
java.net.URL
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Set
;
...
...
@@ -92,6 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader {
//charset
if
(
charset
==
null
)
{
charset
=
getHtmlCharset
(
httpResponse
);
logger
.
debug
(
"Auto get charset: "
+
charset
);
}
Page
page
=
handleResponse
(
request
,
charset
,
httpResponse
,
task
);
onSuccess
(
request
);
...
...
@@ -119,6 +122,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
// Code-page detector proxy obtained from cpdetector's CodepageDetectorProxy.getInstance().
// getHtmlCharset() calls detector.detectCodepage(...) as its last-resort step, when no
// charset has been found in either the Content-Type header or the page's <meta> tags.
// NOTE(review): no delegate detectors are registered on this proxy in the code visible
// here - confirm they are added elsewhere (see the cpdetector usage notes), otherwise
// detection may not yield a charset.
private
static
CodepageDetectorProxy
detector
=
CodepageDetectorProxy
.
getInstance
();
protected
String
getHtmlCharset
(
CloseableHttpResponse
httpResponse
)
throws
IOException
{
// 1、head头部包含编码集
String
value
=
httpResponse
.
getEntity
().
getContentType
().
getValue
();
...
...
@@ -133,23 +137,28 @@ public class HttpClientDownloader extends AbstractDownloader {
for
(
Element
link
:
links
)
{
// 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String
metaContent
=
link
.
attr
(
"content"
);
String
metaCharset
=
link
.
attr
(
"charset"
);
if
(
metaContent
.
indexOf
(
"charset"
)
!=
-
1
)
{
metaContent
=
metaContent
.
substring
(
metaContent
.
indexOf
(
"charset"
),
metaContent
.
length
());
charset
=
metaContent
.
split
(
"="
)[
1
];
break
;
}
// 2.2、处理场景: <meta charset="UTF-8" />
String
metaCharset
=
link
.
attr
(
"charset"
);
if
(
StringUtils
.
isNotEmpty
(
metaCharset
))
{
charset
=
metaCharset
.
split
(
"="
)[
1
];
else
if
(
StringUtils
.
isNotEmpty
(
metaCharset
))
{
charset
=
metaCharset
;
break
;
}
}
// 3、以上两种都不包含的场景
if
(
StringUtils
.
isEmpty
(
charset
))
{
// TODO http://cpdetector.sourceforge.net/usage.shtml
java
.
nio
.
charset
.
Charset
nioCharset
=
null
;
try
{
nioCharset
=
detector
.
detectCodepage
(
httpResponse
.
getEntity
().
getContent
(),
content
.
length
());
charset
=
nioCharset
.
name
();
}
catch
(
IOException
e
)
{
// ignore
}
}
}
}
...
...
webmagic-core/src/main/lib/antlr-2.7.4.jar
0 → 100644
View file @
fcbfb756
File added
webmagic-core/src/main/lib/cpdetector_1.0.10.jar
0 → 100644
View file @
fcbfb756
File added
webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
View file @
fcbfb756
package
us
.
codecraft
.
webmagic
.
downloader
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Page
;
...
...
@@ -11,6 +14,7 @@ import us.codecraft.webmagic.selector.Html;
import
java.io.UnsupportedEncodingException
;
import
static
org
.
assertj
.
core
.
api
.
Assertions
.
assertThat
;
import
static
org
.
junit
.
Assert
.
assertEquals
;
import
static
org
.
junit
.
Assert
.
assertTrue
;
/**
...
...
@@ -52,4 +56,30 @@ public class HttpClientDownloaderTest {
assertThat
((
Integer
)
page
.
getTargetRequests
().
get
(
0
).
getExtra
(
Request
.
CYCLE_TRIED_TIMES
)).
isEqualTo
(
2
);
}
/**
 * Verifies the charset reported by {@code HttpClientDownloader.getHtmlCharset} for two
 * live pages: one expected to yield {@code GBK} and one expected to yield {@code utf-8}.
 *
 * <p>The test issues real HTTP requests. Any failure while executing a request or reading
 * a response is propagated so the test fails instead of passing without checking anything.
 *
 * @throws Exception if a request cannot be executed or a response cannot be read or closed
 */
@Test
public void testGetHtmlCharset() throws Exception {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Site site = Site.me();
    CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
    try {
        // Case 1: the response header carries the charset.
        Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
        CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
        try {
            // JUnit's contract is assertEquals(expected, actual).
            assertEquals("GBK", downloader.getHtmlCharset(httpResponse));
        } finally {
            // Always release the connection, even when the assertion fails.
            httpResponse.close();
        }

        // Case 2: the charset is carried by a <meta> tag in the page.
        Request requestUTF_8 = new Request("http://preshing.com/");
        httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
        try {
            assertEquals("utf-8", downloader.getHtmlCharset(httpResponse));
        } finally {
            httpResponse.close();
        }

        // TODO: add a case for a page that declares no charset in either the header or a
        // <meta> tag, so that the detector-based fallback is exercised as well.
    } finally {
        // The client was created for this test only, so it is closed here.
        httpClient.close();
    }
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment