Commit 807aefe9 authored by yihua.huang

Change EntityUtils to IOUtils because of an encoding error

parent 00b0a751
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.CookieSpecs;
...@@ -8,7 +9,6 @@ import org.apache.http.client.config.RequestConfig; ...@@ -8,7 +9,6 @@ import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
...@@ -158,7 +158,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -158,7 +158,7 @@ public class HttpClientDownloader implements Downloader {
} }
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = EntityUtils.toString(httpResponse.getEntity(), charset); String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page(); Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
......
...@@ -16,7 +16,7 @@ import java.util.List; ...@@ -16,7 +16,7 @@ import java.util.List;
public class BaiduBaikePageProcesser implements PageProcessor { public class BaiduBaikePageProcesser implements PageProcessor {
private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888))
.setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true); .setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
@Override @Override
public void process(Page page) { public void process(Page page) {
......
...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page; ...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.AfterExtractor; import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.UrlTemplate;
import us.codecraft.webmagic.model.direct.Param; import us.codecraft.webmagic.model.direct.Param;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -12,10 +11,8 @@ import java.util.List; ...@@ -12,10 +11,8 @@ import java.util.List;
/** /**
* @since 0.4.0 * @since 0.4.0
* NO implement yet!!!!!!!!!!!!
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
@UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8")
public class BaiduBaike implements AfterExtractor{ public class BaiduBaike implements AfterExtractor{
private String word; private String word;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment