Commit 0ae7adf3 authored by yihua.huang's avatar yihua.huang

add cookie support & add docs

parent 8cef8774
...@@ -10,6 +10,7 @@ import java.util.Map; ...@@ -10,6 +10,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
/** /**
* Page保存了抓取的结果,并可定义下一次抓取的链接内容。
* Author: code4crafter@gmail.com * Author: code4crafter@gmail.com
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午11:22 * Time: 上午11:22
...@@ -65,7 +66,7 @@ public class Page { ...@@ -65,7 +66,7 @@ public class Page {
} }
} }
public void addTargetRequests(String requestString) { public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return; return;
} }
...@@ -75,6 +76,12 @@ public class Page { ...@@ -75,6 +76,12 @@ public class Page {
} }
} }
    /**
     * Add a {@link Request} to the list of targets to be crawled next.
     * Unlike the String overload, this carries the full Request (including extra context).
     *
     * @param request the request to enqueue; assumed non-null — TODO confirm callers never pass null
     */
    public void addTargetRequest(Request request) {
        // targetRequests may be read/written from multiple worker threads,
        // so mutation is guarded by locking the list itself.
        synchronized (targetRequests) {
            targetRequests.add(request);
        }
    }
public Selectable getUrl() { public Selectable getUrl() {
return url; return url;
} }
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
/** /**
* Request对象是 * Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
* <pre>
* Example:
* 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
* 在上一个页面:
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request)
* }
* 在下一个页面:
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* Author: code4crafter@gmail.com * Author: code4crafter@gmail.com
* Date: 13-4-21 * Date: 13-4-21
* Time: 上午11:37 * Time: 上午11:37
...@@ -12,15 +27,28 @@ public class Request { ...@@ -12,15 +27,28 @@ public class Request {
private Object[] extra; private Object[] extra;
/**
* 构建一个request对象
* @param url 必须参数,待抓取的url
* @param extra 额外参数,可以保存一些需要的上下文信息
*/
public Request(String url, Object... extra) { public Request(String url, Object... extra) {
this.url = url; this.url = url;
this.extra = extra; this.extra = extra;
} }
/**
* 获取预存的对象
* @return object[] 预存的对象数组
*/
public Object[] getExtra() { public Object[] getExtra() {
return extra; return extra;
} }
/**
* 获取待抓取的url
* @return url 待抓取的url
*/
public String getUrl() { public String getUrl() {
return url; return url;
} }
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
import java.util.HashSet; import java.util.*;
import java.util.Set;
/** /**
* Site定义一个待抓取的站点的各种信息。
* Author: code4crafter@gmail.com * Author: code4crafter@gmail.com
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:13 * Time: 下午12:13
...@@ -14,11 +14,11 @@ public class Site { ...@@ -14,11 +14,11 @@ public class Site {
private String userAgent; private String userAgent;
private String cookie; private Map<String,String> cookies = new LinkedHashMap<String, String>();
private String encoding; private String encoding;
private String startUrl; private List<String> startUrls;
private int sleepTime = 3000; private int sleepTime = 3000;
...@@ -34,8 +34,8 @@ public class Site { ...@@ -34,8 +34,8 @@ public class Site {
return new Site(); return new Site();
} }
public Site setCookie(String cookie) { public Site setCookie(String name,String value) {
this.cookie = cookie; cookies.put(name,value);
return this; return this;
} }
...@@ -44,8 +44,8 @@ public class Site { ...@@ -44,8 +44,8 @@ public class Site {
return this; return this;
} }
public String getCookie() { public Map<String,String> getCookies() {
return cookie; return cookies;
} }
public String getUserAgent() { public String getUserAgent() {
...@@ -79,12 +79,12 @@ public class Site { ...@@ -79,12 +79,12 @@ public class Site {
return this; return this;
} }
public String getStartUrl() { public List<String> getStartUrls() {
return startUrl; return startUrls;
} }
public Site setStartUrl(String startUrl) { public Site setStartUrl(String startUrl) {
this.startUrl = startUrl; this.startUrls.add(startUrl);
return this; return this;
} }
...@@ -106,8 +106,8 @@ public class Site { ...@@ -106,8 +106,8 @@ public class Site {
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false; return false;
if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
if (!domain.equals(site.domain)) return false; if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
...@@ -117,8 +117,8 @@ public class Site { ...@@ -117,8 +117,8 @@ public class Site {
@Override @Override
public int hashCode() { public int hashCode() {
int result = domain.hashCode(); int result = domain.hashCode();
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
result = 31 * result + (encoding != null ? encoding.hashCode() : 0); result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
return result; return result;
......
...@@ -36,7 +36,9 @@ public class Spider implements Runnable { ...@@ -36,7 +36,9 @@ public class Spider implements Runnable {
public Spider processor(PageProcessor pageProcessor) { public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite()); for (String startUrl : pageProcessor.getSite().getStartUrls()) {
schedular.push(new Request(startUrl), pageProcessor.getSite());
}
return this; return this;
} }
......
...@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request; ...@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
/** /**
* Downloader是webmagic抓取页面的核心接口。 * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
* Author: code4crafter@gmail.com * Author: code4crafter@gmail.com
* Date: 13-4-21 * Date: 13-4-21
* Time: 下午12:14 * Time: 下午12:14
...@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site; ...@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
public interface Downloader { public interface Downloader {
/** /**
* 下载页面,并保存信息到Page对象中。
* *
* @param request * @param request
* @param site * @param site
* @return * @return
*/ */
public Page download(Request request,Site site); public Page download(Request request, Site site);
} }
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion; import org.apache.http.HttpVersion;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient; import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy; import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.*; import org.apache.http.params.*;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import java.util.Map;
/** /**
* Author: code4crafter@gmail.com * Author: code4crafter@gmail.com
* Date: 13-4-21 * Date: 13-4-21
...@@ -50,15 +55,23 @@ public class HttpClientPool { ...@@ -50,15 +55,23 @@ public class HttpClientPool {
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
connectionManager.setMaxTotal(100); connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100); connectionManager.setDefaultMaxPerRoute(100);
HttpClient httpClient = new DefaultHttpClient(connectionManager, params); DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
generateCookie(httpClient, site);
httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient; return httpClient;
} }
public void pushBack(HttpClient httpClient) { private void generateCookie(DefaultHttpClient httpClient, Site site) {
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
httpClient.setCookieStore(cookieStore);
} }
} }
package us.codecraft.webmagic.downloader;

import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;

/**
 * Verifies that cookies configured on a {@link Site} are sent by the downloader.<br/>
 * NOTE(review): this test hits a live site with a session cookie, so it is
 * environment-dependent and may fail when the session expires.
 * Author: code4crafer@gmail.com
 * Date: 13-6-18
 * Time: 8:22 AM
 */
public class HttpClientDownloaderTest {

    @Test
    public void testCookie() {
        // Site carrying a login cookie scoped to the target domain.
        Site siteWithCookie = Site.me()
                .setDomain("www.diandian.com")
                .setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
        HttpClientDownloader downloader = new HttpClientDownloader();
        Request homepageRequest = new Request("http://www.diandian.com");
        Page page = downloader.download(homepageRequest, siteWithCookie);
        // The username appears on the page only when the cookie was attached,
        // i.e. only when the request was made as a logged-in user.
        Assert.assertTrue(page.getHtml().toString().contains("flashsword30"));
    }
}
...@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor { ...@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
page.addTargetRequests("http://kaichiba.com/shop/"+i); page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title")); page.putField("title",page.getHtml().x("//Title"));
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", "")); page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment