Commit 0ae7adf3 authored by yihua.huang's avatar yihua.huang

add cookie support & add docs

parent 8cef8774
......@@ -10,6 +10,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* Page保存了抓取的结果,并可定义下一次抓取的链接内容。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午11:22
......@@ -65,7 +66,7 @@ public class Page {
}
}
public void addTargetRequests(String requestString) {
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
......@@ -75,6 +76,12 @@ public class Page {
}
}
/**
 * Adds a fully built request to this page's target requests.
 * <p>
 * A {@code null} request is ignored rather than stored, mirroring the way the
 * String overload returns early on blank input.
 *
 * @param request the request to add; {@code null} is silently skipped
 */
public void addTargetRequest(Request request) {
    if (request == null) {
        return;
    }
    // The collection is mutated while holding its own monitor.
    synchronized (targetRequests) {
        targetRequests.add(request);
    }
}
/**
 * Returns the url held by this page.
 *
 * @return the {@code url} field, exposed as a Selectable
 */
public Selectable getUrl() {
return url;
}
......
package us.codecraft.webmagic;
/**
* Request对象是
* Request对象封装了待抓取的url信息。<br/>
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
* <pre>
* Example:
* 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
* 在上一个页面:
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request);
* }
* 在下一个页面:
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午11:37
......@@ -12,15 +27,28 @@ public class Request {
private Object[] extra;
/**
 * Builds a request object.
 * @param url required; the url to be fetched
 * @param extra additional values that can hold whatever context is needed later
 */
public Request(String url, Object... extra) {
this.url = url;
// The varargs array is stored as passed in; no copy is made.
this.extra = extra;
}
/**
 * Returns the objects stored in this request.
 * @return object[] the stored extra array (the field itself, not a copy)
 */
public Object[] getExtra() {
return extra;
}
/**
 * Returns the url to be fetched.
 * @return url the url to be fetched
 */
public String getUrl() {
return url;
}
......
package us.codecraft.webmagic;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
/**
* Site定义一个待抓取的站点的各种信息。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午12:13
......@@ -14,11 +14,11 @@ public class Site {
private String userAgent;
private String cookie;
private Map<String,String> cookies = new LinkedHashMap<String, String>();
private String encoding;
private String startUrl;
// Initialised eagerly, like cookies, so that setStartUrl(), equals() and callers
// iterating getStartUrls() never dereference null when no start url has been set.
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
......@@ -34,8 +34,8 @@ public class Site {
return new Site();
}
public Site setCookie(String cookie) {
this.cookie = cookie;
public Site setCookie(String name,String value) {
cookies.put(name,value);
return this;
}
......@@ -44,8 +44,8 @@ public class Site {
return this;
}
public String getCookie() {
return cookie;
public Map<String,String> getCookies() {
return cookies;
}
public String getUserAgent() {
......@@ -79,12 +79,12 @@ public class Site {
return this;
}
public String getStartUrl() {
return startUrl;
public List<String> getStartUrls() {
return startUrls;
}
public Site setStartUrl(String startUrl) {
this.startUrl = startUrl;
this.startUrls.add(startUrl);
return this;
}
......@@ -106,8 +106,8 @@ public class Site {
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
......@@ -117,8 +117,8 @@ public class Site {
@Override
public int hashCode() {
int result = domain.hashCode();
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
return result;
......
......@@ -36,7 +36,9 @@ public class Spider implements Runnable {
public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite());
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
schedular.push(new Request(startUrl), pageProcessor.getSite());
}
return this;
}
......
......@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
* Downloader是webmagic抓取页面的核心接口。
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午12:14
......@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
public interface Downloader {
/**
 * Downloads a page and stores the information in a Page object.
 *
 * @param request the request to download
 * @param site the site supplied together with the request
 * @return the resulting Page
 */
public Page download(Request request,Site site);
public Page download(Request request, Site site);
}
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.*;
import us.codecraft.webmagic.Site;
import java.util.Map;
/**
* Author: code4crafter@gmail.com
* Date: 13-4-21
......@@ -50,15 +55,23 @@ public class HttpClientPool {
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
connectionManager.setMaxTotal(100);
connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100);
HttpClient httpClient = new DefaultHttpClient(connectionManager, params);
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
generateCookie(httpClient, site);
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient;
}
public void pushBack(HttpClient httpClient) {
/**
 * Installs a fresh cookie store on the client, filled with one cookie per
 * entry of {@code site.getCookies()}. Each cookie's domain is set to
 * {@code site.getDomain()}.
 *
 * @param httpClient the client that receives the new cookie store
 * @param site       the site whose cookies and domain are read
 */
private void generateCookie(DefaultHttpClient httpClient, Site site) {
    final CookieStore store = new BasicCookieStore();
    final Map<String, String> configured = site.getCookies();
    for (Map.Entry<String, String> entry : configured.entrySet()) {
        final String name = entry.getKey();
        final String value = entry.getValue();
        final BasicClientCookie clientCookie = new BasicClientCookie(name, value);
        clientCookie.setDomain(site.getDomain());
        store.addCookie(clientCookie);
    }
    // The store replaces whatever cookie store the client had before.
    httpClient.setCookieStore(store);
}
}
package us.codecraft.webmagic.downloader;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
* Author: code4crafter@gmail.com
* Date: 13-6-18
* Time: 上午8:22
*/
public class HttpClientDownloaderTest {

    /**
     * Downloads http://www.diandian.com with a site that carries the cookie "t"
     * and asserts that the returned html contains "flashsword30".
     * NOTE(review): this presumably needs network access and a cookie value the
     * remote site still accepts - confirm before relying on it in automated builds.
     */
    @Test
    public void testCookie() {
        Site cookieSite = Site.me()
                .setDomain("www.diandian.com")
                .setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
        HttpClientDownloader downloader = new HttpClientDownloader();
        Request homePage = new Request("http://www.diandian.com");
        Page fetched = downloader.download(homePage, cookieSite);
        String html = fetched.getHtml().toString();
        Assert.assertTrue(html.contains("flashsword30"));
    }
}
......@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
page.addTargetRequests("http://kaichiba.com/shop/"+i);
page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title"));
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment