Commit 21cae2ff authored by yihua.huang

update package

parent cfb89904
...@@ -9,7 +9,7 @@ import java.util.List; ...@@ -9,7 +9,7 @@ import java.util.List;
/** /**
* <pre> * <pre>
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。 * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
* *
* 主要方法: * 主要方法:
* {@link #getUrl()} 获取页面的Url * {@link #getUrl()} 获取页面的Url
...@@ -19,6 +19,7 @@ import java.util.List; ...@@ -19,6 +19,7 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
* *
* </pre> * </pre>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
*/ */
public class Page { public class Page {
...@@ -36,9 +37,16 @@ public class Page { ...@@ -36,9 +37,16 @@ public class Page {
public Page() { public Page() {
} }
/**
 * Forwards the given skip flag to this page's {@code resultItems}.
 *
 * NOTE(review): the effect of the flag is defined by {@code resultItems.setSkip},
 * which is not visible here. Presumably it marks this page's results to be
 * skipped by later processing; confirm against that class.
 *
 * @param skip value passed through unchanged to {@code resultItems.setSkip}
 * @return this page, so that calls can be chained
 */
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
}
/** /**
* 保存抽取的结果 * 保存抽取的结果
* @param key 结果的key *
* @param key 结果的key
* @param field 结果的value * @param field 结果的value
*/ */
public void putField(String key, Object field) { public void putField(String key, Object field) {
...@@ -47,6 +55,7 @@ public class Page { ...@@ -47,6 +55,7 @@ public class Page {
/** /**
* 获取页面的html内容 * 获取页面的html内容
*
* @return html 页面的html内容 * @return html 页面的html内容
*/ */
public Selectable getHtml() { public Selectable getHtml() {
...@@ -63,6 +72,7 @@ public class Page { ...@@ -63,6 +72,7 @@ public class Page {
/** /**
* 添加待抓取的链接 * 添加待抓取的链接
*
* @param requests 待抓取的链接 * @param requests 待抓取的链接
*/ */
public void addTargetRequests(List<String> requests) { public void addTargetRequests(List<String> requests) {
...@@ -79,6 +89,7 @@ public class Page { ...@@ -79,6 +89,7 @@ public class Page {
/** /**
* 添加待抓取的链接 * 添加待抓取的链接
*
* @param requestString 待抓取的链接 * @param requestString 待抓取的链接
*/ */
public void addTargetRequest(String requestString) { public void addTargetRequest(String requestString) {
...@@ -93,6 +104,7 @@ public class Page { ...@@ -93,6 +104,7 @@ public class Page {
/** /**
* 添加待抓取的页面,在需要传递附加信息时使用 * 添加待抓取的页面,在需要传递附加信息时使用
*
* @param request 待抓取的页面 * @param request 待抓取的页面
*/ */
public void addTargetRequest(Request request) { public void addTargetRequest(Request request) {
...@@ -103,6 +115,7 @@ public class Page { ...@@ -103,6 +115,7 @@ public class Page {
/** /**
* 获取页面的Url * 获取页面的Url
*
* @return url 当前页面的url,可用于抽取 * @return url 当前页面的url,可用于抽取
*/ */
public Selectable getUrl() { public Selectable getUrl() {
...@@ -111,6 +124,7 @@ public class Page { ...@@ -111,6 +124,7 @@ public class Page {
/** /**
* 设置url * 设置url
*
* @param url * @param url
*/ */
public void setUrl(Selectable url) { public void setUrl(Selectable url) {
...@@ -119,6 +133,7 @@ public class Page { ...@@ -119,6 +133,7 @@ public class Page {
/** /**
* 获取抓取请求 * 获取抓取请求
*
* @return request 抓取请求 * @return request 抓取请求
*/ */
public Request getRequest() { public Request getRequest() {
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
import java.lang.annotation.Retention; import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
import java.lang.annotation.Retention; import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector; import us.codecraft.webmagic.selector.Selector;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector; import us.codecraft.webmagic.selector.Selector;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
import java.lang.annotation.Retention; import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
import java.lang.annotation.Retention; import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
...@@ -10,7 +10,7 @@ import java.util.List; ...@@ -10,7 +10,7 @@ import java.util.List;
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 下午10:18 <br> * Time: 下午10:18 <br>
*/ */
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href") @TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class OschinaBlog implements AfterExtractor { public class OschinaBlog implements AfterExtractor {
@ExtractBy("//title") @ExtractBy("//title")
...@@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor { ...@@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor {
System.out.println("title:\t"+title); System.out.println("title:\t"+title);
System.out.println("content:\t"+content); System.out.println("content:\t"+content);
System.out.println("tags:\t" + tags); System.out.println("tags:\t" + tags);
page.setSkip(true);
} }
public static void main(String[] args) { public static void main(String[] args) {
......
package us.codecraft.webmagic.oo; package us.codecraft.webmagic.model;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
......
package us.codecraft.webmagic.oo.samples; package us.codecraft.webmagic.model.samples;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.ExtractBy; import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.oo.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.oo.TargetUrl; import us.codecraft.webmagic.model.TargetUrl;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.*; import us.codecraft.webmagic.model.*;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.ExtractBy; import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.oo.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.oo.TargetUrl; import us.codecraft.webmagic.model.TargetUrl;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment