Commit 21cae2ff authored by yihua.huang

update package

parent cfb89904
......@@ -9,7 +9,7 @@ import java.util.List;
/**
* <pre>
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
* Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
* 主要方法:
* {@link #getUrl()} 获取页面的Url
......@@ -19,6 +19,7 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
*
* </pre>
*
* @author code4crafter@gmail.com <br>
*/
public class Page {
......@@ -36,8 +37,15 @@ public class Page {
/**
 * No-argument constructor; its body performs no work.
 */
public Page() {
}
/**
 * Forwards the given skip flag to this page's {@code resultItems}.
 * NOTE(review): presumably a {@code true} flag tells downstream processing
 * to ignore this page's results - confirm against ResultItems.
 *
 * @param skip the flag value handed to {@code resultItems.setSkip}
 * @return this page, so that further calls can be chained
 */
public Page setSkip(boolean skip) {
    this.resultItems.setSkip(skip);
    return this;
}
/**
* 保存抽取的结果
*
* @param key 结果的key
* @param field 结果的value
*/
......@@ -47,6 +55,7 @@ public class Page {
/**
* 获取页面的html内容
*
* @return html 页面的html内容
*/
public Selectable getHtml() {
......@@ -63,6 +72,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requests 待抓取的链接
*/
public void addTargetRequests(List<String> requests) {
......@@ -79,6 +89,7 @@ public class Page {
/**
* 添加待抓取的链接
*
* @param requestString 待抓取的链接
*/
public void addTargetRequest(String requestString) {
......@@ -93,6 +104,7 @@ public class Page {
/**
* 添加待抓取的页面,在需要传递附加信息时使用
*
* @param request 待抓取的页面
*/
public void addTargetRequest(Request request) {
......@@ -103,6 +115,7 @@ public class Page {
/**
* 获取页面的Url
*
* @return url 当前页面的url,可用于抽取
*/
public Selectable getUrl() {
......@@ -111,6 +124,7 @@ public class Page {
/**
* 设置url
*
* @param url the url to set for this page
*/
public void setUrl(Selectable url) {
......@@ -119,6 +133,7 @@ public class Page {
/**
* 获取抓取请求
*
* @return request 抓取请求
*/
public Request getRequest() {
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.selector.Selector;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
......@@ -10,7 +10,7 @@ import java.util.List;
* @date: 13-8-1 <br>
* Time: 下午10:18 <br>
*/
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href")
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class OschinaBlog implements AfterExtractor {
@ExtractBy("//title")
......@@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor {
System.out.println("title:\t"+title);
System.out.println("content:\t"+content);
System.out.println("tags:\t" + tags);
page.setSkip(true);
}
public static void main(String[] args) {
......
package us.codecraft.webmagic.oo;
package us.codecraft.webmagic.model;
import org.junit.Ignore;
import org.junit.Test;
......
package us.codecraft.webmagic.oo.samples;
package us.codecraft.webmagic.model.samples;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples;
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.ExtractBy;
import us.codecraft.webmagic.oo.OOSpider;
import us.codecraft.webmagic.oo.TargetUrl;
import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples;
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.*;
import us.codecraft.webmagic.model.*;
/**
* @author code4crafter@gmail.com <br>
......
package us.codecraft.webmagic.oo.samples;
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.ExtractBy;
import us.codecraft.webmagic.oo.OOSpider;
import us.codecraft.webmagic.oo.TargetUrl;
import us.codecraft.webmagic.model.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment