Commit 3b00190f authored by yihua.huang

api without implementation for #28: add specific url crawl

parent 719100d6
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.UrlTemplate;
import us.codecraft.webmagic.model.direct.Param;
import java.util.ArrayList;
import java.util.List;
/**
 * Example page model that looks up a single word on Baidu Baike through a URL template.
 * <br>
 * NOTE(review): the {@code ${word}} placeholder in the template is presumably filled from the
 * {@code "word"} entry of each {@link Param} passed to {@code OOSpider.direct} -- confirm once
 * the direct-crawl API has an implementation.
 *
 * @author code4crafter@gmail.com
 */
@UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8")
public class BaiduBaike {

    /** The looked-up word; carries no extraction annotation. */
    private String word;

    /** All text of the {@code para} divs found under the {@code lemmaContent-0} div. */
    @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()")
    private String description;

    /**
     * Builds one parameter set ({@code word} = 红烧肉) and starts a direct crawl for this
     * model using 10 threads.
     *
     * @param args command-line arguments (not read)
     */
    public static void main(String[] args) {
        Param query = new Param().put("word","红烧肉");

        List<Param> queries = new ArrayList<Param>();
        queries.add(query);

        OOSpider crawler = OOSpider.direct(queries, BaiduBaike.class);
        crawler.thread(10).run();
    }
}
...@@ -2,8 +2,11 @@ package us.codecraft.webmagic.model; ...@@ -2,8 +2,11 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.direct.Param;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Collection;
/** /**
* The spider for page model extractor.<br> * The spider for page model extractor.<br>
* In webmagic, we call a POJO containing extract result as "page model". <br> * In webmagic, we call a POJO containing extract result as "page model". <br>
...@@ -22,13 +25,14 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -22,13 +25,14 @@ import us.codecraft.webmagic.processor.PageProcessor;
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) * {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
* private List<String> tags; * private List<String> tags;
* } * }
</pre> * </pre>
* And start the spider by: * And start the spider by:
* <pre> * <pre>
* OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") * OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
* ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); * ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
* } * }
</pre> * </pre>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
...@@ -49,6 +53,7 @@ public class OOSpider extends Spider { ...@@ -49,6 +53,7 @@ public class OOSpider extends Spider {
/** /**
* create a spider * create a spider
*
* @param site * @param site
* @param pageModelPipeline * @param pageModelPipeline
* @param pageModels * @param pageModels
...@@ -57,7 +62,7 @@ public class OOSpider extends Spider { ...@@ -57,7 +62,7 @@ public class OOSpider extends Spider {
this(ModelPageProcessor.create(site, pageModels)); this(ModelPageProcessor.create(site, pageModels));
this.modelPipeline = new ModelPipeline(); this.modelPipeline = new ModelPipeline();
super.addPipeline(modelPipeline); super.addPipeline(modelPipeline);
if (pageModelPipeline!=null){ if (pageModelPipeline != null) {
for (Class pageModel : pageModels) { for (Class pageModel : pageModels) {
this.modelPipeline.put(pageModel, pageModelPipeline); this.modelPipeline.put(pageModel, pageModelPipeline);
} }
...@@ -72,6 +77,22 @@ public class OOSpider extends Spider { ...@@ -72,6 +77,22 @@ public class OOSpider extends Spider {
return new OOSpider(site, pageModelPipeline, pageModels); return new OOSpider(site, pageModelPipeline, pageModels);
} }
/**
 * Creates a spider for the given site, pipeline and page models.
 * <br>
 * The body only forwards its three arguments to the {@code OOSpider} constructor.
 *
 * @param site              site configuration handed to the constructor
 * @param pageModelPipeline pipeline handed to the constructor
 * @param pageModels        page model classes handed to the constructor
 * @return the newly constructed spider
 */
public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
    final OOSpider spider = new OOSpider(site, pageModelPipeline, pageModels);
    return spider;
}
/**
 * Creates a spider for the given pipeline and page models, passing {@code null} as the site.
 *
 * @param pageModelPipeline pipeline handed to the constructor
 * @param pageModels        page model classes handed to the constructor
 * @return the newly constructed spider
 */
public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) {
    final OOSpider spider = new OOSpider(null, pageModelPipeline, pageModels);
    return spider;
}
/**
 * Creates a spider for the given page models, passing {@code null} for both the site and
 * the pipeline.
 *
 * @param pageModels page model classes handed to the constructor
 * @return the newly constructed spider
 */
public static OOSpider direct(Class... pageModels) {
    final OOSpider spider = new OOSpider(null, null, pageModels);
    return spider;
}
/**
 * Creates a spider for the given page models, passing {@code null} for both the site and
 * the pipeline.
 * <br>
 * NOTE(review): {@code params} is accepted but not read by this method body, so the returned
 * spider is built exactly as in {@code direct(Class...)}. Presumably each entry is meant to
 * fill a URL template on the page models -- TODO wire it in and confirm.
 *
 * @param params     parameter sets; currently unused here
 * @param pageModels page model classes handed to the constructor
 * @return the newly constructed spider
 */
public static OOSpider direct(Collection<Param> params, Class... pageModels) {
    final OOSpider spider = new OOSpider(null, null, pageModels);
    return spider;
}
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
for (Class pageModel : pageModels) { for (Class pageModel : pageModels) {
modelPageProcessor.addPageModel(pageModel); modelPageProcessor.addPageModel(pageModel);
......
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Declares a url template for a class. <br>
* The annotation is retained at runtime and can only be placed on types. <br>
* NOTE(review): the template presumably contains placeholders such as ${word} that are
* substituted to build the concrete urls to crawl -- confirm against the implementation. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.3.3
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface UrlTemplate {
/**
* The url template for the class. <br>
* Required: this element has no default value. <br>
*
* @return the url template for the class
*/
String value();
/**
* The encoding name associated with the template. Defaults to "utf8". <br>
* NOTE(review): presumably the charset used when placeholder values are encoded
* into the url -- confirm against the implementation. <br>
*
* @return the encoding name
*/
String encoding() default "utf8";
}
package us.codecraft.webmagic.model.direct;
import java.util.LinkedHashMap;
/**
 * An insertion-ordered map of named values with a chainable {@code put}.
 * <br>
 * Note that {@link #put(String, Object)} returns this map rather than the previous value
 * for the key, so entries can be added fluently:
 * {@code new Param().put("a", 1).put("b", 2)}.
 *
 * @author code4crafter@gmail.com
 */
public class Param extends LinkedHashMap<String, Object> {

    /**
     * Stores {@code value} under {@code key} and returns this instance for chaining.
     *
     * @param key   entry name
     * @param value entry value
     * @return this map, never the previously mapped value
     */
    @Override
    public Param put(String key, Object value) {
        final Param self = this;
        // Delegate the actual storage to LinkedHashMap; its return value is deliberately dropped.
        super.put(key, value);
        return self;
    }
}
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
...@@ -18,14 +18,26 @@ public class Kr36NewsModel { ...@@ -18,14 +18,26 @@ public class Kr36NewsModel {
@ExtractBy("//h1[@class='entry-title sep10']") @ExtractBy("//h1[@class='entry-title sep10']")
private String title; private String title;
@ExtractBy("//div[@class='mainContent sep-10']") @ExtractBy("//div[@class='mainContent sep-10']/tidyText()")
private String content; private String content;
@ExtractByUrl @ExtractByUrl
private String url; private String url;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/"), new ConsolePageModelPipeline(), OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0),new JsonFilePageModelPipeline(),
Kr36NewsModel.class).run(); Kr36NewsModel.class).thread(20).run();
}
/**
 * @return the value of the {@code title} field
 */
public String getTitle() {
    return this.title;
}
/**
 * @return the value of the {@code content} field
 */
public String getContent() {
    return this.content;
}
public String getUrl() {
return url;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment