Commit 95ab4ede authored by yihua.huang

some bugfix

parent 250cc5e6
...@@ -96,11 +96,6 @@ public class Site { ...@@ -96,11 +96,6 @@ public class Site {
* @return get domain * @return get domain
*/ */
public String getDomain() { public String getDomain() {
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
}
}
return domain; return domain;
} }
...@@ -176,6 +171,11 @@ public class Site { ...@@ -176,6 +171,11 @@ public class Site {
*/ */
public Site addStartUrl(String startUrl) { public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl); this.startUrls.add(startUrl);
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
}
}
return this; return this;
} }
......
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.Formatter;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.List;
/**
 * Example page model describing a single blog post page on my.oschina.net.
 *
 * <p>Pages whose URL matches the {@link TargetUrl} pattern below are mapped onto the
 * annotated fields of this class. Running {@link #main(String[])} starts a crawl from
 * the blog index and hands the model to a {@link JsonFilePageModelPipeline} created
 * with the path {@code /data/webmagic/}.
 *
 * @author code4crafter@gmail.com <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements AfterExtractor {

    /** Text content of the page's {@code title} element. */
    @ExtractBy("//title/text()")
    private String title;

    /** Content selected by the CSS selector {@code div.BlogContent}. */
    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    /** Text of every link inside the {@code BlogTags} div (multi-valued extraction). */
    @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
    private List<String> tags;

    /**
     * Publication timestamp text matched by the regex inside the {@code BlogStat} div.
     *
     * <p>The formatter pattern uses {@code yyyy} (calendar year). The previous
     * {@code YYYY} denotes the week-based year in {@code SimpleDateFormat}, which yields
     * wrong dates around year boundaries and when parsing.
     * NOTE(review): how {@code @Formatter} is applied to a {@code String} field is not
     * visible here - confirm against the annotation's implementation.
     */
    @Formatter("yyyy-MM-dd HH:mm")
    @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')")
    private String date;

    /**
     * Runs the example crawl, starting at the blog index page.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        OOSpider.create(
                Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),
                new JsonFilePageModelPipeline("/data/webmagic/"),
                OschinaBlog.class).run();
    }

    /**
     * @return the extracted page title
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the extracted blog content
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the extracted tag names
     */
    public List<String> getTags() {
        return tags;
    }

    /**
     * Replaces the previously commented-out getter, which returned {@code Date} and no
     * longer matched the field's type.
     *
     * @return the extracted date text
     */
    public String getDate() {
        return date;
    }

    /**
     * Callback from {@link AfterExtractor}; prints the extracted date and title to
     * standard output.
     *
     * @param page the page this model was extracted from (unused)
     */
    @Override
    public void afterProcess(Page page) {
        System.out.println(date);
        System.out.println(title);
    }
}
...@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy; ...@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.Date;
import java.util.List; import java.util.List;
/** /**
...@@ -24,9 +23,6 @@ public class OschinaBlog{ ...@@ -24,9 +23,6 @@ public class OschinaBlog{
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags; private List<String> tags;
@ExtractBy("//div[class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')")
private Date date;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment