Commit 64293cba authored by yihua.huang

samples

parent bc1d14fe
...@@ -4,4 +4,34 @@ package us.codecraft.webmagic.samples; ...@@ -4,4 +4,34 @@ package us.codecraft.webmagic.samples;
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class GithubRepo { public class GithubRepo {
}
/** Name of the repository; {@code null} until {@link #setName(String)} is called. */
private String name;
/** Author (owner) of the repository; {@code null} until {@link #setAuthor(String)} is called. */
private String author;
/** Readme text of the repository; {@code null} until {@link #setReadme(String)} is called. */
private String readme;
/**
 * Returns the repository name.
 *
 * @return the current value of {@code name}, possibly {@code null}
 */
public String getName() {
return name;
}
/**
 * Sets the repository name.
 *
 * @param name the value to store; stored as-is, no validation is performed
 */
public void setName(String name) {
this.name = name;
}
/**
 * Returns the repository author.
 *
 * @return the current value of {@code author}, possibly {@code null}
 */
public String getAuthor() {
return author;
}
/**
 * Sets the repository author.
 *
 * @param author the value to store; stored as-is, no validation is performed
 */
public void setAuthor(String author) {
this.author = author;
}
/**
 * Returns the repository readme text.
 *
 * @return the current value of {@code readme}, possibly {@code null}
 */
public String getReadme() {
return readme;
}
/**
 * Sets the repository readme text.
 *
 * @param readme the value to store; stored as-is, no validation is performed
 */
public void setReadme(String readme) {
this.readme = readme;
}
}
\ No newline at end of file
...@@ -7,7 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -7,7 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.3.2 * @since 0.5.1
*/ */
public class GithubRepoPageProcessor implements PageProcessor { public class GithubRepoPageProcessor implements PageProcessor {
...@@ -17,13 +17,16 @@ public class GithubRepoPageProcessor implements PageProcessor { ...@@ -17,13 +17,16 @@ public class GithubRepoPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); GithubRepo githubRepo = new GithubRepo();
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
if (page.getResultItems().get("name")==null){ githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
if (githubRepo.getName() == null) {
//skip this page //skip this page
page.setSkip(true); page.setSkip(true);
} else {
page.putField("repo", githubRepo);
} }
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
} }
@Override @Override
......
package us.codecraft.webmagic.samples.pipeline;
/**
 * Empty placeholder class in the samples pipeline package; it declares no fields or methods yet.
 *
 * @author code4crafter@gmail.com
 */
public class ReplacePipeline {
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment