Commit f3a29d93 authored by yihua.huang's avatar yihua.huang

fix pagedmodel bug

parent 629f8ac2
......@@ -55,8 +55,10 @@ class PageModelExtractor {
fieldExtractor = fieldExtractorTmp;
}
// ExtractBy2 & ExtractBy3
addAnnotationExtractBy2(clazz, fieldExtractor);
addAnnotationExtractBy3(clazz, fieldExtractor);
if (fieldExtractor!=null){
addAnnotationExtractBy2(fieldExtractor);
addAnnotationExtractBy3(fieldExtractor);
}
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
......@@ -69,8 +71,8 @@ class PageModelExtractor {
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
fieldExtractors.add(fieldExtractor);
}
}
}
......@@ -122,7 +124,7 @@ class PageModelExtractor {
return fieldExtractor;
}
private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
if (extractBy != null) {
String value = extractBy.value();
......@@ -147,7 +149,7 @@ class PageModelExtractor {
}
}
private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
if (extractBy != null) {
String value = extractBy.value();
......
......@@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline {
Object o = objectEntry.getValue();
if (o instanceof PagedModel) {
PagedModel pagedModel = (PagedModel) o;
for (String otherPage : pagedModel.getOtherPages()) {
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
if (aBoolean == null) {
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
if (pagedModel.getOtherPages()!=null){
for (String otherPage : pagedModel.getOtherPages()) {
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
if (aBoolean == null) {
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
}
}
}
//check if all pages are processed
......
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Collection;
import java.util.List;
......@@ -17,14 +15,16 @@ import java.util.List;
* Time: 下午8:17 <br>
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel, AfterExtractor {
public class News163 implements PagedModel {
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
private String pageKey;
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
private String page;
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
private List<String> otherPage;
@ExtractBy("//h1[@id=\"h1title\"]/text()")
......@@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor {
@Override
public PagedModel combine(PagedModel pagedModel) {
News163 news163 = new News163();
news163.title = this.title;
News163 pagedModel1 = (News163) pagedModel;
news163.content = this.content + pagedModel1.content;
return news163;
......@@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor {
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
}
/**
 * Fills {@code otherPage} from the given page after extraction.
 *
 * <p>Selects the {@code href} attributes of all anchors nested under the
 * {@code div} whose class is {@code ep-pages}, applies the paged-article URL
 * regex to that selection, and stores every result in {@code otherPage}.
 * NOTE(review): presumably the regex step yields the captured page number
 * rather than the whole URL — confirm against the Selectable implementation.
 *
 * @param page the page whose HTML is inspected
 */
@Override
public void afterProcess(Page page) {
    otherPage = page.getHtml()
            .xpath("//div[@class=\"ep-pages\"]//a/@href")
            .regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html")
            .all();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment