Commit cc9d319f authored by Yihua Huang

Merge pull request #94 from sebastian1118/master

update:PatternHandler
parents da2f023c 99e12aaf
...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems; ...@@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler; import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.handler.SubPageProcessor;
import us.codecraft.webmagic.pipeline.PatternPipeline; import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor; import us.codecraft.webmagic.processor.PatternPageProcessor;
...@@ -32,21 +33,23 @@ public class PatternProcessorDemo { ...@@ -32,21 +33,23 @@ public class PatternProcessorDemo {
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
@Override @Override
public void onExtract(Page page) { public SubPageProcessor.MatchOtherProcessor process(Page page) {
log.info("Extracting from " + page.getUrl()); log.info("Extracting from " + page.getUrl());
page.putField("test", "hello world:)"); page.putField("test", "hello world:)");
return MatchOtherProcessor.YES;
} }
@Override @Override
public void onHandle(ResultItems result, Task task) { public void handle(ResultItems result, Task task) {
log.info("Handling " + result.getRequest().getUrl()); log.info("Handling " + result.getRequest().getUrl());
log.info("Retrieved test=" + result.get("test")); log.info("Retrieved test=" + result.get("test"));
} }
}; };
handler.register(processor, pipeline); processor.addHandler(handler);
pipeline.addHandler(handler);
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
} }
......
...@@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler; ...@@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
import java.util.UUID; import java.util.UUID;
...@@ -17,7 +15,7 @@ import java.util.UUID; ...@@ -17,7 +15,7 @@ import java.util.UUID;
* A PatternHandler is in charge of both page extraction and data processing by implementing * A PatternHandler is in charge of both page extraction and data processing by implementing
* its two abstract methods. * its two abstract methods.
*/ */
public abstract class PatternHandler { public abstract class PatternHandler implements SubPageProcessor {
/** /**
* identity of the handler. * identity of the handler.
...@@ -47,46 +45,25 @@ public abstract class PatternHandler { ...@@ -47,46 +45,25 @@ public abstract class PatternHandler {
return url.matches(pattern); return url.matches(pattern);
} }
/** public boolean processPage(Page page) {
* registers to both the page processor and the pipeline so the handler could take charge of
* both end of procedure.
*
* @param processor
* the processor to handle
* @param pipeline
* the pipeline to handle
*/
public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
processor.addHandler(this);
pipeline.addHandler(this);
}
public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
processor.removeHandler(this);
pipeline.removeHandler(this);
}
public boolean process(Page page) {
if(match(page.getUrl().toString())) { if(match(page.getUrl().toString())) {
page.putField(id, true); page.putField(id, true);
onExtract(page); process(page);
return true; return true;
} else { } else {
return false; return false;
} }
} }
public boolean process(ResultItems resultItems, Task task) { public boolean processResult(ResultItems resultItems, Task task) {
if(resultItems.isSkip()) { if(resultItems.isSkip()) {
return false; return false;
} }
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
onHandle(resultItems, task); handle(resultItems, task);
return true; return true;
} else { } else {
return false; return false;
...@@ -94,20 +71,20 @@ public abstract class PatternHandler { ...@@ -94,20 +71,20 @@ public abstract class PatternHandler {
} }
/** /**
* implements this method to extract from page. * override this method to handle the extraction result. this method MUST be used
* * with PatternPipeline
* @param page
* the page to extract
*/
public abstract void onExtract(Page page);
/**
* implements this method to handle the extraction result.
* *
* @param result * @param result
* extraction result * extraction result
* @param task * @param task
*/ */
public abstract void onHandle(ResultItems result, Task task); public void handle(ResultItems result, Task task) {
}
@Override
public boolean match(Page page) {
return match(page.getUrl().toString());
}
} }
...@@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page; ...@@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page;
*/ */
public interface SubPageProcessor { public interface SubPageProcessor {
/** /**
* Check whether the SubPageProcessor can process the page.<br></br> * Check whether the SubPageProcessor can process the page.<br></br>
* Please DO NOT change page status in this method. * Please DO NOT change page status in this method.
* *
* @param page * @param page
* @return *
*/ * @return
public boolean match(Page page); */
public boolean match(Page page);
/** /**
* * process the page, extract urls to fetch, extract the data and store
* process the page, extract urls to fetch, extract the data and store *
* * @param page
* @param page *
* @return whether continue to match * @return whether continue to match
*/ */
public MatchOtherProcessor process(Page page); public MatchOtherProcessor process(Page page);
public enum MatchOtherProcessor { public enum MatchOtherProcessor {
YES, NO; YES, NO
} }
} }
...@@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline { ...@@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline {
* *
* @param handler the pattern handler * @param handler the pattern handler
* *
* @see PatternHandler#register
*/ */
public void addHandler(PatternHandler handler) { public void addHandler(PatternHandler handler) {
...@@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline { ...@@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline {
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
for(PatternHandler handler : handlers) { for(PatternHandler handler : handlers) {
handler.process(resultItems, task); handler.processResult(resultItems, task);
} }
} }
} }
...@@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor { ...@@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor {
page.addTargetRequests(requests); page.addTargetRequests(requests);
for(PatternHandler handler : handlers) { for(PatternHandler handler : handlers) {
if(handler.match(page.getUrl().toString())) { if(handler.match(page.getUrl().toString())) {
handler.process(page); handler.processPage(page);
} }
} }
} }
/** /**
* A handler works only if it is added to BOTH the page processor and the pipeline.
* Uses PatternHandler's register instead.
* *
* @param handler the pattern handler * @param handler the pattern handler
* *
* @see PatternHandler#register *
*/ */
public void addHandler(PatternHandler handler) { public void addHandler(PatternHandler handler) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment