Commit 474f785d authored by Yihua Huang

Merge pull request #86 from sebastian1118/master

new feature: PatternProcessor
parents 8fe967ba 38a12f86
package us.codecraft.webmagic.example;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
/**
 * Demonstrates how a {@link PatternHandler} is wired into a crawl: one handler is registered
 * on both a {@link PatternPageProcessor} and a {@link PatternPipeline}, and a spider built
 * from the two is then started with {@code runAsync()}.
 *
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 21:23
 */
public class PatternProcessorDemo {

    /** Logger shared by the demo and its anonymous handler; never reassigned, hence final. */
    private static final Logger log = Logger.getLogger(PatternProcessorDemo.class);

    /**
     * Builds the processor, the pipeline and a single handler, registers the handler on both,
     * and launches the spider.
     *
     * @param args command line arguments; not used
     */
    public static void main(String... args) {
        PatternPageProcessor processor = new PatternPageProcessor(
                "http://item.jd.com/981821.html",
                PatternPageProcessor.TARGET_PATTERN_ALL);
        PatternPipeline pipeline = new PatternPipeline();

        // Define a handler which handles only pages under http://item.jd.com/.
        // The handler pattern is a regular expression, so the dots of the host name are
        // escaped; left unescaped they would match any character (e.g. "item-jd.com").
        PatternHandler handler = new PatternHandler("http://item\\.jd\\.com/.*") {

            @Override
            public void onExtract(Page page) {
                log.info("Extracting from " + page.getUrl());
                page.putField("test", "hello world:)");
            }

            @Override
            public void onHandle(ResultItems result, Task task) {
                log.info("Handling " + result.getRequest().getUrl());
                log.info("Retrieved test=" + result.get("test"));
            }
        };

        // A handler only works when it is attached to both ends of the procedure.
        handler.register(processor, pipeline);

        Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
    }
}
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
import java.util.UUID;
/**
 * A PatternHandler is in charge of both page extraction and result handling for the urls
 * that match its pattern. Subclasses supply the two steps by implementing
 * {@link #onExtract(Page)} and {@link #onHandle(ResultItems, Task)}.
 *
 * User: Sebastian MA
 * Date: April 03, 2014
 * Time: 10:00
 */
public abstract class PatternHandler {

    /**
     * Identity of the handler. It is also the field name under which a page is marked as
     * extracted by this handler.
     */
    protected String id;

    /**
     * Match pattern (a regular expression). Only pages whose url matches entirely are handled.
     */
    protected String pattern;

    /**
     * Compiled form of {@link #pattern}, created lazily by {@link #match(String)}. It records
     * the source string it was built from so that a later change of {@link #pattern} is
     * detected and the expression is recompiled.
     */
    private volatile CompiledPattern compiled;

    /**
     * Creates a handler for the given url pattern and assigns it a random UUID as identity.
     *
     * @param pattern
     *         url pattern (regular expression) to handle
     */
    protected PatternHandler(String pattern) {
        this.pattern = pattern;
        this.id = UUID.randomUUID().toString();
    }

    /**
     * Determines if the page should be handled.
     *
     * @param url
     *         the url to test; {@code null} never matches
     *
     * @return true if the whole url matches {@link #pattern}
     */
    public boolean match(String url) {
        if (url == null) {
            return false;
        }
        // String#matches would compile the expression on every call; reuse the compiled
        // pattern for as long as the pattern string has not changed.
        final String source = pattern;
        CompiledPattern cached = compiled;
        if (cached == null || !cached.source.equals(source)) {
            cached = new CompiledPattern(source);
            compiled = cached;
        }
        return cached.regex.matcher(url).matches();
    }

    /**
     * Registers to both the page processor and the pipeline so the handler can take charge of
     * both ends of the procedure.
     *
     * @param processor
     *         the processor to handle
     * @param pipeline
     *         the pipeline to handle
     */
    public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
        processor.addHandler(this);
        pipeline.addHandler(this);
    }

    /**
     * Removes this handler from both the page processor and the pipeline.
     *
     * @param processor
     *         the processor to detach from
     * @param pipeline
     *         the pipeline to detach from
     */
    public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
        processor.removeHandler(this);
        pipeline.removeHandler(this);
    }

    /**
     * Extracts from the page if its url matches the pattern.
     *
     * @param page
     *         the page to extract
     *
     * @return true if the page matched and {@link #onExtract(Page)} was invoked
     */
    public boolean process(Page page) {
        if (!match(page.getUrl().toString())) {
            return false;
        }
        // Mark the page under this handler's id; process(ResultItems, Task) looks the marker
        // up before it hands the result to onHandle.
        page.putField(id, true);
        onExtract(page);
        return true;
    }

    /**
     * Handles an extraction result if it is not skipped, its request url matches the pattern
     * and it carries the marker stored by {@link #process(Page)}.
     *
     * @param resultItems
     *         extraction result
     * @param task
     *         the task passed on to {@link #onHandle(ResultItems, Task)}
     *
     * @return true if {@link #onHandle(ResultItems, Task)} was invoked
     */
    public boolean process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return false;
        }
        if (match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
            onHandle(resultItems, task);
            return true;
        }
        return false;
    }

    /**
     * Implement this method to extract from the page.
     *
     * @param page
     *         the page to extract
     */
    public abstract void onExtract(Page page);

    /**
     * Implement this method to handle the extraction result.
     *
     * @param result
     *         extraction result
     * @param task
     *         the task the result was passed in with
     */
    public abstract void onHandle(ResultItems result, Task task);

    /**
     * Immutable pairing of a pattern string with its compiled expression. All fields are
     * final, so an instance can be shared between threads through the volatile field above.
     */
    private static final class CompiledPattern {

        final String source;

        final java.util.regex.Pattern regex;

        CompiledPattern(String source) {
            // Pattern.compile rejects null and malformed expressions with the same
            // exceptions String#matches would raise.
            this.regex = java.util.regex.Pattern.compile(source);
            this.source = source;
        }
    }
}
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import java.util.ArrayList;
/**
 * A pipeline that passes every result on to its registered {@link PatternHandler}s, in the
 * order in which they were added.
 *
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 20:44
 */
public class PatternPipeline implements Pipeline {

    /** Handlers consulted for every result, in insertion order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * Appends a handler to this pipeline.
     * <p>
     * A handler works only if it is added to BOTH the page processor and the pipeline, so
     * prefer PatternHandler's register over calling this method directly.
     *
     * @param handler the pattern handler
     *
     * @see PatternHandler#register
     */
    public void addHandler(PatternHandler handler) {
        this.handlers.add(handler);
    }

    /**
     * Removes a previously added handler from this pipeline.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        this.handlers.remove(handler);
    }

    /**
     * Offers the result to each registered handler in turn; every handler decides for itself
     * whether to act on it, and the handlers' return values are ignored.
     *
     * @param resultItems the extraction result
     * @param task        the task passed on to each handler
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        for (PatternHandler registered : this.handlers) {
            registered.process(resultItems, task);
        }
    }
}
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * A PatternPageProcessor uses PatternHandlers to set up extraction rules for specific url
 * patterns.
 *
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 15:36
 *
 * @see us.codecraft.webmagic.handler.PatternHandler
 */
public class PatternPageProcessor implements PageProcessor {

    /**
     * Wildcard target pattern which the constructor turns into the link expression
     * {@code (http://[^"'#]*)}.
     */
    public static final String TARGET_PATTERN_ALL = "http://*";

    /** Site settings built from the start url. */
    protected Site site;

    /** Regular expression, derived from the wildcard pattern, used to select links. */
    protected String targetPattern;

    /** Handlers offered every processed page, in insertion order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * Creates a processor that starts at {@code startUrl} and selects the links matching
     * {@code targetPattern}.
     *
     * @param startUrl
     *         the url to start from; its domain becomes the domain of the site
     * @param targetPattern
     *         wildcard pattern of the links to select: {@code .} is taken literally and
     *         {@code *} stands for any run of characters other than {@code "}, {@code '}
     *         and {@code #}
     */
    public PatternPageProcessor(String startUrl, String targetPattern) {
        this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));
        // Only the converted expression is stored; the raw wildcard form is never read.
        this.targetPattern = toLinkRegex(targetPattern);
        site.setUserAgent("Chrome/5.0.354.0");
    }

    /**
     * Converts a wildcard pattern into the capturing regular expression used on page links.
     */
    private static String toLinkRegex(String wildcardPattern) {
        return "(" + wildcardPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
    }

    /**
     * Passes the links of the page that match the target pattern to
     * {@code page.addTargetRequests}, then offers the page to every registered handler.
     *
     * @param page
     *         the page to process
     */
    @Override
    public void process(Page page) {
        List<String> requests = page.getHtml().links().regex(targetPattern).all();
        page.addTargetRequests(requests);
        for (PatternHandler handler : handlers) {
            // PatternHandler#process(Page) tests the url itself and does nothing on a
            // mismatch, so the page is handed over without a separate match call.
            handler.process(page);
        }
    }

    /**
     * Appends a handler to this processor.
     * <p>
     * A handler works only if it is added to BOTH the page processor and the pipeline, so
     * prefer PatternHandler's register over calling this method directly.
     *
     * @param handler the pattern handler
     *
     * @see PatternHandler#register
     */
    public void addHandler(PatternHandler handler) {
        handlers.add(handler);
    }

    /**
     * Removes a previously added handler from this processor.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        handlers.remove(handler);
    }

    /**
     * @return the site settings built in the constructor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment