Commit 644e8d1f authored by 愤怒的番茄's avatar 愤怒的番茄

同步官方源码

parent 610ac42c
...@@ -78,7 +78,6 @@ public class GithubRepoPageProcessor implements PageProcessor { ...@@ -78,7 +78,6 @@ public class GithubRepoPageProcessor implements PageProcessor {
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
} }
} }
``` ```
* `page.addTargetRequests(links)` * `page.addTargetRequests(links)`
...@@ -164,6 +163,10 @@ To write webmagic, I referred to the projects below : ...@@ -164,6 +163,10 @@ To write webmagic, I referred to the projects below :
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
QQ Group: 330192938
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
<mockup version="1.0" skin="sketch" fontFace="Balsamiq Sans" measuredW="1154" measuredH="470" mockupW="709" mockupH="470">
<controls>
<control controlID="0" controlTypeID="com.balsamiq.mockups::BrowserWindow" x="445" y="0" w="709" h="470" measuredW="450" measuredH="400" zOrder="0" locked="false" isInGroup="-1">
<controlProperties>
<text>A%20Web%20Page%0Ahttp%3A//</text>
</controlProperties>
</control>
</controls>
</mockup>
\ No newline at end of file
...@@ -182,6 +182,15 @@ ...@@ -182,6 +182,15 @@
<encoding>UTF-8</encoding> <encoding>UTF-8</encoding>
</configuration> </configuration>
</plugin> </plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<excludes>
<exclude>log4j.xml</exclude>
</excludes>
</configuration>
</plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId> <artifactId>maven-source-plugin</artifactId>
......
<html>
<head>
<script src=""></script>
</head>
<div class="url-box">
<input type="text" id="url-input">
</div>
<div class="content-show">
</div>
</html>
\ No newline at end of file
...@@ -74,7 +74,7 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -74,7 +74,7 @@ public class HttpClientDownloader extends AbstractDownloader {
} else { } else {
acceptStatCode = Sets.newHashSet(200); acceptStatCode = Sets.newHashSet(200);
} }
logger.info("downloading page " + request.getUrl()); logger.info("downloading page {}" , request.getUrl());
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
if (headers != null) { if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
......
...@@ -23,8 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { ...@@ -23,8 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler {
@Override @Override
public void push(Request request, Task task) { public void push(Request request, Task task) {
logger.debug("push to queue " + request.getUrl()); logger.trace("get a candidate url {}", request.getUrl());
if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task); pushWhenNoDuplicate(request, task);
} }
} }
......
...@@ -131,6 +131,7 @@ public class Html extends PlainText { ...@@ -131,6 +131,7 @@ public class Html extends PlainText {
} }
public Document getDocument() { public Document getDocument() {
initDocument();
return document; return document;
} }
......
...@@ -8,21 +8,11 @@ ...@@ -8,21 +8,11 @@
</layout> </layout>
</appender> </appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false"> <logger name="org.apache" additivity="false">
<level value="warn" /> <level value="warn" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
</logger> </logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root> <root>
<level value="info" /> <level value="info" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
......
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Page processor whose extraction logic is described by a list of {@link ExtractRule}s
 * rather than by hand-written code.
 * <p>
 * Each rule's selector is applied to the page's HTML and the outcome is stored in the
 * page's result items under the rule's field name. When a rule is flagged not-null and
 * yields nothing, the page is marked as skipped and nothing is stored for that rule;
 * the remaining rules are still evaluated.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ConfigurablePageProcessor implements PageProcessor {

    private Site site;

    private List<ExtractRule> extractRules;

    /**
     * @param site         site configuration handed back by {@link #getSite()}
     * @param extractRules rules evaluated, in list order, for every processed page
     */
    public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
        this.site = site;
        this.extractRules = extractRules;
    }

    /**
     * Runs every configured rule against the page.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        for (ExtractRule rule : extractRules) {
            if (rule.isMulti()) {
                extractAll(page, rule);
            } else {
                extractOne(page, rule);
            }
        }
    }

    /** Applies a multi-valued rule; an empty list counts as "missing" for not-null rules. */
    private void extractAll(Page page, ExtractRule rule) {
        List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
        if (rule.isNotNull() && values.size() == 0) {
            page.setSkip(true);
            return;
        }
        page.getResultItems().put(rule.getFieldName(), values);
    }

    /** Applies a single-valued rule; {@code null} counts as "missing" for not-null rules. */
    private void extractOne(Page page, ExtractRule rule) {
        String value = page.getHtml().selectDocument(rule.getSelector());
        if (rule.isNotNull() && value == null) {
            page.setSkip(true);
            return;
        }
        page.getResultItems().put(rule.getFieldName(), value);
    }

    @Override
    public Site getSite() {
        return site;
    }
}
package us.codecraft.webmagic.configurable;
/**
 * Kinds of extraction expression an {@code ExtractRule} can carry.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public enum ExpressionType {
    /** The expression is an XPath expression. */
    XPath,
    /** The expression is a regular expression. */
    Regex,
    /** The expression is a CSS selector. */
    Css,
    /** The expression is a JsonPath expression. */
    JsonPath
}
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Declarative description of one field to extract from a page: the result field name,
 * the expression (type, value and optional parameters) and two flags controlling
 * multiplicity and whether an empty result is acceptable.
 * <p>
 * The {@link Selector} is compiled lazily from the expression on the first call to
 * {@link #getSelector()} and then cached. Changing the expression afterwards does not
 * invalidate the cached selector; use {@link #setSelector(Selector)} to replace it.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class ExtractRule {

    /** Key under which the extracted value is stored. */
    private String fieldName;

    /** Kind of expression held in {@link #expressionValue}; {@code null} is treated as XPath. */
    private ExpressionType expressionType;

    /** The expression text itself. */
    private String expressionValue;

    /** Optional extra arguments for the expression; may be {@code null} or empty. */
    private String[] expressionParams;

    /** Whether the rule yields a list of values instead of a single value. */
    private boolean multi = false;

    /** Lazily compiled selector; volatile so the unsynchronized fast-path read is safe. */
    private volatile Selector selector;

    /** Whether an empty result should be treated as a failure by the caller. */
    private boolean notNull = false;

    public String getFieldName() {
        return fieldName;
    }

    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    public ExpressionType getExpressionType() {
        return expressionType;
    }

    public void setExpressionType(ExpressionType expressionType) {
        this.expressionType = expressionType;
    }

    public String getExpressionValue() {
        return expressionValue;
    }

    public void setExpressionValue(String expressionValue) {
        this.expressionValue = expressionValue;
    }

    public String[] getExpressionParams() {
        return expressionParams;
    }

    public void setExpressionParams(String[] expressionParams) {
        this.expressionParams = expressionParams;
    }

    public boolean isMulti() {
        return multi;
    }

    public void setMulti(boolean multi) {
        this.multi = multi;
    }

    /**
     * Returns the selector for this rule, compiling it from the expression on first use.
     *
     * @return the cached or freshly compiled selector
     * @throws NumberFormatException if the rule is a Regex rule whose first parameter is
     *                               not a valid integer
     */
    public Selector getSelector() {
        if (selector == null) {
            synchronized (this) {
                // Re-check under the lock so only one thread compiles the selector.
                if (selector == null) {
                    selector = compileSelector();
                }
            }
        }
        return selector;
    }

    /**
     * Builds the selector that corresponds to the configured expression.
     * <p>
     * Css uses the first parameter, when present, as the second argument of {@code $};
     * Regex parses the first parameter, when present, as an integer passed to {@code regex}.
     * Missing (null or empty) parameters select the one-argument form. An unset expression
     * type falls back to XPath, matching the {@code default} branch.
     */
    private Selector compileSelector() {
        if (expressionType == null) {
            // switch on a null enum would throw; XPath is the documented fallback.
            return xpath(expressionValue);
        }
        switch (expressionType) {
            case Css:
                if (hasExpressionParam()) {
                    return $(expressionValue, expressionParams[0]);
                }
                return $(expressionValue);
            case XPath:
                return xpath(expressionValue);
            case Regex:
                if (hasExpressionParam()) {
                    return regex(expressionValue, Integer.parseInt(expressionParams[0]));
                }
                return regex(expressionValue);
            case JsonPath:
                return new JsonPathSelector(expressionValue);
            default:
                return xpath(expressionValue);
        }
    }

    /** Tells whether at least one expression parameter was supplied. */
    private boolean hasExpressionParam() {
        return expressionParams != null && expressionParams.length >= 1;
    }

    /**
     * Replaces the cached selector, bypassing compilation from the expression.
     *
     * @param selector the selector to use; {@code null} makes the next
     *                 {@link #getSelector()} call compile one again
     */
    public void setSelector(Selector selector) {
        this.selector = selector;
    }

    public boolean isNotNull() {
        return notNull;
    }

    public void setNotNull(boolean notNull) {
        this.notNull = notNull;
    }
}
package us.codecraft.webmagic.configurable;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Field-level marker annotation that is retained at runtime.
 * <p>
 * NOTE(review): {@code value} is presumably the key of the property to inject into the
 * annotated field; nothing visible here consumes the annotation yet, so confirm against
 * the loader once it is implemented.
 *
 * @author yihua.huang@dianping.com
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface Inject {

    /**
     * @return the name associated with the annotated field; empty string when unspecified
     */
    String value() default "";
}
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Map;
/**
 * Intended to inject properties into an object's fields marked with {@link Inject}.
 * <p>
 * The current implementation is a placeholder: it performs no injection and hands the
 * object back untouched.
 *
 * @param <T> type of the object being populated
 * @author yihua.huang@dianping.com
 */
public class PropertyLoader<T> {

    /**
     * Returns the given object as-is; {@code properties} is not read at present.
     *
     * @param object     the object that should receive the properties
     * @param properties property values keyed by name (currently ignored)
     * @return the very same {@code object} reference that was passed in
     */
    public T load(T object, Map<String, String> properties) {
        final T target = object;
        return target;
    }
}
package us.codecraft.webmagic.example;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.configurable.Inject;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Example blog crawler whose link pattern and XPath expressions are held in fields
 * marked with {@link Inject} instead of being hard-coded.
 * <p>
 * NOTE(review): nothing in this class assigns the annotated fields, so they are
 * presumably meant to be populated externally before {@link #process(Page)} runs — confirm.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ConfigurableBlogPageProcessor implements PageProcessor {

    private Site site = Site.me().setDomain("my.oschina.net");

    /** Regex applied to the page's links to pick the URLs to crawl next. */
    @Inject("linkRegex")
    private String linkRegex;

    /** XPath used to extract the "title" field. */
    @Inject("titleXpath")
    private String titleXpath;

    /** Annotated for injection; not read by {@link #process(Page)}. */
    @Inject("contentXpath")
    private String contentXpath;

    /** XPath used to extract the "tags" field. */
    @Inject("tagsXpath")
    private String tagsXpath;

    /**
     * Queues the matching links, then stores the title, content and tags of the page.
     * A page without a title is flagged as skipped; the other fields are still stored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex(linkRegex).all());

        String title = page.getHtml().xpath(titleXpath).toString();
        page.putField("title", title);
        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            // skip this page
            page.setSkip(true);
        }

        page.putField("content", page.getHtml().smartContent().toString());
        page.putField("tags", page.getHtml().xpath(tagsXpath).all());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        ConfigurableBlogPageProcessor processor = new ConfigurableBlogPageProcessor();
        Spider.create(processor)
                .addUrl("http://my.oschina.net/flashsword/blog")
                .thread(2)
                .run();
    }
}
package us.codecraft.webmagic.example;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
/**
 * Demo showing how a {@link PatternHandler} is wired to both a
 * {@link PatternPageProcessor} and a {@link PatternPipeline}.
 * <p>
 * Created with IntelliJ IDEA.
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 21:23
 */
public class PatternProcessorDemo {

    private static Logger log = Logger.getLogger(PatternProcessorDemo.class);

    public static void main(String... args) {
        PatternPageProcessor pageProcessor = new PatternPageProcessor(
                "http://item.jd.com/981821.html",
                PatternPageProcessor.TARGET_PATTERN_ALL);
        PatternPipeline resultPipeline = new PatternPipeline();

        // The handler must be attached to both ends; register() does exactly that.
        PatternHandler itemHandler = createItemHandler();
        itemHandler.register(pageProcessor, resultPipeline);

        Spider.create(pageProcessor).thread(5).addPipeline(resultPipeline).runAsync();
    }

    /**
     * Builds a handler which handles only "http://item.jd.com/.*": it stores a fixed test
     * field during extraction and logs it back when the result is handled.
     */
    private static PatternHandler createItemHandler() {
        return new PatternHandler("http://item.jd.com/.*") {

            @Override
            public void onExtract(Page page) {
                log.info("Extracting from " + page.getUrl());
                page.putField("test", "hello world:)");
            }

            @Override
            public void onHandle(ResultItems result, Task task) {
                log.info("Handling " + result.getRequest().getUrl());
                log.info("Retrieved test=" + result.get("test"));
            }
        };
    }
}
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
 * Page processor that delegates to an ordered list of {@link SubPageProcessor}s.
 * <p>
 * For every page the sub-processors are tried in order. The first one whose
 * {@code match} accepts the page processes it; further sub-processors are consulted
 * only while each processing call answers {@link SubPageProcessor.MatchOtherProcessor#YES}.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class CompositePageProcessor implements PageProcessor {

    private Site site;

    /** Never null: starts empty so {@link #process(Page)} is safe before any setter call. */
    private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();

    /**
     * Dispatches the page to the matching sub-processors.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        for (SubPageProcessor subPageProcessor : subPageProcessors) {
            if (!subPageProcessor.match(page)) {
                continue;
            }
            SubPageProcessor.MatchOtherProcessor matchOther = subPageProcessor.process(page);
            // Anything but an explicit YES (including null) ends the chain.
            if (matchOther != SubPageProcessor.MatchOtherProcessor.YES) {
                return;
            }
        }
    }

    /**
     * @param site site configuration handed back by {@link #getSite()}
     * @return this processor, for chaining
     */
    public CompositePageProcessor setSite(Site site) {
        this.site = site;
        return this;
    }

    /**
     * Replaces the sub-processors with the given ones, keeping their order.
     *
     * @param subPageProcessors the sub-processors to use; a {@code null} array clears the list
     * @return this processor, for chaining
     */
    public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
        // Fill a local list first so the field never exposes a half-built list.
        List<SubPageProcessor> replacement = new ArrayList<SubPageProcessor>();
        if (subPageProcessors != null) {
            for (SubPageProcessor subPageProcessor : subPageProcessors) {
                replacement.add(subPageProcessor);
            }
        }
        this.subPageProcessors = replacement;
        return this;
    }

    @Override
    public Site getSite() {
        return site;
    }
}
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
import java.util.UUID;
/**
 * Created with IntelliJ IDEA.
 * User: Sebastian MA
 * Date: April 03, 2014
 * Time: 10:00
 * <p>
 * A PatternHandler takes care of both page extraction and result handling for the URLs
 * matching its pattern; subclasses supply the two steps through {@link #onExtract(Page)}
 * and {@link #onHandle(ResultItems, Task)}.
 */
public abstract class PatternHandler {

    /**
     * Identity of the handler: a random UUID string, also used as the result-item key
     * that marks a page as extracted by this handler.
     */
    protected String id;

    /**
     * Regular expression a URL must fully match for the page to be handled.
     */
    protected String pattern;

    /**
     * @param pattern url pattern to handle
     */
    protected PatternHandler(String pattern) {
        this.pattern = pattern;
        this.id = UUID.randomUUID().toString();
    }

    /**
     * Determines whether a page should be handled.
     *
     * @param url the URL to test
     * @return {@code true} when the whole URL matches the handler's pattern
     */
    public boolean match(String url) {
        return url.matches(pattern);
    }

    /**
     * Adds this handler to both the page processor and the pipeline, so that it takes
     * charge of both ends of the procedure.
     *
     * @param processor the processor to attach to
     * @param pipeline  the pipeline to attach to
     */
    public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
        processor.addHandler(this);
        pipeline.addHandler(this);
    }

    /**
     * Removes this handler from both the page processor and the pipeline.
     *
     * @param processor the processor to detach from
     * @param pipeline  the pipeline to detach from
     */
    public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
        processor.removeHandler(this);
        pipeline.removeHandler(this);
    }

    /**
     * Extraction step: when the page URL matches, tags the page with this handler's id
     * and invokes {@link #onExtract(Page)}.
     *
     * @param page the page to extract from
     * @return {@code true} when the page matched and was extracted
     */
    public boolean process(Page page) {
        if (!match(page.getUrl().toString())) {
            return false;
        }
        // Leave a marker so the pipeline side can tell this handler did the extraction.
        page.putField(id, true);
        onExtract(page);
        return true;
    }

    /**
     * Handling step: invokes {@link #onHandle(ResultItems, Task)} for results that are not
     * skipped, whose request URL matches and which carry this handler's marker.
     *
     * @param resultItems the extraction result
     * @param task        the task the result belongs to
     * @return {@code true} when the result was handled
     */
    public boolean process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return false;
        }
        boolean mine = match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null;
        if (mine) {
            onHandle(resultItems, task);
        }
        return mine;
    }

    /**
     * Implement this method to extract from the page.
     *
     * @param page the page to extract
     */
    public abstract void onExtract(Page page);

    /**
     * Implement this method to handle the extraction result.
     *
     * @param result extraction result
     * @param task   the task the result belongs to
     */
    public abstract void onHandle(ResultItems result, Task task);
}
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
/**
 * One link of a processor chain: decides whether it can handle a page and, if so,
 * processes it and tells the caller whether other processors may be tried afterwards.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public interface SubPageProcessor {

    /**
     * Checks whether this SubPageProcessor can process the page.<br>
     * Please DO NOT change page status in this method.
     *
     * @param page the page to inspect
     * @return {@code true} when this processor wants to process the page
     */
    boolean match(Page page);

    /**
     * Processes the page: extracts the URLs to fetch, extracts the data and stores it.
     *
     * @param page the page to process
     * @return whether matching should continue with other processors
     */
    MatchOtherProcessor process(Page page);

    /**
     * Answer of {@link #process(Page)}: {@code YES} to keep matching other processors,
     * {@code NO} to stop.
     */
    enum MatchOtherProcessor {
        YES,
        NO
    }
}
...@@ -25,8 +25,6 @@ class ModelPageProcessor implements PageProcessor { ...@@ -25,8 +25,6 @@ class ModelPageProcessor implements PageProcessor {
private Site site; private Site site;
private Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();
public static ModelPageProcessor create(Site site, Class... clazzs) { public static ModelPageProcessor create(Site site, Class... clazzs) {
ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
for (Class clazz : clazzs) { for (Class clazz : clazzs) {
...@@ -38,8 +36,6 @@ class ModelPageProcessor implements PageProcessor { ...@@ -38,8 +36,6 @@ class ModelPageProcessor implements PageProcessor {
public ModelPageProcessor addPageModel(Class clazz) { public ModelPageProcessor addPageModel(Class clazz) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
pageModelExtractorList.add(pageModelExtractor); pageModelExtractorList.add(pageModelExtractor);
return this; return this;
} }
...@@ -55,11 +51,14 @@ class ModelPageProcessor implements PageProcessor { ...@@ -55,11 +51,14 @@ class ModelPageProcessor implements PageProcessor {
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
Object process = pageModelExtractor.process(page); Object process = pageModelExtractor.process(page);
if (process == null || (process instanceof List && ((List) process).size() == 0)) { if (process == null || (process instanceof List && ((List) process).size() == 0)) {
page.getResultItems().setSkip(true); continue;
} }
postProcessPageModel(pageModelExtractor.getClazz(), process); postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
} }
if (page.getResultItems().getAll().size() == 0) {
page.getResultItems().setSkip(true);
}
} }
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) { private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
......
...@@ -340,9 +340,7 @@ class PageModelExtractor { ...@@ -340,9 +340,7 @@ class PageModelExtractor {
private Object convert(String value, ObjectFormatter objectFormatter) { private Object convert(String value, ObjectFormatter objectFormatter) {
try { try {
Object format = objectFormatter.format(value); Object format = objectFormatter.format(value);
if (logger.isDebugEnabled()) { logger.debug("String {} is converted to {}", value, format);
logger.debug("String " + value + " is converted to " + format);
}
return format; return format;
} catch (Exception e) { } catch (Exception e) {
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
......
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import java.util.ArrayList;
/**
 * Pipeline that forwards every result to its registered {@link PatternHandler}s.
 * <p>
 * Created with IntelliJ IDEA.
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 20:44
 */
public class PatternPipeline implements Pipeline {

    /** Registered handlers, consulted in insertion order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * Registers a handler with this pipeline only.
     * A handler works only if it is added to BOTH the page processor and the pipeline,
     * so prefer PatternHandler's register instead.
     *
     * @param handler the pattern handler
     * @see PatternHandler#register
     */
    public void addHandler(PatternHandler handler) {
        this.handlers.add(handler);
    }

    /**
     * Removes a previously added handler from this pipeline.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        this.handlers.remove(handler);
    }

    /**
     * Offers the result to every handler; each handler decides by itself whether the
     * result concerns it.
     *
     * @param resultItems the extraction result
     * @param task        the task the result belongs to
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        for (PatternHandler each : this.handlers) {
            each.process(resultItems, task);
        }
    }
}
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Created with IntelliJ IDEA.
 * User: Sebastian MA
 * Date: April 04, 2014
 * Time: 15:36
 * <p>
 * A PatternPageProcessor uses PatternHandlers to set up extraction rules for specific
 * url patterns.
 *
 * @see us.codecraft.webmagic.handler.PatternHandler
 */
public class PatternPageProcessor implements PageProcessor {

    /** Target pattern accepting every link that starts with {@code http://}. */
    public static final String TARGET_PATTERN_ALL = "http://*";

    protected Site site;

    /** Regex, derived from the constructor's wildcard pattern, selecting the links to follow. */
    protected String targetPattern;

    /** Registered handlers, consulted in insertion order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * @param startUrl      URL the crawl starts from; its domain becomes the site domain
     * @param targetPattern wildcard pattern of the links to follow: every {@code .} is
     *                      taken literally and every {@code *} stands for a run of
     *                      characters other than {@code "}, {@code '} and {@code #}
     */
    public PatternPageProcessor(String startUrl, String targetPattern) {
        this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));

        String escapedDots = targetPattern.replace(".", "\\.");
        String wildcardsExpanded = escapedDots.replace("*", "[^\"'#]*");
        this.targetPattern = "(" + wildcardsExpanded + ")";

        this.site.setUserAgent("Chrome/5.0.354.0");
    }

    /**
     * Queues the links matching the target pattern, then lets every handler whose pattern
     * matches the page URL extract from the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex(targetPattern).all());

        for (PatternHandler candidate : handlers) {
            if (!candidate.match(page.getUrl().toString())) {
                continue;
            }
            candidate.process(page);
        }
    }

    /**
     * Registers a handler with this processor only.
     * A handler works only if it is added to BOTH the page processor and the pipeline,
     * so prefer PatternHandler's register instead.
     *
     * @param handler the pattern handler
     * @see PatternHandler#register
     */
    public void addHandler(PatternHandler handler) {
        this.handlers.add(handler);
    }

    /**
     * Removes a previously added handler from this processor.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        this.handlers.remove(handler);
    }

    @Override
    public Site getSite() {
        return site;
    }
}
...@@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger; ...@@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
public class FileCacheQueueScheduler implements Scheduler { public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler {
private Logger logger = LoggerFactory.getLogger(getClass()); private Logger logger = LoggerFactory.getLogger(getClass());
...@@ -145,20 +145,14 @@ public class FileCacheQueueScheduler implements Scheduler { ...@@ -145,20 +145,14 @@ public class FileCacheQueueScheduler implements Scheduler {
} }
@Override @Override
public synchronized void push(Request request, Task task) { protected void pushWhenNoDuplicate(Request request, Task task) {
if (!inited.get()) { if (!inited.get()) {
init(task); init(task);
} }
if (logger.isDebugEnabled()) {
logger.debug("push to queue " + request.getUrl());
}
if (urls.add(request.getUrl())) {
queue.add(request); queue.add(request);
fileUrlWriter.println(request.getUrl()); fileUrlWriter.println(request.getUrl());
} }
}
@Override @Override
public synchronized Request poll(Task task) { public synchronized Request poll(Task task) {
if (!inited.get()) { if (!inited.get()) {
......
package us.codecraft.webmagic.configurable;
import org.junit.Test;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import java.util.ArrayList;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Checks that {@link ConfigurablePageProcessor} extracts the configured XPath fields
 * from a page served by {@link MockGithubDownloader}.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class ConfigurablePageProcessorTest {

    @Test
    public void test() throws Exception {
        List<ExtractRule> rules = new ArrayList<ExtractRule>();
        rules.add(xpathRule("title", "//title"));
        rules.add(xpathRule("star",
                "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"));

        ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
        ResultItems extracted = Spider.create(processor)
                .setDownloader(new MockGithubDownloader())
                .get("https://github.com/code4craft/webmagic");

        assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic &middot; GitHub</title>");
        assertThat(extracted.getAll()).containsEntry("star", " 86 ");
    }

    /** Builds a single-valued XPath rule storing its result under {@code fieldName}. */
    private static ExtractRule xpathRule(String fieldName, String expression) {
        ExtractRule rule = new ExtractRule();
        rule.setExpressionType(ExpressionType.XPath);
        rule.setExpressionValue(expression);
        rule.setFieldName(fieldName);
        return rule;
    }
}
package us.codecraft.webmagic.model;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.selector.PlainText;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Verifies that a page matched by one of several registered models is not skipped
 * just because another model extracts nothing from it.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-4
 */
public class ModelPageProcessorTest {

    /** Model bound to the {@code /foo} URL; requires the {@code foo} attribute of a div. */
    @TargetUrl("http://codecraft.us/foo")
    public static class ModelFoo {

        @ExtractBy(value = "//div/@foo", notNull = true)
        private String foo;

    }

    /** Model bound to the {@code /bar} URL; requires the {@code bar} attribute of a div. */
    @TargetUrl("http://codecraft.us/bar")
    public static class ModelBar {

        @ExtractBy(value = "//div/@bar", notNull = true)
        private String bar;

    }

    @Test
    public void testMultiModel_should_not_skip_when_match() throws Exception {
        // A page that only carries the attribute ModelFoo is looking for.
        Page fooPage = new Page();
        fooPage.setRawText("<div foo='foo'></div>");
        fooPage.setRequest(new Request("http://codecraft.us/foo"));
        fooPage.setUrl(PlainText.create("http://codecraft.us/foo"));

        ModelPageProcessor processor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class);
        processor.process(fooPage);

        assertThat(fooPage.getResultItems().isSkip()).isFalse();
    }
}
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
/**
 * Sample page model for a Baidu news search result: the title link text and the
 * summary text are extracted through the annotated XPath expressions.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-9
 */
public class BaiduNews {

    @ExtractBy("//h3[@class='c-title']/a/text()")
    private String name;

    @ExtractBy("//div[@class='c-summary']/text()")
    private String description;

    public String getName() {
        return name;
    }

    public String getDescription() {
        return description;
    }

    @Override
    public String toString() {
        return "BaiduNews{name='" + name + "', description='" + description + "'}";
    }

    /**
     * Downloads one search result page, prints the extracted model and closes the spider.
     */
    public static void main(String[] args) {
        OOSpider spider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class);
        //single download
        BaiduNews news = spider.<BaiduNews>get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient");
        System.out.println(news);
        spider.close();
    }
}
\ No newline at end of file
...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.model.samples; ...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
...@@ -26,9 +25,8 @@ public class News163 implements MultiPageModel { ...@@ -26,9 +25,8 @@ public class News163 implements MultiPageModel {
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
private String page; private String page;
@ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"), @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)"
@ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)}, , multi = true, notNull = false)
multi = true, notNull = false)
private List<String> otherPage; private List<String> otherPage;
@ExtractBy("//h1[@id=\"h1title\"]/text()") @ExtractBy("//h1[@id=\"h1title\"]/text()")
...@@ -74,8 +72,8 @@ public class News163 implements MultiPageModel { ...@@ -74,8 +72,8 @@ public class News163 implements MultiPageModel {
} }
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
.scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run(); .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
} }
} }
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Sample page model for the meishi.qq.com promotion list: the class-level
 * {@code ExtractBy} selects the list items (multi-valued) and the field-level
 * expressions pick the shop name and promotion text.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-11
 */
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li", multi = true)
public class QQMeishi {

    @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()")
    private String shopName;

    @ExtractBy("//div[@class=info]/a[@class=title]/text()")
    private String promo;

    /**
     * Crawls the promotion list with four threads, printing every extracted model
     * through a {@link ConsolePageModelPipeline}.
     */
    public static void main(String[] args) {
        Site site = Site.me();
        ConsolePageModelPipeline consolePipeline = new ConsolePageModelPipeline();
        OOSpider.create(site, consolePipeline, QQMeishi.class)
                .addUrl("http://meishi.qq.com/beijing/c/all")
                .thread(4)
                .run();
    }
}
...@@ -5,7 +5,7 @@ import org.junit.Test; ...@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser; import us.codecraft.webmagic.samples.SinaBlogProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException; import java.io.IOException;
...@@ -20,7 +20,7 @@ public class SinablogProcessorTest { ...@@ -20,7 +20,7 @@ public class SinablogProcessorTest {
@Ignore @Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor();
//pipeline是抓取结束后的处理 //pipeline是抓取结束后的处理
//默认放到/data/webmagic/ftl/[domain]目录下 //默认放到/data/webmagic/ftl/[domain]目录下
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
...@@ -29,7 +29,7 @@ public class SinablogProcessorTest { ...@@ -29,7 +29,7 @@ public class SinablogProcessorTest {
//ConsolePipeline输出结果到控制台 //ConsolePipeline输出结果到控制台
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行 //Spider.run()执行
Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run(); run();
} }
} }
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -16,6 +16,10 @@ ...@@ -16,6 +16,10 @@
<artifactId>jruby</artifactId> <artifactId>jruby</artifactId>
<version>1.7.6</version> <version>1.7.6</version>
</dependency> </dependency>
<dependency><groupId>org.python</groupId>
<artifactId>jython</artifactId>
<version>2.5.3</version>
</dependency>
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>
......
...@@ -7,7 +7,9 @@ public enum Language { ...@@ -7,7 +7,9 @@ public enum Language {
JavaScript("javascript","js/defines.js",""), JavaScript("javascript","js/defines.js",""),
JRuby("jruby","ruby/defines.rb",""); JRuby("jruby","ruby/defines.rb",""),
Jython("jython","python/defines.py","");
private String engineName; private String engineName;
......
package us.codecraft.webmagic.scripts; package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash;
import org.python.core.PyDictionary;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -10,6 +12,8 @@ import javax.script.ScriptEngine; ...@@ -10,6 +12,8 @@ import javax.script.ScriptEngine;
import javax.script.ScriptException; import javax.script.ScriptException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
...@@ -50,20 +54,35 @@ public class ScriptProcessor implements PageProcessor { ...@@ -50,20 +54,35 @@ public class ScriptProcessor implements PageProcessor {
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try { try {
switch (language) {
case JavaScript:
engine.eval(defines + "\n" + script, context); engine.eval(defines + "\n" + script, context);
// switch (language) {
// case JavaScript:
// NativeObject o = (NativeObject) engine.get("result"); // NativeObject o = (NativeObject) engine.get("result");
// if (o != null) { // if (o != null) {
// for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) { // for (Object o1 : o.getIds()) {
// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); // String key = String.valueOf(o1);
// page.getResultItems().put(key, NativeObject.getProperty(o, key));
// } // }
// } // }
// break; break;
// case JRuby: case JRuby:
// Object o1 = engine.get("result"); RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context);
// break; Iterator itruby = oRuby.entrySet().iterator();
// } while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry) itruby.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
case Jython:
engine.eval(defines + "\n" + script, context);
PyDictionary oJython = (PyDictionary) engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry) it.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
}
} catch (ScriptException e) { } catch (ScriptException e) {
e.printStackTrace(); e.printStackTrace();
} }
...@@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor { ...@@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor {
} }
} }
@Override @Override
public Site getSite() { public Site getSite() {
return site; return site;
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -9,3 +9,4 @@ var config = { ...@@ -9,3 +9,4 @@ var config = {
title = $("div.BlogTitle h1"), title = $("div.BlogTitle h1"),
content = $("div.BlogContent") content = $("div.BlogContent")
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")
config;
File mode changed from 100644 to 100755
def xpath(str):
    """Apply the XPath expression ``str`` to the current page's HTML.

    Uses the global ``page`` object, which must be bound by the host before
    this helper is called, and returns ``toString()`` of the selection.
    """
    html = page.getHtml()
    selection = html.xpath(str)
    return selection.toString()
def css(str):
    """Apply the CSS selector ``str`` to the current page's HTML.

    Uses the global ``page`` object, which must be bound by the host before
    this helper is called, and returns ``toString()`` of the selection.
    """
    html = page.getHtml()
    selection = html.css(str)
    return selection.toString()
def urls(str):
    """Collect the page's links that match the regex ``str`` and pass them
    to ``page.addTargetRequests``.

    Uses the global ``page`` object, which must be bound by the host before
    this helper is called. Returns nothing.
    """
    page.addTargetRequests(page.getHtml().links().regex(str).all())
def tomap(key, value):
    """Placeholder helper: ignores ``key`` and ``value`` and always returns
    the literal string "hello world".
    """
    placeholder = "hello world"
    return placeholder
# Sample page script for my.oschina.net blog pages.
# It relies on the xpath() and urls() helpers being defined before this code
# runs, and leaves its output in the global ``result`` dict.
title = xpath("div[@class=BlogTitle]")

# Keep the pattern under its own name: assigning it to ``urls`` would rebind
# (shadow) the urls() helper, so the helper would never be invoked and no
# matching links would be handed to the page.
url_pattern = "http://my\\.oschina\\.net/flashsword/blog/\\d+"
urls(url_pattern)

# The extracted title and the link pattern are both reported to the host.
result = {"title": title, "urls": url_pattern}
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
urls "http://my\\.oschina\\.net/flashsword/blog/\\d+"
title = css "div.BlogTitle h1" title = css "div.BlogTitle h1"
content = css "div.BlogContent" content = css "div.BlogContent"
urls "http://my\\.oschina\\.net/flashsword/blog/\\d+"
\ No newline at end of file return {"title"=>title,"content"=>content}
...@@ -22,4 +22,12 @@ public class ScriptProcessorTest { ...@@ -22,4 +22,12 @@ public class ScriptProcessorTest {
pageProcessor.getSite().setSleepTime(0); pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
} }
@Test
public void testPythonProcessor() {
    // Build a script-backed processor from the bundled Jython sample script.
    ScriptProcessor jythonProcessor = ScriptProcessorBuilder.custom()
            .language(Language.Jython)
            .scriptFromClassPathFile("python/oschina.py")
            .build();

    // Set the site's sleep time to 0 for this run.
    jythonProcessor.getSite().setSleepTime(0);

    // Run the spider on the single start URL with spawnUrl switched off.
    Spider.create(jythonProcessor)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .setSpawnUrl(false)
            .run();
}
} }
File mode changed from 100644 to 100755
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment