Commit 0633ea16 authored by yihua.huang

add javascript support

parent df8ca8ad
package us.codecraft.webmagic.processor;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link PageProcessor} whose page-handling logic lives in a JavaScript file.
 *
 * <p>At construction time the shared helper definitions ({@code js/defines.js})
 * and the user script named by the constructor argument are both read from the
 * classpath. For every page, the page object is exposed to the script engine
 * under the name {@code page} and the helpers followed by the user script are
 * evaluated.
 *
 * @author code4crafter@gmail.com
 */
public class JsScriptProcessor implements PageProcessor{

    /** Classpath location of the helper functions prepended to every user script. */
    private static final String DEFINES_RESOURCE = "js/defines.js";

    /**
     * Charset used to decode script resources. Fixed so that decoding does not
     * depend on the platform default.
     * NOTE(review): assumes the bundled scripts are UTF-8 (or plain ASCII) - confirm.
     */
    private static final String SCRIPT_ENCODING = "UTF-8";

    private final ScriptEngine engine;

    private final String defines;

    private final String script;

    /**
     * Creates a processor that runs the given JavaScript resource for each page.
     *
     * @param filename classpath-relative path of the user script, e.g. {@code js/oschina.js}
     * @throws IllegalStateException if no engine is registered under the name
     *         {@code javascript}, or a script resource cannot be read
     * @throws IllegalArgumentException if a script resource is not on the classpath
     */
    public JsScriptProcessor(String filename){
        ScriptEngineManager manager = new ScriptEngineManager();
        engine = manager.getEngineByName("javascript");
        if (engine == null) {
            // getEngineByName returns null when no matching engine is installed.
            throw new IllegalStateException("No script engine registered for name: javascript");
        }
        defines = readResource(DEFINES_RESOURCE);
        script = readResource(filename);
    }

    /**
     * Reads a classpath resource fully into a string and closes the stream.
     */
    private String readResource(String resourceName) {
        InputStream in = this.getClass().getClassLoader().getResourceAsStream(resourceName);
        if (in == null) {
            throw new IllegalArgumentException("Script resource not found on classpath: " + resourceName);
        }
        try {
            return IOUtils.toString(in, SCRIPT_ENCODING);
        } catch (IOException e) {
            // Fail fast: continuing with a null script would evaluate the text "null".
            throw new IllegalStateException("Failed to read script resource: " + resourceName, e);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Binds {@code page} into the engine scope and evaluates the helper
     * definitions followed by the user script.
     *
     * <p>NOTE(review): the engine and its context are shared by all calls; if
     * pages are processed from several threads the {@code page} binding may be
     * overwritten concurrently - confirm how the caller invokes this method.
     *
     * @param page the page made available to the script as {@code page}
     */
    @Override
    public void process(Page page) {
        ScriptContext context = engine.getContext();
        context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
        try {
            // The newline keeps the two sources apart even when the helper file
            // has no trailing line break or ends in a line comment.
            engine.eval(defines + "\n" + script, context);
        } catch (ScriptException e) {
            // A script failure is reported and otherwise ignored, as in the
            // original code; it does not propagate to the caller.
            e.printStackTrace();
        }
    }

    /**
     * @return the site configuration produced by {@code Site.me()}
     */
    @Override
    public Site getSite() {
        return Site.me();
    }

    /**
     * Demo entry point: runs the bundled {@code js/oschina.js} script against a fixed start URL.
     */
    public static void main(String[] args) {
        Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run();
    }
}
...@@ -15,39 +15,39 @@ import java.io.InputStream; ...@@ -15,39 +15,39 @@ import java.io.InputStream;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class ScriptProcessor implements PageProcessor{ public class RubyScriptProcessor implements PageProcessor{
private ScriptEngine rubyEngine; private ScriptEngine rubyEngine;
private String defines; private String defines;
ScriptProcessor(){ private String script;
public RubyScriptProcessor(String filename){
ScriptEngineManager manager = new ScriptEngineManager(); ScriptEngineManager manager = new ScriptEngineManager();
rubyEngine = manager.getEngineByName("jruby"); rubyEngine = manager.getEngineByName("jruby");
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb"); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb");
try { try {
defines = IOUtils.toString(resourceAsStream); defines = IOUtils.toString(resourceAsStream);
resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename);
script = IOUtils.toString(resourceAsStream);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
@Override @Override
public void process(Page page) { public void process(Page page) {
ScriptContext context = rubyEngine.getContext(); ScriptContext context = rubyEngine.getContext();
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
String script;
try { try {
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/oschina.rb"); rubyEngine.eval(defines+script, context);
try {
script = IOUtils.toString(resourceAsStream);
rubyEngine.eval(defines+script, context);
} catch (IOException e) {
e.printStackTrace();
}
} catch (ScriptException e) { } catch (ScriptException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
@Override @Override
...@@ -56,6 +56,6 @@ public class ScriptProcessor implements PageProcessor{ ...@@ -56,6 +56,6 @@ public class ScriptProcessor implements PageProcessor{
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run(); Spider.create(new RubyScriptProcessor("ruby/oschina.rb")).addUrl("http://my.oschina.net/flashsword/blog").run();
} }
} }
/**
 * Applies the selector `str` to the current page's HTML through its `$`
 * method and returns the string form of the result.
 */
function $(str){
    var selection = page.getHtml().$(str);
    return selection.toString();
}
/**
 * Evaluates the XPath expression `str` against the current page's HTML and
 * returns the string form of the result.
 */
function xpath(str){
    var selection = page.getHtml().xpath(str);
    return selection.toString();
}
/**
 * Collects every link on the current page that matches the regular
 * expression `str` and adds them to the page's target requests.
 */
function urls(str){
    // Declared with `var` so the list stays local to this call instead of
    // becoming an implicit global in the engine scope.
    var links = page.getHtml().links().regex(str).all();
    page.addTargetRequests(links);
}
\ No newline at end of file
// Values pulled from the current page with the `$` selector helper.
var result = {
    title: $("div.BlogTitle h1"),
    content: $("div.BlogContent")
};

// Settings declared by this script; no code visible here reads them.
var config = {
    ua: '',
    sleepTime: 20
};

// Pass the blog-post URL pattern to the `urls` helper.
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+");
\ No newline at end of file
package us.codecraft.webmagic.js;
import org.junit.Test;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
/**
 * Smoke test for calling the JavaScript engine through {@code javax.script}:
 * binds a value into the engine scope and evaluates a script that prints it.
 *
 * @author code4crafter@gmail.com
 */
public class TestJsCall {

    /**
     * Binds the string {@code "sad"} under the name {@code a} and evaluates
     * {@code print(a)} in the engine's own context.
     */
    @Test
    public void test() throws ScriptException {
        ScriptEngine jsEngine = new ScriptEngineManager().getEngineByName("javascript");
        ScriptContext engineContext = jsEngine.getContext();
        engineContext.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE);
        jsEngine.eval("print(a)", engineContext);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment