Commit 0fd4623f authored by yihua.huang's avatar yihua.huang

Merge branch 'osc'

parents ce5495ec 2a8e1b65
#!/bin/sh
mvn clean package
rsync -avz --delete ./webmagic-samples/target/lib/ ./lib/
<?xml version="1.0" encoding="UTF-8"?>
<project name="module_webmagic-core" default="compile.module.webmagic-core">
<dirname property="module.webmagic-core.basedir" file="${ant.file.module_webmagic-core}"/>
<property name="module.jdk.home.webmagic-core" value="${project.jdk.home}"/>
<property name="module.jdk.bin.webmagic-core" value="${project.jdk.bin}"/>
<property name="module.jdk.classpath.webmagic-core" value="${project.jdk.classpath}"/>
<property name="compiler.args.webmagic-core" value="${compiler.args}"/>
<property name="webmagic-core.output.dir" value="${module.webmagic-core.basedir}/target/classes"/>
<property name="webmagic-core.testoutput.dir" value="${module.webmagic-core.basedir}/target/test-classes"/>
<path id="webmagic-core.module.bootclasspath">
<!-- Paths to be included in compilation bootclasspath -->
</path>
<path id="webmagic-core.module.production.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<path id="webmagic-core.runtime.production.module.classpath">
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<path id="webmagic-core.module.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<path id="webmagic-core.runtime.module.classpath">
<pathelement location="${webmagic-core.testoutput.dir}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<patternset id="excluded.from.module.webmagic-core">
<patternset refid="ignored.files"/>
</patternset>
<patternset id="excluded.from.compilation.webmagic-core">
<patternset refid="excluded.from.module.webmagic-core"/>
</patternset>
<path id="webmagic-core.module.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/main/java"/>
<include name="src/main/resources"/>
</dirset>
</path>
<path id="webmagic-core.module.test.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/test/java"/>
<include name="src/test/resources"/>
</dirset>
</path>
<target name="compile.module.webmagic-core" depends="compile.module.webmagic-core.production,compile.module.webmagic-core.tests" description="Compile module webmagic-core"/>
<target name="compile.module.webmagic-core.production" depends="register.custom.compilers" description="Compile module webmagic-core; production classes">
<mkdir dir="${webmagic-core.output.dir}"/>
<javac2 destdir="${webmagic-core.output.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.production.classpath"/>
<src refid="webmagic-core.module.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<copy todir="${webmagic-core.output.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/main/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/main/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<target name="compile.module.webmagic-core.tests" depends="register.custom.compilers,compile.module.webmagic-core.production" description="compile module webmagic-core; test classes" unless="skip.tests">
<mkdir dir="${webmagic-core.testoutput.dir}"/>
<javac2 destdir="${webmagic-core.testoutput.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.classpath"/>
<src refid="webmagic-core.module.test.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<copy todir="${webmagic-core.testoutput.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/test/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/test/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<target name="clean.module.webmagic-core" description="cleanup module">
<delete dir="${webmagic-core.output.dir}"/>
<delete dir="${webmagic-core.testoutput.dir}"/>
</target>
</project>
\ No newline at end of file
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
* @author yihua.huang@dianping.com <br>
* Date: 13-8-13 <br>
* Time: 上午10:13 <br>
*/
@TargetUrl("http://*.alpha.dp/*")
public class DianpingFtlDataScanner implements AfterExtractor {
@ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
private List<String> data;
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
.thread(5).run();
}
@Override
public void afterProcess(Page page) {
if (data.size() > 1) {
System.err.println(page.getUrl());
}
if (data.size() > 0 && data.get(0).length() > 100) {
System.err.println(page.getUrl());
}
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.PlainText;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午8:08
*/
public class DiaoyuwengProcessor implements PageProcessor {
private Site site;
@Override
public void process(Page page) {
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
page.addTargetRequests(requests);
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
}
}
@Override
public Site getSite() {
if (site==null){
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
}
return site;
}
public static void main(String[] args) {
Spider.create(new DiaoyuwengProcessor()).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 下午1:48
*/
public class F58PageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
page.addTargetRequests(strings);
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().xpath("//dd"));
}
@Override
public Site getSite() {
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
}
public static void main(String[] args) {
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
}
}
......@@ -27,4 +27,5 @@ public class HuxiuProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new HuxiuProcessor()).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-5-20
* Time: 下午5:31
*/
public class KaichibaProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1;
page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().xpath("//Title"));
page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
}
@Override
public Site getSite() {
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new KaichibaProcessor()).run();
}
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-5-20
* Time: 下午5:31
*/
public class MeicanProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
if (requests.size() > 2) {
requests = requests.subList(0, 2);
}
page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
}
@Override
public Site getSite() {
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
Spider.create(new MeicanProcessor()).run();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment