Commit 6dc88fa1 authored by yihua.huang's avatar yihua.huang

split modules

parent 3c3f0011
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.0.1-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-core</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.2.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>13.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.0-beta-7</version>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package us.codecraft.spider; package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.spider.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.spider.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
......
package us.codecraft.spider; package us.codecraft.webmagic;
import us.codecraft.spider.Site;
import java.util.List;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider; package us.codecraft.webmagic;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
......
package us.codecraft.spider; package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.spider.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.spider.downloader.HttpClientDownloader; import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.spider.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.spider.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.spider.schedular.QueueSchedular; import us.codecraft.webmagic.schedular.QueueSchedular;
import us.codecraft.spider.schedular.Schedular; import us.codecraft.webmagic.schedular.Schedular;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.downloader; package us.codecraft.webmagic.downloader;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Request; import us.codecraft.webmagic.Request;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.downloader; package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient; import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Request; import us.codecraft.webmagic.Request;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.spider.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.spider.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
/** /**
......
package us.codecraft.spider.downloader; package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion; import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient; import org.apache.http.client.HttpClient;
...@@ -10,7 +10,7 @@ import org.apache.http.conn.scheme.SchemeRegistry; ...@@ -10,7 +10,7 @@ import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.*; import org.apache.http.params.*;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import java.util.Map; import java.util.Map;
......
package us.codecraft.spider.pipeline; package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
...@@ -19,7 +19,7 @@ import java.util.Map; ...@@ -19,7 +19,7 @@ import java.util.Map;
*/ */
public class FilePipeline implements Pipeline { public class FilePipeline implements Pipeline {
private String path = "/data/temp/spider/"; private String path = "/data/temp/webmagic/";
public FilePipeline(){ public FilePipeline(){
......
package us.codecraft.spider.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.processor; package us.codecraft.webmagic.processor;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.processor; package us.codecraft.webmagic.processor;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.schedular; package us.codecraft.webmagic.schedular;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Request; import us.codecraft.webmagic.Request;
import java.io.*; import java.io.*;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
......
package us.codecraft.spider.schedular; package us.codecraft.webmagic.schedular;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.spider.Request; import us.codecraft.webmagic.Request;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
......
package us.codecraft.spider.schedular; package us.codecraft.webmagic.schedular;
import us.codecraft.spider.Request; import us.codecraft.webmagic.Request;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.HtmlCleaner;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.htmlcleaner.*; import org.htmlcleaner.*;
......
package us.codecraft.spider.utils; package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
......
package us.codecraft.spider; package us.codecraft.webmagic;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import us.codecraft.spider.selector.Html; import us.codecraft.webmagic.selector.Html;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider; package us.codecraft.webmagic;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.spider.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.spider.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.spider.samples.HuxiuProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.spider.schedular.FileCacheQueueSchedular; import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
/** /**
* User: cairne * User: cairne
...@@ -24,12 +24,12 @@ public class SpiderTest { ...@@ -24,12 +24,12 @@ public class SpiderTest {
@Test @Test
public void testGlobalSpider(){ public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor(); // PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")). // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run(); // processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
pageProcessor2.getSite().setEncoding("GBK"); pageProcessor2.getSite().setEncoding("GBK");
System.out.println(pageProcessor2.getSite().getEncoding()); System.out.println(pageProcessor2.getSite().getEncoding());
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")). Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")).
processor(pageProcessor2).run(); processor(pageProcessor2).run();
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.samples; package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site; import us.codecraft.webmagic.Site;
import us.codecraft.spider.Page; import us.codecraft.webmagic.Page;
import us.codecraft.spider.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List; import java.util.List;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.HtmlCleaner;
...@@ -6,7 +6,6 @@ import org.htmlcleaner.TagNode; ...@@ -6,7 +6,6 @@ import org.htmlcleaner.TagNode;
import org.junit.Test; import org.junit.Test;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
/** /**
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import junit.framework.Assert; import junit.framework.Assert;
import org.junit.Test; import org.junit.Test;
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test; import org.junit.Test;
import java.io.IOException; import java.io.IOException;
import java.net.URL;
/** /**
* User: cairne * User: cairne
......
package us.codecraft.spider.selector; package us.codecraft.webmagic.selector;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
......
package us.codecraft.spider.utils; package us.codecraft.webmagic.utils;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.0.1-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-plugin</artifactId>
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.0-beta-7</version>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.0.1-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-samples</artifactId>
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.0-beta-7</version>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} that crawls the blog at 17dujingdian.com.
 * <p>
 * Each processed page contributes its post links as new crawl targets and
 * gets a "title" and a "content" field attached.
 * <p>
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class DiandianBlogProcessor implements PageProcessor {

    /** Captures the quoted href of anchors pointing below http://17dujingdian.com/post/, stopping before any '#'. */
    private static final String POST_LINK_REGEX =
            "<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}";

    /** XPath of the anchor inside the h2 heading of the "content" div. */
    private static final String TITLE_XPATH = "//div[@id='content']//h2/a";

    /** User agent string sent with requests to this site. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues every post link found in the page and stores the page's title and content.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // Sample post URL kept from the original source:
        // http://progressdaily.diandian.com/post/2013-01-24/40046867275
        List<String> postLinks = page.getHtml().rs(POST_LINK_REGEX).toStrings();
        page.addTargetRequests(postLinks);

        page.putField("title", page.getHtml().x(TITLE_XPATH));
        // sc() is defined on Html outside this file; presumably it yields the main content - confirm.
        page.putField("content", page.getHtml().sc());
    }

    /**
     * Builds the site settings (domain, start URL, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("www.diandian.com")
                .setStartUrl("http://17dujingdian.com/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for www.dianping.com.
 * <p>
 * Follows shop and search-category links on every page, and stores a "title"
 * and a "content" field for pages whose URL contains "shop".
 * <p>
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class DianpingBlogProcessor implements PageProcessor {

    /** Link patterns to follow, applied in this order: shop pages first, then search categories. */
    private static final String[] LINK_REGEXES = {
            "<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}",
            "<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}"
    };

    /** User agent string sent with requests to this site. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the matching links of the page and, for shop URLs, records title and content.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        for (String linkRegex : LINK_REGEXES) {
            List<String> links = page.getHtml().rs(linkRegex).toStrings();
            page.addTargetRequests(links);
        }

        // Fields are only extracted when the URL contains "shop".
        boolean isShopUrl = page.getUrl().toString().contains("shop");
        if (!isShopUrl) {
            return;
        }
        page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
        page.putField("content", page.getHtml().sc());
    }

    /**
     * Builds the site settings (domain, start URL, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("www.dianping.com")
                .setStartUrl("http://www.dianping.com/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor}.
 * <p>
 * NOTE(review): despite the class name, every rule below (link patterns,
 * title XPath, domain and start URL) targets www.dianping.com and matches
 * DianpingBlogProcessor. This looks like an unadapted copy - confirm the
 * intended site before relying on this sample.
 * <p>
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class DiaoyuwengProcessor implements PageProcessor {

    /** Captures the quoted href of anchors whose target starts with /shop/. */
    private static final String SHOP_LINK_REGEX = "<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}";

    /** Captures the quoted href of anchors whose target starts with /search/category/. */
    private static final String CATEGORY_LINK_REGEX = "<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}";

    /**
     * Queues shop and category links, then records title and content when the URL contains "shop".
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> shopLinks = page.getHtml().rs(SHOP_LINK_REGEX).toStrings();
        page.addTargetRequests(shopLinks);

        List<String> categoryLinks = page.getHtml().rs(CATEGORY_LINK_REGEX).toStrings();
        page.addTargetRequests(categoryLinks);

        String currentUrl = page.getUrl().toString();
        if (currentUrl.contains("shop")) {
            page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
            page.putField("content", page.getHtml().sc());
        }
    }

    /**
     * Builds the site settings (domain, start URL, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        final String userAgent =
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
        return Site.me()
                .setDomain("www.dianping.com")
                .setStartUrl("http://www.dianping.com/")
                .setUserAgent(userAgent);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for sh.58.com.
 * <p>
 * Follows links whose href starts with /yewu/ and stores a "title" and a
 * "body" field for every page.
 * <p>
 * User: cairne
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class F58PageProcesser implements PageProcessor {

    /** Captures the quoted href of anchors whose target starts with /yewu/. */
    private static final String YEWU_LINK_REGEX = "<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}";

    /** Captures the text between the title tags of the document. */
    private static final String TITLE_REGEX = "<title>(.*)</title>";

    /** XPath of the dd elements carrying the class "w133". */
    private static final String BODY_XPATH = "//dd[@class='w133']";

    /**
     * Queues the /yewu/ links of the page and records its title and body.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> yewuLinks = page.getHtml().rs(YEWU_LINK_REGEX).toStrings();
        page.addTargetRequests(yewuLinks);

        page.putField("title", page.getHtml().r(TITLE_REGEX));
        page.putField("body", page.getHtml().x(BODY_XPATH));
    }

    /**
     * Builds the site settings (domain and start URL) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("sh.58.com")
                .setStartUrl("http://sh.58.com/");
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for www.huxiu.com.
 * <p>
 * Follows article links and stores a "title" and a "content" field for
 * every page.
 * <p>
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class HuxiuProcessor implements PageProcessor {

    /**
     * Captures the quoted href of anchors whose target starts with "article",
     * optionally preceded by one '/', and contains no '#', quote or angle bracket.
     */
    private static final String ARTICLE_LINK_REGEX =
            "<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}";

    /** XPath of the h1 with class "ph xs5" inside the div with class "neirong". */
    private static final String TITLE_XPATH = "//div[@class='neirong']//h1[@class='ph xs5']";

    /** User agent string sent with requests to this site. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the article links of the page and records its title and content.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> articleLinks = page.getHtml().rs(ARTICLE_LINK_REGEX).toStrings();
        page.addTargetRequests(articleLinks);

        page.putField("title", page.getHtml().x(TITLE_XPATH));
        page.putField("content", page.getHtml().sc());
    }

    /**
     * Builds the site settings (domain, start URL, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("www.huxiu.com")
                .setStartUrl("http://www.huxiu.com/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample {@link PageProcessor} for kaichiba.com.
 * <p>
 * Walks shop pages by numeric id: after processing shop N it queues shop
 * N + 1, and stores a "title" and an "items" field for every page.
 * <p>
 * User: cairne
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class KaichibaProcessor implements PageProcessor {

    /** Captures the numeric shop id that follows "shop/" in the page URL. */
    private static final String SHOP_ID_REGEX = "shop/(\\d+)";

    /** Prefix of shop page URLs; the numeric shop id is appended to it. */
    private static final String SHOP_URL_PREFIX = "http://kaichiba.com/shop/";

    /**
     * Queues the shop page with the next id and records the title and food items of this page.
     * <p>
     * When no shop id can be read from the URL, no follow-up request is queued
     * but the fields are still extracted.
     *
     * @param page the downloaded page to extract from
     * @throws NumberFormatException if the captured digits do not fit into an {@code int}
     */
    @Override
    public void process(Page page) {
        String shopId = page.getUrl().r(SHOP_ID_REGEX).toString();
        // Guard against a URL without a shop id instead of failing the whole page.
        // NOTE(review): assumes a non-match surfaces as null or an empty string - confirm against Selectable.
        if (shopId != null && !shopId.isEmpty()) {
            // parseInt avoids the needless boxing/unboxing of Integer.valueOf(...) + 1.
            int nextShopId = Integer.parseInt(shopId) + 1;
            page.addTargetRequests(SHOP_URL_PREFIX + nextShopId);
        }

        page.putField("title", page.getHtml().x("//Title"));
        // Strip leading whitespace, trailing whitespace and <span>...</span> fragments from each item.
        page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]")
                .rp("^\\s+", "")
                .rp("\\s+$", "")
                .rp("<span>.*?</span>", ""));
    }

    /**
     * Builds the site settings (domain, start URL, encoding, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("kaichiba.com")
                .setStartUrl("http://kaichiba.com/shop/41725781")
                .setEncoding("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample {@link PageProcessor} for meican.com.
 * <p>
 * Follows at most two area links per page plus every restaurant link, and
 * stores an "items" and a "prices" field for every page.
 * <p>
 * User: cairne
 * Date: 13-5-20
 * Time: 5:31 PM
 */
public class MeicanProcessor implements PageProcessor {

    /** Upper bound on the number of area links followed from a single page. */
    private static final int MAX_AREA_LINKS = 2;

    /** User agent string sent with requests to this site. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues area and restaurant links, then records dish names and prices.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> areaLinks = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
        // Only the first two area links are followed; shorter lists are passed on unchanged.
        List<String> limitedAreaLinks =
                areaLinks.size() > MAX_AREA_LINKS ? areaLinks.subList(0, MAX_AREA_LINKS) : areaLinks;
        page.addTargetRequests(limitedAreaLinks);

        List<String> restaurantLinks = page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings();
        page.addTargetRequests(restaurantLinks);

        page.putField("items",
                page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
        page.putField("prices",
                page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
    }

    /**
     * Builds the site settings (domain, start URL, encoding, user agent) for this crawl.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("meican.com")
                .setStartUrl("http://www.meican.com/shanghai/districts")
                .setEncoding("utf-8")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for the "Pictures" board of bbs.nju.edu.cn.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class NjuBBSProcessor implements PageProcessor {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the thread links of the Pictures board and stores the title and content fields.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        // The href values matched here are unquoted in the markup, hence no quote characters in the pattern.
        final String threadLinkPattern = "<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)";
        List<String> threadLinks = page.getHtml().rs(threadLinkPattern).toStrings();
        page.addTargetRequests(threadLinks);

        page.putField("title", page.getHtml().x("//div[@id='content']//h2/a"));
        page.putField("content", page.getHtml().sc());
    }

    /**
     * Builds the site settings for bbs.nju.edu.cn: domain, start URL and user agent.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("bbs.nju.edu.cn")
                .setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for blogs hosted on my.oschina.net.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class OschinaBlogPageProcesser implements PageProcessor {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues links pointing at my.oschina.net and stores the title, content and author fields.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        // NOTE(review): the capture group covers only the host prefix, not the full link - confirm this is intended.
        final String blogHostPattern = "(http://my\\.oschina\\.net)";
        List<String> blogLinks = page.getHtml().as().r(blogHostPattern).toStrings();
        page.addTargetRequests(blogLinks);

        page.putField("title",
                page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
        page.putField("content", page.getHtml().sc());
        // The author is the path segment that precedes /blog/<id> in the page URL.
        page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
    }

    /**
     * Builds the site settings: domain my.oschina.net, start URL on www.oschina.net, and user agent.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("my.oschina.net")
                .setStartUrl("http://www.oschina.net/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for the question pages of www.oschina.net.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class OschinaPageProcesser implements PageProcessor {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the question links found on the page and stores the question title and content.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        final String questionLinkPattern =
                "<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}";
        List<String> questionLinks = page.getHtml().rs(questionLinkPattern).toStrings();
        page.addTargetRequests(questionLinks);

        page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a"));
        page.putField("content",
                page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
    }

    /**
     * Builds the site settings for www.oschina.net: domain, start URL and user agent.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("www.oschina.net")
                .setStartUrl("http://www.oschina.net/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor that follows post links on 17dujingdian.com.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 8:08 PM
 */
public class QzoneBlogProcessor implements PageProcessor {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the post links found on the page and stores the title and content fields.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        // Reference URLs left by the original author (not used by the code below):
        //   http://progressdaily.diandian.com/post/2013-01-24/40046867275
        //   http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
        //   &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
        final String postLinkPattern =
                "<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}";
        List<String> postLinks = page.getHtml().rs(postLinkPattern).toStrings();
        page.addTargetRequests(postLinks);

        page.putField("title", page.getHtml().x("//div[@id='content']//h2/a"));
        page.putField("content", page.getHtml().sc());
    }

    /**
     * Builds the site settings: domain, start URL and user agent.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        // NOTE(review): the domain is www.diandian.com while the start URL is on 17dujingdian.com - confirm this is intentional.
        return Site.me()
                .setDomain("www.diandian.com")
                .setStartUrl("http://17dujingdian.com/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample processor for articles on blog.sina.com.cn.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class SinaBlogProcesser implements PageProcessor {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    /**
     * Queues the blog article links found on the page and stores the title, body,
     * date and tags fields.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        final String articleLinkPattern =
                "<a[^<>]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}";
        page.addTargetRequests(page.getHtml().rs(articleLinkPattern).toStrings());

        page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2"));
        page.putField("body", page.getHtml().sc());
        // The date is the text between parentheses inside the time span of the article body.
        page.putField("date",
                page.getHtml()
                        .x("//div[@id='articlebody']//span[@class='time SG_txtc']")
                        .r("\\((.*)\\)"));
        page.putField("tags", page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
    }

    /**
     * Builds the site settings for blog.sina.com.cn: domain, start URL and user agent.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("blog.sina.com.cn")
                .setStartUrl("http://blog.sina.com.cn/")
                .setUserAgent(USER_AGENT);
    }
}
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for the free-talk posts of bbs.tianya.cn.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 1:48 PM
 */
public class TianyaPageProcesser implements PageProcessor {

    /**
     * Queues the post links found on the page and stores the title and body fields.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        // The captured hrefs are site-relative paths such as /post-free...shtml.
        List<String> strings = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
        page.addTargetRequests(strings);
        page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b"));
        page.putField("body", page.getHtml().sc());
    }

    /**
     * Builds the site settings for bbs.tianya.cn: domain and start URL.
     *
     * The domain is the bare host name, matching the other sample processors;
     * the full URL belongs only in the start URL.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return Site.me().setDomain("bbs.tianya.cn").setStartUrl("http://bbs.tianya.cn/");
    }
}
package us.codecraft.webmagic;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;
/**
 * Unit test for the regex helpers of {@link Html}.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 8:42 AM
 */
public class HtmlTest {

    /**
     * Selects the whole text with {@code r("(.*)")} and then checks that
     * {@code rp("aa(a)", "$1bb")} rewrites each "aaa" run to "abb".
     */
    @Test
    public void testRegexSelector() {
        final String expected = "abbabbab";
        Html html = new Html("aaaaaaab");
        String actual = html.r("(.*)").rp("aa(a)", "$1bb").toString();
        Assert.assertEquals(expected, actual);
    }
}
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
/**
 * Manually driven tests for {@link Spider}, plus design notes for an extraction language.
 *
 * User: cairne
 * Date: 13-4-20
 * Time: 7:46 PM
 */
public class SpiderTest {

    /**
     * Runs a spider with a {@link FilePipeline} and the {@link HuxiuProcessor} sample.
     */
    @Test
    public void testSpider() throws InterruptedException {
        Spider spider = Spider.me()
                .pipeline(new FilePipeline())
                .processor(new HuxiuProcessor());
        spider.run();
    }

    /**
     * Runs a spider built from a {@link SimplePageProcessor} with GBK encoding and a
     * {@link FileCacheQueueSchedular} that is given the directory /data/temp/webmagic/cache/.
     */
    @Test
    public void testGlobalSpider() {
        // Alternative setup kept for reference:
        // PageProcessor pageProcessor = new MeicanProcessor();
        // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
        // processor(pageProcessor).run();
        final String entryUrl =
                "http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space";
        final String threadUrlPattern = "http://www.diaoyuweng.com/thread-*-1-1.html";
        SimplePageProcessor processor = new SimplePageProcessor(entryUrl, threadUrlPattern);
        processor.getSite().setEncoding("GBK");
        System.out.println(processor.getSite().getEncoding());

        Spider.me()
                .pipeline(new FilePipeline())
                .schedular(new FileCacheQueueSchedular(processor.getSite(), "/data/temp/webmagic/cache/"))
                .processor(processor)
                .run();
    }

    /**
     * Prints the value of the {@code java.io.tmpdir} system property.
     */
    @Test
    public void test() {
        final String tmpDir = System.getProperty("java.io.tmpdir");
        System.out.println(tmpDir);
    }

    /**
     * Placeholder holding design sketches for an extraction language; it executes nothing.
     */
    @Ignore
    @Test
    public void languageSchema() {
        /*
         * Sketch 1: flat assignments.
         *
         * _hrefs = rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
         * title = r(""<title>(.*)</title>"")
         * body = x("//dd[@class='w133']")
         *
         * site.domain = "sh.58.com"
         * site.ua=""
         * site.cookie="aa:bb"
         *
         */
        /*
         * Sketch 2: conditional blocks with chained selectors.
         *
         * if (page == r('') && refer(1) == 1) {
         *
         * type = _refer(1)
         * content = _text.t().c()
         * title = x("asd@asd").r("",1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
         *
         * body=body[r(_currentUrl).g(1)]
         * tags[%] = (tags[%] + xs('')) . r('')
         *
         * _targetUrls.add('' + x('').r(''))
         * _sourceUrls.add()
         * _header.put("","");
         * _cookie.add("asdsadasdsa");
         *
         *
         * }
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */
        /*
         * Sketch 3: XML-like form.
         *
         * <condition></>
         * <selector>
         * <fields>
         *
         * <type>
         * <selector></selector>
         * <selector></selector>
         * </type>
         * </>
         * </>
         */
        /*
         * Sketch 4: model-based form with piped selectors.
         *
         * if (model.url('') && model.refer(1) == 1)
         * (
         *
         * model.set(type, model.refer(1))
         * content = t(_html) > c()
         * title = x(_html, 'asd@asd') > r('',1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
         * tags[%] = tags + xs('') > r('')
         * model.setTargetUrl();
         *
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment