Commit 00dfebbc authored by yihua.huang

#424 remove guava dep and add fix docs

parent c2531c68
...@@ -70,16 +70,16 @@ ...@@ -70,16 +70,16 @@
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.5.2</version> <version>4.5.2</version>
</dependency> </dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>0.8.1</version>
</dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>
<artifactId>guava</artifactId> <artifactId>guava</artifactId>
<version>15.0</version> <version>15.0</version>
</dependency> </dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>0.8.1</version>
</dependency>
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId> <artifactId>slf4j-api</artifactId>
......
...@@ -20,11 +20,6 @@ ...@@ -20,11 +20,6 @@
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
...@@ -73,12 +68,6 @@ ...@@ -73,12 +68,6 @@
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
<exclusions>
<exclusion>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyPool;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.UsernamePasswordCredentials;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyPool; import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.proxy.SimpleProxyPool;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*; import java.util.*;
...@@ -27,7 +24,7 @@ public class Site { ...@@ -27,7 +24,7 @@ public class Site {
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>(); private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
private Table<String, String, String> cookies = HashBasedTable.create(); private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();
private String charset; private String charset;
...@@ -104,7 +101,10 @@ public class Site { ...@@ -104,7 +101,10 @@ public class Site {
* @return this * @return this
*/ */
public Site addCookie(String domain, String name, String value) { public Site addCookie(String domain, String name, String value) {
cookies.put(domain, name, value); if (!cookies.containsKey(domain)){
cookies.put(domain,new HashMap<String, String>());
}
cookies.get(domain).put(name, value);
return this; return this;
} }
...@@ -134,7 +134,7 @@ public class Site { ...@@ -134,7 +134,7 @@ public class Site {
* @return get cookies * @return get cookies
*/ */
public Map<String,Map<String, String>> getAllCookies() { public Map<String,Map<String, String>> getAllCookies() {
return cookies.rowMap(); return cookies;
} }
/** /**
...@@ -483,6 +483,7 @@ public class Site { ...@@ -483,6 +483,7 @@ public class Site {
* Set httpProxyPool, String[0]:ip, String[1]:port <br> * Set httpProxyPool, String[0]:ip, String[1]:port <br>
* *
* @param httpProxyList httpProxyList * @param httpProxyList httpProxyList
* @param isUseLastProxy isUseLastProxy
* @return this * @return this
*/ */
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) { public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
...@@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; ...@@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
...@@ -173,9 +172,9 @@ public class Spider implements Runnable, Task { ...@@ -173,9 +172,9 @@ public class Spider implements Runnable, Task {
* *
* @param scheduler scheduler * @param scheduler scheduler
* @return this * @return this
* @Deprecated
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/ */
@Deprecated
public Spider scheduler(Scheduler scheduler) { public Spider scheduler(Scheduler scheduler) {
return setScheduler(scheduler); return setScheduler(scheduler);
} }
...@@ -499,7 +498,7 @@ public class Spider implements Runnable, Task { ...@@ -499,7 +498,7 @@ public class Spider implements Runnable, Task {
} }
public <T> T get(String url) { public <T> T get(String url) {
List<String> urls = Lists.newArrayList(url); List<String> urls = WMCollections.newArrayList(url);
List<T> resultItemses = getAll(urls); List<T> resultItemses = getAll(urls);
if (resultItemses != null && resultItemses.size() > 0) { if (resultItemses != null && resultItemses.size() > 0) {
return resultItemses.get(0); return resultItemses.get(0);
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
...@@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy; ...@@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
...@@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader { ...@@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader {
charset = site.getCharset(); charset = site.getCharset();
headers = site.getHeaders(); headers = site.getHeaders();
} else { } else {
acceptStatCode = Sets.newHashSet(200); acceptStatCode = WMCollections.newHashSet(200);
} }
logger.info("downloading page {}", request.getUrl()); logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null; CloseableHttpResponse httpResponse = null;
......
package us.codecraft.webmagic.scheduler.component; package us.codecraft.webmagic.scheduler.component;
import com.google.common.collect.Sets;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import java.util.Collections;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
...@@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap; ...@@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap;
*/ */
public class HashSetDuplicateRemover implements DuplicateRemover { public class HashSetDuplicateRemover implements DuplicateRemover {
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>()); private Set<String> urls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
@Override @Override
public boolean isDuplicate(Request request, Task task) { public boolean isDuplicate(Request request, Task task) {
......
...@@ -33,11 +33,11 @@ public abstract class Selectors { ...@@ -33,11 +33,11 @@ public abstract class Selectors {
} }
/** /**
* @Deprecated
* @see #xpath(String) * @see #xpath(String)
* @param expr expr * @param expr expr
* @return new selector * @return new selector
*/ */
@Deprecated
public static XpathSelector xsoup(String expr) { public static XpathSelector xsoup(String expr) {
return new XpathSelector(expr); return new XpathSelector(expr);
} }
......
package us.codecraft.webmagic.utils;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Small static factory helpers for building mutable collections from a
 * variable number of elements.
 *
 * @author code4crafter@gmail.com
 * Date: 16/12/18
 * Time: 10:16 AM
 */
public class WMCollections {

    /**
     * Creates a new mutable {@link HashSet} populated with the given elements.
     * Arguments that are equal to one another end up as a single entry.
     *
     * @param t   the elements to put into the set
     * @param <T> the element type
     * @return a freshly allocated set containing the given elements
     */
    public static <T> Set<T> newHashSet(T... t){
        final int count = t.length;
        // The argument count is used as the initial capacity of the backing table.
        Set<T> result = new HashSet<T>(count);
        for (int i = 0; i < count; i++) {
            result.add(t[i]);
        }
        return result;
    }

    /**
     * Creates a new mutable {@link ArrayList} populated with the given elements,
     * kept in argument order.
     *
     * @param t   the elements to put into the list
     * @param <T> the element type
     * @return a freshly allocated list containing the given elements
     */
    public static <T> List<T> newArrayList(T... t){
        final int count = t.length;
        List<T> result = new ArrayList<T>(count);
        for (int i = 0; i < count; i++) {
            result.add(t[i]);
        }
        return result;
    }
}
...@@ -15,6 +15,12 @@ ...@@ -15,6 +15,12 @@
<artifactId>jedis</artifactId> <artifactId>jedis</artifactId>
<version>2.9.0</version> <version>2.9.0</version>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<optional>true</optional>
</dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
......
...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable; ...@@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-5
*/ */
public enum ExpressionType { public enum ExpressionType {
......
...@@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*; ...@@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-5
*/ */
public class ExtractRule { public class ExtractRule {
......
...@@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader { ...@@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
* *
* @param phantomJsCommand * @param phantomJsCommand phantomJsCommand
*/ */
public PhantomJSDownloader(String phantomJsCommand) { public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
......
...@@ -9,7 +9,6 @@ import java.util.List; ...@@ -9,7 +9,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-5
*/ */
public class CompositePageProcessor implements PageProcessor { public class CompositePageProcessor implements PageProcessor {
......
...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page; ...@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-5
*/ */
public interface SubPageProcessor extends RequestMatcher { public interface SubPageProcessor extends RequestMatcher {
......
...@@ -45,6 +45,7 @@ public class SpiderMonitor { ...@@ -45,6 +45,7 @@ public class SpiderMonitor {
* *
* @param spiders spiders * @param spiders spiders
* @return this * @return this
* @throws JMException
*/ */
public synchronized SpiderMonitor register(Spider... spiders) throws JMException { public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
for (Spider spider : spiders) { for (Spider spider : spiders) {
......
package us.codecraft.webmagic.scheduler.component; package us.codecraft.webmagic.scheduler;
/**
* @author code4crafter@gmail.com
* Date: 16/12/18
* Time: 上午10:23
*/
import com.google.common.hash.BloomFilter; import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels; import com.google.common.hash.Funnels;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
......
...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler; ...@@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
......
...@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy; ...@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-9
*/ */
public class BaiduNews { public class BaiduNews {
......
...@@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; ...@@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @date 14-4-11
*/ */
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") @TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) @ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true)
......
...@@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site; ...@@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import javax.management.JMException; import javax.management.JMException;
import java.util.List; import java.util.List;
......
...@@ -13,7 +13,7 @@ import java.util.List; ...@@ -13,7 +13,7 @@ import java.util.List;
/** /**
* Created by dolphineor on 2014-11-21. * Created by dolphineor on 2014-11-21.
* <p/> * <p>
* 以淘宝为例, 搜索冬装的相关结果 * 以淘宝为例, 搜索冬装的相关结果
*/ */
public class PhantomJSPageProcessor implements PageProcessor { public class PhantomJSPageProcessor implements PageProcessor {
......
...@@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline { ...@@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline {
private PrintWriter printWriter; private PrintWriter printWriter;
/**
* create a FilePipeline with default path"/data/webmagic/"
*/
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException { public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
this("/data/webmagic/"); this("/data/webmagic/");
} }
......
package us.codecraft.webmagic.scripts; package us.codecraft.webmagic.scripts;
import com.google.common.collect.Sets;
import org.apache.commons.cli.*; import org.apache.commons.cli.*;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
...@@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems; ...@@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.WMCollections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -29,8 +29,8 @@ public class ScriptConsole { ...@@ -29,8 +29,8 @@ public class ScriptConsole {
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>(); private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
static { static {
alias.put(Language.JavaScript, Sets.<String>newHashSet("js", "javascript", "JavaScript", "JS")); alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
alias.put(Language.JRuby, Sets.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby")); alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
} }
public void setLanguagefromArg(String arg) { public void setLanguagefromArg(String arg) {
...@@ -93,7 +93,7 @@ public class ScriptConsole { ...@@ -93,7 +93,7 @@ public class ScriptConsole {
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build(); .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
pageProcessor.getSite().setSleepTime(params.getSleepTime()); pageProcessor.getSite().setSleepTime(params.getSleepTime());
pageProcessor.getSite().setRetryTimes(3); pageProcessor.getSite().setRetryTimes(3);
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502)); pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));
Spider spider = Spider.create(pageProcessor).thread(params.getThread()); Spider spider = Spider.create(pageProcessor).thread(params.getThread());
spider.clearPipeline().addPipeline(new Pipeline() { spider.clearPipeline().addPipeline(new Pipeline() {
@Override @Override
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment