Commit 00dfebbc authored by yihua.huang

#424 remove guava dep and add fix docs

parent c2531c68
......@@ -70,16 +70,16 @@
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>0.8.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>0.8.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
......
......@@ -20,11 +20,6 @@
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
......@@ -73,12 +68,6 @@
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<exclusions>
<exclusion>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
......
package us.codecraft.webmagic;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyPool;
import org.apache.http.auth.UsernamePasswordCredentials;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.proxy.SimpleProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
......@@ -27,7 +24,7 @@ public class Site {
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
private Table<String, String, String> cookies = HashBasedTable.create();
private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();
private String charset;
......@@ -104,7 +101,10 @@ public class Site {
* @return this
*/
public Site addCookie(String domain, String name, String value) {
cookies.put(domain, name, value);
if (!cookies.containsKey(domain)){
cookies.put(domain,new HashMap<String, String>());
}
cookies.get(domain).put(name, value);
return this;
}
......@@ -134,7 +134,7 @@ public class Site {
* @return get cookies
*/
public Map<String,Map<String, String>> getAllCookies() {
return cookies.rowMap();
return cookies;
}
/**
......@@ -483,6 +483,7 @@ public class Site {
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
*
* @param httpProxyList httpProxyList
* @param isUseLastProxy isUseLastProxy
* @return this
*/
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
......
package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
......@@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.Closeable;
import java.io.IOException;
......@@ -173,9 +172,9 @@ public class Spider implements Runnable, Task {
*
* @param scheduler scheduler
* @return this
* @Deprecated
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/
@Deprecated
public Spider scheduler(Scheduler scheduler) {
// Deprecated alias: forwards the argument unchanged to setScheduler and returns its result.
return setScheduler(scheduler);
}
......@@ -499,7 +498,7 @@ public class Spider implements Runnable, Task {
}
public <T> T get(String url) {
List<String> urls = Lists.newArrayList(url);
List<String> urls = WMCollections.newArrayList(url);
List<T> resultItemses = getAll(urls);
if (resultItemses != null && resultItemses.size() > 0) {
return resultItemses.get(0);
......
package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
......@@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
import java.nio.charset.Charset;
......@@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader {
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = Sets.newHashSet(200);
acceptStatCode = WMCollections.newHashSet(200);
}
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
......
package us.codecraft.webmagic.scheduler.component;
import com.google.common.collect.Sets;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
......@@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap;
*/
public class HashSetDuplicateRemover implements DuplicateRemover {
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
private Set<String> urls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
@Override
public boolean isDuplicate(Request request, Task task) {
......
......@@ -33,11 +33,11 @@ public abstract class Selectors {
}
/**
* @Deprecated
* @see #xpath(String)
* @param expr expr
* @return new selector
*/
@Deprecated
public static XpathSelector xsoup(String expr) {
// Deprecated factory: wraps the given expression in a new XpathSelector.
return new XpathSelector(expr);
}
......
package us.codecraft.webmagic.utils;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Varargs factory helpers that build mutable JDK collections.
 *
 * @author code4crafter@gmail.com
 * Date: 16/12/18
 * Time: 10:16 AM
 */
public class WMCollections {

    /**
     * Creates a mutable {@link HashSet} populated with the given elements.
     * Repeated elements collapse into a single entry.
     *
     * @param t   elements to insert
     * @param <T> element type
     * @return a new set holding every distinct element of {@code t}
     */
    public static <T> Set<T> newHashSet(T... t){
        final int count = t.length;
        Set<T> result = new HashSet<T>(count);
        for (int i = 0; i < count; i++) {
            result.add(t[i]);
        }
        return result;
    }

    /**
     * Creates a mutable {@link ArrayList} populated with the given elements,
     * kept in argument order.
     *
     * @param t   elements to insert
     * @param <T> element type
     * @return a new list holding the elements of {@code t} in the same order
     */
    public static <T> List<T> newArrayList(T... t){
        final int count = t.length;
        List<T> result = new ArrayList<T>(count);
        for (int i = 0; i < count; i++) {
            result.add(t[i]);
        }
        return result;
    }
}
......@@ -15,6 +15,12 @@
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
......
......@@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public enum ExpressionType {
......
......@@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public class ExtractRule {
......
......@@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
*
* @param phantomJsCommand
* @param phantomJsCommand phantomJsCommand
*/
public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath();
......
......@@ -9,7 +9,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public class CompositePageProcessor implements PageProcessor {
......
......@@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public interface SubPageProcessor extends RequestMatcher {
......
......@@ -45,6 +45,7 @@ public class SpiderMonitor {
*
* @param spiders spiders
* @return this
* @throws JMException
*/
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
for (Spider spider : spiders) {
......
package us.codecraft.webmagic.scheduler.component;
package us.codecraft.webmagic.scheduler;
/**
* @author code4crafter@gmail.com
* Date: 16/12/18
* Time: 上午10:23
*/
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.nio.charset.Charset;
import java.util.concurrent.atomic.AtomicInteger;
......@@ -67,4 +74,4 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
public int getTotalRequestsCount(Task task) {
// Reports the current value of the counter field; the task argument is not consulted here.
return counter.get();
}
}
}
\ No newline at end of file
......@@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
......
......@@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
/**
* @author code4crafter@gmail.com
* @date 14-4-9
*/
public class BaiduNews {
......
......@@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com
* @date 14-4-11
*/
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true)
......
......@@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import javax.management.JMException;
import java.util.List;
......
......@@ -13,7 +13,7 @@ import java.util.List;
/**
* Created by dolphineor on 2014-11-21.
* <p/>
* <p>
* 以淘宝为例, 搜索冬装的相关结果
*/
public class PhantomJSPageProcessor implements PageProcessor {
......
......@@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline {
private PrintWriter printWriter;
/**
* create a FilePipeline with default path"/data/webmagic/"
*/
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
// No-arg convenience form: delegates to the path-taking constructor with the default path.
this("/data/webmagic/");
}
......
package us.codecraft.webmagic.scripts;
import com.google.common.collect.Sets;
import org.apache.commons.cli.*;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
......@@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.WMCollections;
import java.util.HashMap;
import java.util.List;
......@@ -29,8 +29,8 @@ public class ScriptConsole {
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
static {
alias.put(Language.JavaScript, Sets.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
alias.put(Language.JRuby, Sets.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
}
public void setLanguagefromArg(String arg) {
......@@ -93,7 +93,7 @@ public class ScriptConsole {
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
pageProcessor.getSite().setSleepTime(params.getSleepTime());
pageProcessor.getSite().setRetryTimes(3);
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502));
pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
spider.clearPipeline().addPipeline(new Pipeline() {
@Override
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment