Commit 6fa82a41 authored by yihua.huang

#29 seed urls with more information

parent 1446ada7
...@@ -24,7 +24,7 @@ public class Site { ...@@ -24,7 +24,7 @@ public class Site {
/** /**
* startUrls is the urls the crawler to start with. * startUrls is the urls the crawler to start with.
*/ */
private List<String> startUrls = new ArrayList<String>(); private List<Request> startRequests = new ArrayList<Request>();
private int sleepTime = 3000; private int sleepTime = 3000;
...@@ -38,7 +38,7 @@ public class Site { ...@@ -38,7 +38,7 @@ public class Site {
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET; private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String,String> headers = new HashMap<String, String>(); private Map<String, String> headers = new HashMap<String, String>();
public static interface HeaderConst { public static interface HeaderConst {
...@@ -182,9 +182,16 @@ public class Site { ...@@ -182,9 +182,16 @@ public class Site {
* get start urls * get start urls
* *
* @return start urls * @return start urls
* @see #getStartRequests
* @deprecated
*/ */
@Deprecated
public List<String> getStartUrls() { public List<String> getStartUrls() {
return startUrls; return UrlUtils.convertToUrls(startRequests);
}
public List<Request> getStartRequests() {
return startRequests;
} }
/** /**
...@@ -194,11 +201,19 @@ public class Site { ...@@ -194,11 +201,19 @@ public class Site {
* @return this * @return this
*/ */
public Site addStartUrl(String startUrl) { public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl); return addStartRequest(new Request(startUrl));
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
} }
/**
* Add a request to the start requests.<br>
*
* @param startRequest the request to add to the start requests
* @return this
*/
public Site addStartRequest(Request startRequest) {
this.startRequests.add(startRequest);
if (domain == null && startRequest.getUrl() != null) {
domain = UrlUtils.getDomain(startRequest.getUrl());
} }
return this; return this;
} }
...@@ -241,12 +256,13 @@ public class Site { ...@@ -241,12 +256,13 @@ public class Site {
/** /**
* Put an Http header for downloader. <br/> * Put an Http header for downloader. <br/>
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/> * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
*
* @param key key of http header, there are some keys constant in {@link HeaderConst} * @param key key of http header, there are some keys constant in {@link HeaderConst}
* @param value value of header * @param value value of header
* @return * @return
*/ */
public Site addHeader(String key, String value){ public Site addHeader(String key, String value) {
headers.put(key,value); headers.put(key, value);
return this; return this;
} }
...@@ -279,6 +295,20 @@ public class Site { ...@@ -279,6 +295,20 @@ public class Site {
return this; return this;
} }
/**
 * Exposes this site as a {@link Task}.
 *
 * @return a task whose UUID is this site's domain and whose site is this instance
 */
public Task toTask() {
    // Capture the enclosing instance once so the anonymous Task reads the same Site throughout.
    final Site owner = this;
    return new Task() {
        @Override
        public Site getSite() {
            return owner;
        }

        @Override
        public String getUUID() {
            return owner.getDomain();
        }
    };
}
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) return true;
...@@ -286,37 +316,53 @@ public class Site { ...@@ -286,37 +316,53 @@ public class Site {
Site site = (Site) o; Site site = (Site) o;
if (cycleRetryTimes != site.cycleRetryTimes) return false;
if (retryTimes != site.retryTimes) return false;
if (sleepTime != site.sleepTime) return false;
if (timeOut != site.timeOut) return false;
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false; return false;
if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true; return true;
} }
/**
 * Wraps this site in an anonymous {@link Task}.
 *
 * @return a task backed by this site
 */
public Task toTask() {
return new Task() {
// The task's UUID is the domain of the enclosing Site.
@Override
public String getUUID() {
return Site.this.getDomain();
}
// The task reports the enclosing Site instance itself.
@Override
public Site getSite() {
return Site.this;
}
};
}
@Override @Override
public int hashCode() { public int hashCode() {
int result = domain.hashCode(); int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
result = 31 * result + timeOut;
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
return result; return result;
} }
/**
 * Renders the site's configuration fields as {@code Site{name=value, ...}}.
 * String-like fields are wrapped in single quotes.
 *
 * @return textual form of this site
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("Site{");
    text.append("domain='").append(domain).append('\'');
    text.append(", userAgent='").append(userAgent).append('\'');
    text.append(", cookies=").append(cookies);
    text.append(", charset='").append(charset).append('\'');
    text.append(", startRequests=").append(startRequests);
    text.append(", sleepTime=").append(sleepTime);
    text.append(", retryTimes=").append(retryTimes);
    text.append(", cycleRetryTimes=").append(cycleRetryTimes);
    text.append(", timeOut=").append(timeOut);
    text.append(", acceptStatCode=").append(acceptStatCode);
    text.append(", headers=").append(headers);
    text.append('}');
    return text.toString();
}
} }
...@@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; ...@@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.EnvironmentUtil;
import us.codecraft.webmagic.utils.ThreadUtils; import us.codecraft.webmagic.utils.ThreadUtils;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
...@@ -60,7 +61,7 @@ public class Spider implements Runnable, Task { ...@@ -60,7 +61,7 @@ public class Spider implements Runnable, Task {
protected PageProcessor pageProcessor; protected PageProcessor pageProcessor;
protected List<String> startUrls; protected List<Request> startRequests;
protected Site site; protected Site site;
...@@ -107,7 +108,7 @@ public class Spider implements Runnable, Task { ...@@ -107,7 +108,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite(); this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls(); this.startRequests = pageProcessor.getSite().getStartRequests();
} }
/** /**
...@@ -119,7 +120,20 @@ public class Spider implements Runnable, Task { ...@@ -119,7 +120,20 @@ public class Spider implements Runnable, Task {
*/ */
public Spider startUrls(List<String> startUrls) { public Spider startUrls(List<String> startUrls) {
checkIfRunning(); checkIfRunning();
this.startUrls = startUrls; this.startRequests = UrlUtils.convertToRequests(startUrls);
return this;
}
/**
* Set startRequests of Spider.<br>
* Takes priority over the startRequests of Site.
*
* @param startRequests requests the Spider starts crawling from
* @return this
*/
public Spider startRequest(List<Request> startRequests) {
checkIfRunning();
this.startRequests = startRequests;
return this; return this;
} }
...@@ -231,11 +245,11 @@ public class Spider implements Runnable, Task { ...@@ -231,11 +245,11 @@ public class Spider implements Runnable, Task {
} }
downloader.setThread(threadNum); downloader.setThread(threadNum);
executorService = ThreadUtils.newFixedThreadPool(threadNum); executorService = ThreadUtils.newFixedThreadPool(threadNum);
if (startUrls != null) { if (startRequests != null) {
for (String startUrl : startUrls) { for (Request request : startRequests) {
scheduler.push(new Request(startUrl), this); scheduler.push(request, this);
} }
startUrls.clear(); startRequests.clear();
} }
} }
...@@ -390,6 +404,20 @@ public class Spider implements Runnable, Task { ...@@ -390,6 +404,20 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
/**
 * Add requests (urls carrying extra information) to crawl.<br/>
 * Every request is passed to {@code addRequest(request)} one by one (presumably the
 * single-request overload declared outside this view), then {@code signalNewUrl()} is
 * called once after the whole batch.
 *
 * @param requests requests to add
 * @return this
 */
public Spider addRequest(Request... requests) {
    for (int i = 0; i < requests.length; i++) {
        addRequest(requests[i]);
    }
    signalNewUrl();
    return this;
}
private void waitNewUrl() { private void waitNewUrl() {
try { try {
newUrlLock.lock(); newUrlLock.lock();
......
...@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).thread(10).run(); Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
} }
} }
package us.codecraft.webmagic.utils; package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Request;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -18,7 +21,7 @@ public class UrlUtils { ...@@ -18,7 +21,7 @@ public class UrlUtils {
/** /**
* canonicalizeUrl * canonicalizeUrl
* * <p/>
* Borrowed from Jsoup. * Borrowed from Jsoup.
* *
* @param url * @param url
...@@ -85,6 +88,22 @@ public class UrlUtils { ...@@ -85,6 +88,22 @@ public class UrlUtils {
return stringBuilder.toString(); return stringBuilder.toString();
} }
/**
 * Wraps every url in a new {@link Request}, keeping the order of the input list.
 *
 * @param urls urls to convert; may be {@code null}
 * @return a new mutable list with one request per url; empty when {@code urls} is {@code null}
 */
public static List<Request> convertToRequests(List<String> urls) {
    // Spider#startUrls(List) forwards its argument here unchecked, so a null list must
    // not blow up with a NullPointerException; treat it as "no start urls".
    if (urls == null) {
        return new ArrayList<Request>();
    }
    List<Request> requestList = new ArrayList<Request>(urls.size());
    for (String url : urls) {
        requestList.add(new Request(url));
    }
    return requestList;
}
/**
 * Collects the url of every request, in iteration order.
 *
 * @param requests requests to read the urls from
 * @return a new list holding {@code getUrl()} of each request
 */
public static List<String> convertToUrls(List<Request> requests) {
    final int count = requests.size();
    final List<String> urls = new ArrayList<String>(count);
    for (final Request each : requests) {
        final String url = each.getUrl();
        urls.add(url);
    }
    return urls;
}
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
public static String getCharset(String contentType) { public static String getCharset(String contentType) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment