Commit 6fa82a41 authored by yihua.huang's avatar yihua.huang

#29 seed urls with more information

parent 1446ada7
......@@ -24,7 +24,7 @@ public class Site {
/**
* The seed requests (start urls) that the crawler begins with.
*/
private List<String> startUrls = new ArrayList<String>();
private List<Request> startRequests = new ArrayList<Request>();
private int sleepTime = 3000;
......@@ -38,7 +38,7 @@ public class Site {
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String,String> headers = new HashMap<String, String>();
private Map<String, String> headers = new HashMap<String, String>();
public static interface HeaderConst {
......@@ -182,9 +182,16 @@ public class Site {
* get start urls
*
* @return start urls
* @see #getStartRequests
* @deprecated use {@link #getStartRequests()} instead
*/
@Deprecated
public List<String> getStartUrls() {
return startUrls;
return UrlUtils.convertToUrls(startRequests);
}
/**
 * Get the start requests of this site.
 *
 * @return the list held by this site itself (not a copy), so changes made
 * by the caller are visible to the site
 */
public List<Request> getStartRequests() {
return startRequests;
}
/**
......@@ -194,11 +201,19 @@ public class Site {
* @return this
*/
public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl);
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
return addStartRequest(new Request(startUrl));
}
/**
 * Add a request to the start requests.<br>
 * If no domain has been set yet, it is derived from the url of the first
 * added request that carries one.
 *
 * @param startRequest the request to start crawling with, must not be null
 * @return this
 * @throws NullPointerException if startRequest is null
 */
public Site addStartRequest(Request startRequest) {
    // Reject null before touching the list, so a failed call never leaves
    // a null entry behind in startRequests.
    if (startRequest == null) {
        throw new NullPointerException("startRequest must not be null");
    }
    this.startRequests.add(startRequest);
    if (domain == null && startRequest.getUrl() != null) {
        domain = UrlUtils.getDomain(startRequest.getUrl());
    }
    return this;
}
......@@ -241,12 +256,13 @@ public class Site {
/**
* Put an Http header for downloader. <br/>
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
*
* @param key key of http header, there are some keys constant in {@link HeaderConst}
* @param value value of header
* @return this
*/
public Site addHeader(String key, String value){
headers.put(key,value);
public Site addHeader(String key, String value) {
headers.put(key, value);
return this;
}
......@@ -279,6 +295,20 @@ public class Site {
return this;
}
/**
 * Expose this site as a {@link Task}.<br>
 * The task reports the domain of this site as its UUID and this site as
 * its site.
 *
 * @return a task view backed by this site
 */
public Task toTask() {
    final Site self = this;
    return new Task() {
        @Override
        public Site getSite() {
            return self;
        }

        @Override
        public String getUUID() {
            return self.getDomain();
        }
    };
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
......@@ -286,37 +316,53 @@ public class Site {
Site site = (Site) o;
if (cycleRetryTimes != site.cycleRetryTimes) return false;
if (retryTimes != site.retryTimes) return false;
if (sleepTime != site.sleepTime) return false;
if (timeOut != site.timeOut) return false;
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
}
/**
 * Expose this site as a {@link Task}.<br>
 * The task reports the domain of this site as its UUID and this site as
 * its site.
 *
 * @return a task view backed by this site
 */
public Task toTask() {
return new Task() {
@Override
public String getUUID() {
return Site.this.getDomain();
}
@Override
public Site getSite() {
return Site.this;
}
};
}
@Override
public int hashCode() {
int result = domain.hashCode();
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
result = 31 * result + timeOut;
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
return result;
}
/**
 * Render the configuration fields of this site in the form
 * {@code Site{name=value, ...}}; string fields are wrapped in single quotes.
 *
 * @return textual form of this site
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Site{");
    text.append("domain='").append(domain).append('\'');
    text.append(", userAgent='").append(userAgent).append('\'');
    text.append(", cookies=").append(cookies);
    text.append(", charset='").append(charset).append('\'');
    text.append(", startRequests=").append(startRequests);
    text.append(", sleepTime=").append(sleepTime);
    text.append(", retryTimes=").append(retryTimes);
    text.append(", cycleRetryTimes=").append(cycleRetryTimes);
    text.append(", timeOut=").append(timeOut);
    text.append(", acceptStatCode=").append(acceptStatCode);
    text.append(", headers=").append(headers);
    text.append('}');
    return text.toString();
}
}
......@@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import us.codecraft.webmagic.utils.ThreadUtils;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable;
import java.io.IOException;
......@@ -60,7 +61,7 @@ public class Spider implements Runnable, Task {
protected PageProcessor pageProcessor;
protected List<String> startUrls;
protected List<Request> startRequests;
protected Site site;
......@@ -107,7 +108,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls();
this.startRequests = pageProcessor.getSite().getStartRequests();
}
/**
......@@ -119,7 +120,20 @@ public class Spider implements Runnable, Task {
*/
public Spider startUrls(List<String> startUrls) {
checkIfRunning();
this.startUrls = startUrls;
this.startRequests = UrlUtils.convertToRequests(startUrls);
return this;
}
/**
 * Set the start requests of Spider.<br>
 * Replaces the start requests that were taken from the Site.
 *
 * @param startRequests requests to start crawling with
 * @return this
 */
public Spider startRequest(List<Request> startRequests) {
checkIfRunning();
this.startRequests = startRequests;
return this;
}
......@@ -231,11 +245,11 @@ public class Spider implements Runnable, Task {
}
downloader.setThread(threadNum);
executorService = ThreadUtils.newFixedThreadPool(threadNum);
if (startUrls != null) {
for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this);
if (startRequests != null) {
for (Request request : startRequests) {
scheduler.push(request, this);
}
startUrls.clear();
startRequests.clear();
}
}
......@@ -390,6 +404,20 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Add requests (urls with additional information) to crawl.<br/>
 * Each request is handed to the single-request {@code addRequest}, then
 * {@code signalNewUrl()} is called once after all of them.
 *
 * @param requests requests to add
 * @return this
 */
public Spider addRequest(Request... requests) {
    for (int i = 0; i < requests.length; i++) {
        addRequest(requests[i]);
    }
    signalNewUrl();
    return this;
}
private void waitNewUrl() {
try {
newUrlLock.lock();
......
......@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).thread(10).run();
Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
}
}
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Request;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -18,7 +21,7 @@ public class UrlUtils {
/**
* canonicalizeUrl
*
* <p/>
* Borrowed from Jsoup.
*
* @param url
......@@ -85,6 +88,22 @@ public class UrlUtils {
return stringBuilder.toString();
}
/**
 * Wrap every url in a {@link Request}, keeping the order of the input.
 *
 * @param urls urls to convert, may be null
 * @return a new mutable list holding one request per url; an empty list
 * if urls is null
 */
public static List<Request> convertToRequests(List<String> urls) {
    // A null url list means "no urls": callers already guard the resulting
    // start list with a null check, so null must not blow up here.
    if (urls == null) {
        return new ArrayList<Request>();
    }
    List<Request> requestList = new ArrayList<Request>(urls.size());
    for (String url : urls) {
        requestList.add(new Request(url));
    }
    return requestList;
}
/**
 * Extract the url of every request, keeping the order of the input.
 *
 * @param requests requests to convert, may be null
 * @return a new mutable list holding the url of each request; an empty
 * list if requests is null
 */
public static List<String> convertToUrls(List<Request> requests) {
    // Mirror convertToRequests: a missing list converts to an empty one.
    if (requests == null) {
        return new ArrayList<String>();
    }
    List<String> urlList = new ArrayList<String>(requests.size());
    for (Request request : requests) {
        urlList.add(request.getUrl());
    }
    return urlList;
}
/**
 * Matches {@code charset=<value>}, allowing whitespace around the equals
 * sign and optional single or double quotes before the value. Group 1 is
 * the value, which ends at the first whitespace, semicolon or quote.
 */
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
public static String getCharset(String contentType) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment