Commit 8f774afc authored by yihua.huang

add direct download

parent 86cfefb5
...@@ -68,4 +68,13 @@ public class ResultItems { ...@@ -68,4 +68,13 @@ public class ResultItems {
this.skip = skip; this.skip = skip;
return this; return this;
} }
/**
 * Renders this object as {@code ResultItems{fields=..., request=..., skip=...}}.
 *
 * @return text containing the current fields, request and skip values
 */
@Override
public String toString() {
    String body = "fields=" + fields
            + ", request=" + request
            + ", skip=" + skip;
    return "ResultItems{" + body + '}';
}
} }
...@@ -43,6 +43,8 @@ public class Site { ...@@ -43,6 +43,8 @@ public class Site {
private HttpHost httpProxy; private HttpHost httpProxy;
private boolean useGzip = true;
public static interface HeaderConst { public static interface HeaderConst {
public static final String REFERER = "Referer"; public static final String REFERER = "Referer";
...@@ -199,7 +201,10 @@ public class Site { ...@@ -199,7 +201,10 @@ public class Site {
/** /**
* Add a url to start url.<br> * Add a url to start url.<br>
* Start urls belong to a Spider rather than a Site, so use {@link Spider#addUrl(String...)} instead.
* *
* @deprecated
* @see Spider#addUrl(String...)
* @param startUrl * @param startUrl
* @return this * @return this
*/ */
...@@ -209,7 +214,10 @@ public class Site { ...@@ -209,7 +214,10 @@ public class Site {
/** /**
* Add a url to start url.<br> * Add a url to start url.<br>
* Start requests belong to a Spider rather than a Site, so use {@link Spider#addRequest(Request...)} instead.
* *
* @deprecated
* @see Spider#addRequest(Request...)
* @param startUrl * @param startUrl
* @return this * @return this
*/ */
...@@ -312,6 +320,22 @@ public class Site { ...@@ -312,6 +320,22 @@ public class Site {
return this; return this;
} }
/**
 * Tells whether gzip is enabled for this site.
 *
 * @return the current value of the useGzip flag
 */
public boolean isUseGzip() {
return useGzip;
}
/**
 * Sets whether gzip is used. <br>
 * The flag is true by default; pass false to disable gzip.
 *
 * @param useGzip true to use gzip, false to disable it
 * @return this
 */
public Site setUseGzip(boolean useGzip) {
this.useGzip = useGzip;
return this;
}
public Task toTask() { public Task toTask() {
return new Task() { return new Task() {
@Override @Override
......
package us.codecraft.webmagic; package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils; ...@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Condition;
...@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task { ...@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {
protected final static int STAT_STOPPED = 2; protected final static int STAT_STOPPED = 2;
protected boolean spawnUrl = true;
protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock(); private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition(); private Condition newUrlCondition = newUrlLock.newCondition();
...@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task { ...@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
pipelines.add(new ConsolePipeline()); pipelines.add(new ConsolePipeline());
} }
downloader.setThread(threadNum); downloader.setThread(threadNum);
if (executorService == null || executorService.isShutdown()) {
executorService = ThreadUtils.newFixedThreadPool(threadNum); executorService = ThreadUtils.newFixedThreadPool(threadNum);
}
if (startRequests != null) { if (startRequests != null) {
for (Request request : startRequests) { for (Request request : startRequests) {
scheduler.push(request, this); scheduler.push(request, this);
...@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task { ...@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
}); });
} }
} }
executorService.shutdown();
stat.set(STAT_STOPPED); stat.set(STAT_STOPPED);
// release some resources // release some resources
destroy(); if (destroyWhenExit) {
close();
}
} }
private void checkRunningStat() { private void checkRunningStat() {
...@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task { ...@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
} }
} }
protected void destroy() { public void close() {
destroyEach(downloader); destroyEach(downloader);
destroyEach(pageProcessor); destroyEach(pageProcessor);
for (Pipeline pipeline : pipelines) { for (Pipeline pipeline : pipelines) {
destroyEach(pipeline); destroyEach(pipeline);
} }
executorService.shutdown();
} }
private void destroyEach(Object object) { private void destroyEach(Object object) {
...@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task { ...@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
} }
protected void extractAndAddRequests(Page page) { protected void extractAndAddRequests(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) { for (Request request : page.getTargetRequests()) {
addRequest(request); addRequest(request);
} }
...@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task { ...@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
} }
private void addRequest(Request request) { private void addRequest(Request request) {
if (site.getDomain() == null && request != null && request.getUrl() != null) {
site.setDomain(UrlUtils.getDomain(request.getUrl()));
}
scheduler.push(request, this); scheduler.push(request, this);
} }
protected void checkIfRunning() { protected void checkIfRunning() {
...@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task { ...@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
} }
/** /**
* Add urls to crawl.<br/> * Add urls to crawl. <br/>
* *
* @param urls * @param urls
* @return * @return
...@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task { ...@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
/**
 * Downloads the given urls synchronously and returns what was collected.<br>
 * While this call runs, urls extracted from pages are not added to the crawl and the spider is not
 * closed when the run ends, so the same spider can be used again; call {@link #close()} when done.
 *
 * @param urls urls to download
 * @return the ResultItems gathered by a temporary CollectorPipeline during this run
 */
public List<ResultItems> getAll(Collection<String> urls) {
    // Remember the caller's settings so they are restored afterwards instead of being forced to true.
    boolean previousDestroyWhenExit = destroyWhenExit;
    boolean previousSpawnUrl = spawnUrl;
    CollectorPipeline collectorPipeline = new CollectorPipeline();
    destroyWhenExit = false;
    spawnUrl = false;
    startRequests = UrlUtils.convertToRequests(urls);
    pipelines.add(collectorPipeline);
    try {
        run();
    } finally {
        // Undo the temporary setup even when run() throws; leaving the collector registered would
        // make it keep receiving (and retaining) the results of every later run.
        pipelines.remove(collectorPipeline);
        spawnUrl = previousSpawnUrl;
        destroyWhenExit = previousDestroyWhenExit;
    }
    return collectorPipeline.getCollector();
}
/**
 * Downloads a single url synchronously by delegating to {@link #getAll(Collection)}.
 *
 * @param url url to download
 * @return the first collected ResultItems, or null when nothing was collected
 */
public ResultItems get(String url) {
    List<ResultItems> collected = getAll(Lists.newArrayList(url));
    if (collected == null || collected.size() == 0) {
        return null;
    }
    return collected.get(0);
}
/** /**
* Add urls with information to crawl.<br/> * Add urls with information to crawl.<br/>
* *
...@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task { ...@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
/**
 * Tells whether urls extracted from downloaded pages are added to the crawl.
 *
 * @return the current value of the spawnUrl flag
 */
public boolean isSpawnUrl() {
return spawnUrl;
}
/**
 * Sets whether urls extracted from pages are added to the download queue.<br>
 * When true, extracted urls are downloaded too; when false, only the seed urls are downloaded. <br>
 * DO NOT set it unless you know what it means!
 *
 * @param spawnUrl true to follow extracted urls, false to download the seed urls only
 * @return this
 * @since 0.4.0
 */
public Spider setSpawnUrl(boolean spawnUrl) {
this.spawnUrl = spawnUrl;
return this;
}
@Override @Override
public String getUUID() { public String getUUID() {
if (uuid != null) { if (uuid != null) {
...@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task { ...@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
if (site != null) { if (site != null) {
return site.getDomain(); return site.getDomain();
} }
return null; uuid = UUID.randomUUID().toString();
return uuid;
} }
@Override @Override
......
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.http.*; import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore; import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.config.Registry; import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder; import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.ConnectionSocketFactory;
...@@ -19,7 +20,7 @@ import java.util.Map; ...@@ -19,7 +20,7 @@ import java.util.Map;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.3.3 * @since 0.4.0
*/ */
public class HttpClientGenerator { public class HttpClientGenerator {
...@@ -46,6 +47,7 @@ public class HttpClientGenerator { ...@@ -46,6 +47,7 @@ public class HttpClientGenerator {
} else { } else {
httpClientBuilder.setUserAgent(""); httpClientBuilder.setUserAgent("");
} }
if (site == null || site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process( public void process(
...@@ -56,32 +58,37 @@ public class HttpClientGenerator { ...@@ -56,32 +58,37 @@ public class HttpClientGenerator {
} }
} }
}).addInterceptorFirst(new HttpResponseInterceptor() {
public void process(
final HttpResponse response,
final HttpContext context) throws HttpException, IOException {
HttpEntity entity = response.getEntity();
if (entity != null) {
Header ceheader = entity.getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
for (int i = 0; i < codecs.length; i++) {
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
response.setEntity(
new GzipDecompressingEntity(response.getEntity()));
return;
}
}
}
}
}
}); });
if (site!=null){
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
} }
generateCookie(httpClientBuilder,site); // httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() {
//
// public void process(
// final HttpResponse response,
// final HttpContext context) throws HttpException, IOException {
// if (response.getStatusLine().getStatusCode() != 200) {
// return;
// }
// HttpEntity entity = response.getEntity();
// if (entity != null) {
// Header ceheader = entity.getContentEncoding();
// if (ceheader != null) {
// HeaderElement[] codecs = ceheader.getElements();
// for (int i = 0; i < codecs.length; i++) {
// if (codecs[i].getName().equalsIgnoreCase("gzip")) {
// response.setEntity(
// new GzipDecompressingEntity(response.getEntity()));
// return;
// }
// }
// }
// }
// }
//
// });
if (site != null) {
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
}
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build(); return httpClientBuilder.build();
} }
......
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Pipeline that keeps every ResultItems it receives in an in-memory list so the results can be
 * read back after a crawl.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class CollectorPipeline implements Pipeline {

    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the given result to the collected list.<br>
     * Synchronized because the backing ArrayList is not safe for concurrent adds and this method
     * is presumably invoked from several worker threads when the spider runs with more than one thread.
     *
     * @param resultItems result to store
     * @param task task the result belongs to (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the list of collected results.
     *
     * @return the live backing list (not a copy); read it after the crawl has finished
     */
    public synchronized List<ResultItems> getCollector() {
        return collector;
    }
}
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
 * Example processor that downloads a fixed list of Baidu Baike search urls with
 * {@link Spider#getAll(java.util.Collection)} and prints the extracted fields.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcesser implements PageProcessor {

    // To go through a local proxy, add .setHttpProxy(new HttpHost("127.0.0.1",8888)) to this chain.
    private Site site = Site.me()
            .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true);

    /**
     * Extracts the entry title into the "name" field and the first content section into "description".
     *
     * @param page downloaded page
     */
    @Override
    public void process(Page page) {
        page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
        page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads the example urls, prints each result, then closes the spider.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
        List<String> list = new ArrayList<String>();
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        list.add(String.format(urlTemplate,"水力发电"));
        list.add(String.format(urlTemplate,"风力发电"));
        list.add(String.format(urlTemplate,"太阳能"));
        list.add(String.format(urlTemplate,"地热发电"));
        list.add(String.format(urlTemplate,"众数"));
        // NOTE(review): this keyword is listed a second time; kept as in the original — confirm whether intentional.
        list.add(String.format(urlTemplate,"地热发电"));
        try {
            List<ResultItems> resultItemses = spider.getAll(list);
            for (ResultItems resultItems : resultItemses) {
                System.out.println(resultItems.getAll());
            }
        } finally {
            // getAll leaves the spider open for reuse, so close it explicitly even when the download fails.
            spider.close();
        }
    }
}
...@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/ */
public class GithubRepoPageProcesser implements PageProcessor { public class GithubRepoPageProcesser implements PageProcessor {
private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
@Override @Override
public void process(Page page) { public void process(Page page) {
...@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor { ...@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new GithubRepoPageProcesser()).thread(5).run(); Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run();
} }
} }
...@@ -12,7 +12,7 @@ import java.util.List; ...@@ -12,7 +12,7 @@ import java.util.List;
*/ */
public class OschinaBlogPageProcesser implements PageProcessor { public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); private Site site = Site.me().setDomain("my.oschina.net");
@Override @Override
public void process(Page page) { public void process(Page page) {
...@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { ...@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
} }
} }
...@@ -7,6 +7,7 @@ import java.net.MalformedURLException; ...@@ -7,6 +7,7 @@ import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -88,7 +89,7 @@ public class UrlUtils { ...@@ -88,7 +89,7 @@ public class UrlUtils {
return stringBuilder.toString(); return stringBuilder.toString();
} }
public static List<Request> convertToRequests(List<String> urls) { public static List<Request> convertToRequests(Collection<String> urls) {
List<Request> requestList = new ArrayList<Request>(urls.size()); List<Request> requestList = new ArrayList<Request>(urls.size());
for (String url : urls) { for (String url : urls) {
requestList.add(new Request(url)); requestList.add(new Request(url));
...@@ -96,7 +97,7 @@ public class UrlUtils { ...@@ -96,7 +97,7 @@ public class UrlUtils {
return requestList; return requestList;
} }
public static List<String> convertToUrls(List<Request> requests) { public static List<String> convertToUrls(Collection<Request> requests) {
List<String> urlList = new ArrayList<String>(requests.size()); List<String> urlList = new ArrayList<String>(requests.size());
for (Request request : requests) { for (Request request : requests) {
urlList.add(request.getUrl()); urlList.add(request.getUrl());
......
...@@ -11,7 +11,7 @@ import java.util.ArrayList; ...@@ -11,7 +11,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* @since 0.3.3 * @since 0.4.0
* NO implement yet!!!!!!!!!!!! * NO implement yet!!!!!!!!!!!!
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment