Commit c6661899 authored by yihua.huang's avatar yihua.huang

new thread pool #110

parent 179baa7a
package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
......@@ -15,7 +13,7 @@ import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.ThreadUtils;
import us.codecraft.webmagic.selector.thread.ThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable;
......@@ -79,7 +77,7 @@ public class Spider implements Runnable, Task {
protected Logger logger = LoggerFactory.getLogger(getClass());
protected ExecutorService executorService;
protected ThreadPool threadPool;
protected int threadNum = 1;
......@@ -101,8 +99,6 @@ public class Spider implements Runnable, Task {
private Condition newUrlCondition = newUrlLock.newCondition();
private final AtomicInteger threadAlive = new AtomicInteger(0);
private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0);
......@@ -283,8 +279,8 @@ public class Spider implements Runnable, Task {
pipelines.add(new ConsolePipeline());
}
downloader.setThread(threadNum);
if (executorService == null || executorService.isShutdown()) {
executorService = ThreadUtils.newFixedThreadPool(threadNum);
if (threadPool == null || threadPool.isShutdown()) {
threadPool = new ThreadPool(threadNum);
}
if (startRequests != null) {
for (Request request : startRequests) {
......@@ -292,7 +288,6 @@ public class Spider implements Runnable, Task {
}
startRequests.clear();
}
threadAlive.set(0);
}
@Override
......@@ -303,15 +298,14 @@ public class Spider implements Runnable, Task {
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
if (request == null) {
if (threadAlive.get() == 0 && exitWhenComplete) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break;
}
// wait until new url added
waitNewUrl();
} else {
final Request requestFinal = request;
threadAlive.incrementAndGet();
executorService.execute(new Runnable() {
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
......@@ -321,7 +315,6 @@ public class Spider implements Runnable, Task {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
} finally {
threadAlive.decrementAndGet();
pageCount.incrementAndGet();
signalNewUrl();
}
......@@ -370,7 +363,7 @@ public class Spider implements Runnable, Task {
for (Pipeline pipeline : pipelines) {
destroyEach(pipeline);
}
executorService.shutdown();
threadPool.shutdown();
}
private void destroyEach(Object object) {
......@@ -522,7 +515,7 @@ public class Spider implements Runnable, Task {
newUrlLock.lock();
try {
//double check
if (threadAlive.get() == 0 && exitWhenComplete) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
return;
}
newUrlCondition.await();
......@@ -644,7 +637,7 @@ public class Spider implements Runnable, Task {
* @since 0.4.1
*/
public int getThreadAlive() {
return threadAlive.get();
return threadPool.getThreadAlive();
}
/**
......@@ -674,7 +667,7 @@ public class Spider implements Runnable, Task {
}
public Spider setExecutorService(ExecutorService executorService) {
this.executorService = executorService;
this.threadPool.setExecutorService(executorService);
return this;
}
......
......@@ -11,11 +11,12 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
public class GithubRepoPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
......
package us.codecraft.webmagic.selector.thread;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public class ThreadPool {
private int threadNum;
private int threadAlive;
private ReentrantLock reentrantLock = new ReentrantLock();
private Condition condition = reentrantLock.newCondition();
public ThreadPool(int threadNum) {
this.threadNum = threadNum;
this.executorService = Executors.newFixedThreadPool(threadNum);
}
public ThreadPool(int threadNum, ExecutorService executorService) {
this.threadNum = threadNum;
this.executorService = executorService;
}
public void setExecutorService(ExecutorService executorService) {
this.executorService = executorService;
}
public int getThreadAlive() {
return threadAlive;
}
public int getThreadNum() {
return threadNum;
}
private ExecutorService executorService;
public void execute(Runnable runnable) {
try {
reentrantLock.lock();
while (threadAlive >= threadNum) {
try {
condition.await();
} catch (InterruptedException e) {
}
}
threadAlive++;
executorService.execute(runnable);
} finally {
condition.notify();
threadAlive--;
reentrantLock.unlock();
}
}
public boolean isShutdown() {
return executorService.isShutdown();
}
public void shutdown() {
executorService.shutdown();
}
}
......@@ -19,7 +19,6 @@ public class ThreadUtils {
}
if (threadSize == 1) {
return MoreExecutors.sameThreadExecutor();
}
return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS,
new SynchronousQueue<Runnable>(), new ThreadPoolExecutor.CallerRunsPolicy());
......
......@@ -240,7 +240,7 @@ public class SpiderMonitor {
//Others will be registered
spiderMonitor.server().jmxStart();
oschinaSpider.start();
githubSpider.start();
githubSpider.thread(10).start();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment