Commit cf62d707 authored by yihua.huang

#36 Spider does not exit when success

parent a0131293
......@@ -186,8 +186,9 @@ public class Page {
return rawText;
}
public void setRawText(String rawText) {
public Page setRawText(String rawText) {
this.rawText = rawText;
return this;
}
@Override
......
......@@ -98,6 +98,8 @@ public class Spider implements Runnable, Task {
private Condition newUrlCondition = newUrlLock.newCondition();
private final AtomicInteger threadAlive = new AtomicInteger(0);
/**
* create a spider with pageProcessor.
*
......@@ -276,6 +278,7 @@ public class Spider implements Runnable, Task {
}
startRequests.clear();
}
threadAlive.set(0);
}
@Override
......@@ -283,7 +286,6 @@ public class Spider implements Runnable, Task {
checkRunningStat();
initComponent();
logger.info("Spider " + getUUID() + " started!");
final AtomicInteger threadAlive = new AtomicInteger(0);
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
if (request == null) {
......@@ -369,7 +371,7 @@ public class Spider implements Runnable, Task {
return;
}
// for cycle retry
if (page.getHtml() == null) {
if (page.getRawText() == null) {
extractAndAddRequests(page);
sleep(site.getSleepTime());
return;
......@@ -485,6 +487,10 @@ public class Spider implements Runnable, Task {
private void waitNewUrl() {
try {
newUrlLock.lock();
//double check
if (threadAlive.get() == 0 && exitWhenComplete) {
return;
}
try {
newUrlCondition.await();
} catch (InterruptedException e) {
......
......@@ -2,8 +2,14 @@ package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author code4crafter@gmail.com
......@@ -26,4 +32,62 @@ public class SpiderTest {
spider.start();
Thread.sleep(10000);
}
/**
 * Stress test: calls {@code testRound()} 10000 times in a row, printing the
 * round index before each call. Ignored by default because of its running time.
 *
 * @throws InterruptedException declared for the test runner
 */
@Ignore("long time")
@Test
public void testWaitAndNotify() throws InterruptedException {
    int round = 0;
    while (round < 10000) {
        System.out.println("round" + round);
        testRound();
        round++;
    }
}
/**
 * Builds a spider from stub components and runs it once on the calling thread.
 *
 * <ul>
 *   <li>The page processor only marks every page as skipped.</li>
 *   <li>The downloader returns a fresh page with empty (non-null) raw text and
 *       performs no I/O.</li>
 *   <li>The scheduler ignores pushed requests and hands out a fixed
 *       {@code "test"} request, interleaved with random {@code null} results;
 *       once it has been polled more than 1000 times it only returns
 *       {@code null}.</li>
 * </ul>
 *
 * The spider is configured with 10 threads. Presumably {@code run()} is expected
 * to return once the scheduler is drained and the workers are idle, which is
 * what the repeated rounds in the caller exercise -- confirm against
 * {@code Spider}.
 */
private void testRound() {
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            // Nothing to extract in this test; just flag the page as skipped.
            page.setSkip(true);
        }

        @Override
        public Site getSite() {
            // No delay between requests so a round finishes quickly.
            return Site.me().setSleepTime(0);
        }
    }).setDownloader(new Downloader() {
        @Override
        public Page download(Request request, Task task) {
            // Raw text is set to an empty string rather than left unset.
            return new Page().setRawText("");
        }

        @Override
        public void setThread(int threadNum) {
            // The stub downloader has no thread pool to size.
        }
    }).setScheduler(new Scheduler() {
        // Number of poll() calls seen so far.
        private final AtomicInteger count = new AtomicInteger();
        private final Random random = new Random();

        @Override
        public void push(Request request, Task task) {
            // Discovered URLs are discarded; poll() fabricates its own requests.
        }

        @Override
        public synchronized Request poll(Task task) {
            // After 1000 polls the scheduler is permanently empty.
            if (count.incrementAndGet() > 1000) {
                return null;
            }
            // nextInt(100) is in [0, 99], so this is true for 91..99: roughly 9% of
            // polls report "no request right now" while more work is still coming.
            if (random.nextInt(100) > 90) {
                return null;
            }
            return new Request("test");
        }
    }).thread(10);
    spider.run();
}
}
package us.codecraft.webmagic;
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
/**
* @author code4crafter@gmail.com
*/
public class MockDownloader implements Downloader{
public class MockGithubDownloader implements Downloader{
private String html = "\n" +
"\n" +
......
......@@ -2,7 +2,7 @@ package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo;
......@@ -22,6 +22,6 @@ public class GithubRepoTest {
Assert.assertEquals(86, o.getStar());
Assert.assertEquals(70, o.getFork());
}
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
}, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
}
......@@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
......@@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor {
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
}
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
}).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment