Commit cf62d707 authored by yihua.huang

#36 Spider does not exit when success

parent a0131293
...@@ -186,8 +186,9 @@ public class Page { ...@@ -186,8 +186,9 @@ public class Page {
return rawText; return rawText;
} }
public void setRawText(String rawText) { public Page setRawText(String rawText) {
this.rawText = rawText; this.rawText = rawText;
return this;
} }
@Override @Override
......
...@@ -98,6 +98,8 @@ public class Spider implements Runnable, Task { ...@@ -98,6 +98,8 @@ public class Spider implements Runnable, Task {
private Condition newUrlCondition = newUrlLock.newCondition(); private Condition newUrlCondition = newUrlLock.newCondition();
private final AtomicInteger threadAlive = new AtomicInteger(0);
/** /**
* create a spider with pageProcessor. * create a spider with pageProcessor.
* *
...@@ -276,6 +278,7 @@ public class Spider implements Runnable, Task { ...@@ -276,6 +278,7 @@ public class Spider implements Runnable, Task {
} }
startRequests.clear(); startRequests.clear();
} }
threadAlive.set(0);
} }
@Override @Override
...@@ -283,7 +286,6 @@ public class Spider implements Runnable, Task { ...@@ -283,7 +286,6 @@ public class Spider implements Runnable, Task {
checkRunningStat(); checkRunningStat();
initComponent(); initComponent();
logger.info("Spider " + getUUID() + " started!"); logger.info("Spider " + getUUID() + " started!");
final AtomicInteger threadAlive = new AtomicInteger(0);
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this); Request request = scheduler.poll(this);
if (request == null) { if (request == null) {
...@@ -369,7 +371,7 @@ public class Spider implements Runnable, Task { ...@@ -369,7 +371,7 @@ public class Spider implements Runnable, Task {
return; return;
} }
// for cycle retry // for cycle retry
if (page.getHtml() == null) { if (page.getRawText() == null) {
extractAndAddRequests(page); extractAndAddRequests(page);
sleep(site.getSleepTime()); sleep(site.getSleepTime());
return; return;
...@@ -485,6 +487,10 @@ public class Spider implements Runnable, Task { ...@@ -485,6 +487,10 @@ public class Spider implements Runnable, Task {
private void waitNewUrl() { private void waitNewUrl() {
try { try {
newUrlLock.lock(); newUrlLock.lock();
//double check
if (threadAlive.get() == 0 && exitWhenComplete) {
return;
}
try { try {
newUrlCondition.await(); newUrlCondition.await();
} catch (InterruptedException e) { } catch (InterruptedException e) {
......
...@@ -2,8 +2,14 @@ package us.codecraft.webmagic; ...@@ -2,8 +2,14 @@ package us.codecraft.webmagic;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
...@@ -26,4 +32,62 @@ public class SpiderTest { ...@@ -26,4 +32,62 @@ public class SpiderTest {
spider.start(); spider.start();
Thread.sleep(10000); Thread.sleep(10000);
} }
/**
 * Stress test: executes {@link #testRound()} 10000 times in a row, printing
 * the round number before each run.
 *
 * <p>Ignored by default because of its running time. Presumably meant to
 * surface intermittent cases where the spider fails to exit after finishing
 * its work (issue #36) -- confirm against the issue tracker.
 *
 * @throws InterruptedException declared for callers; not thrown by the visible body
 */
@Ignore("long time")
@Test
public void testWaitAndNotify() throws InterruptedException {
    int round = 0;
    while (round < 10000) {
        System.out.println("round" + round);
        testRound();
        round++;
    }
}
/**
 * Runs a single spider to completion against in-memory stubs.
 *
 * <p>The spider is wired with a page processor that marks every page as
 * skipped, a downloader that fabricates an empty page without any network
 * access, and a scheduler that hands out a bounded number of dummy requests
 * while randomly reporting "nothing to poll". {@code spider.run()} is invoked
 * directly, so this method returns only once the spider's run loop returns.
 */
private void testRound() {
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            // No extraction is needed for this test; flag the page as skipped.
            page.setSkip(true);
        }

        @Override
        public Site getSite() {
            // Sleep time of 0 keeps each round as short as possible.
            return Site.me().setSleepTime(0);
        }
    }).setDownloader(new Downloader() {
        @Override
        public Page download(Request request, Task task) {
            // Empty but non-null raw text, produced without any I/O.
            return new Page().setRawText("");
        }

        @Override
        public void setThread(int threadNum) {
            // Nothing to configure for the stub downloader.
        }
    }).setScheduler(new Scheduler() {
        /** Number of times {@code poll} has been called so far. */
        private final AtomicInteger count = new AtomicInteger();
        /** Source of the random "no request available" answers. */
        private final Random random = new Random();

        @Override
        public void push(Request request, Task task) {
            // Newly discovered requests are deliberately discarded.
        }

        @Override
        public synchronized Request poll(Task task) {
            // After 1000 polls the scheduler is permanently exhausted.
            if (count.incrementAndGet() > 1000) {
                return null;
            }
            // Roughly 9% of the earlier polls (values 91..99 of 0..99) also
            // return null, so the spider sees "empty" answers while work remains.
            if (random.nextInt(100) > 90) {
                return null;
            }
            return new Request("test");
        }
    }).thread(10);
    // Called synchronously on the current thread rather than via a new Thread.
    spider.run();
}
} }
package us.codecraft.webmagic; package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class MockDownloader implements Downloader{ public class MockGithubDownloader implements Downloader{
private String html = "\n" + private String html = "\n" +
"\n" + "\n" +
......
...@@ -2,7 +2,7 @@ package us.codecraft.webmagic.model; ...@@ -2,7 +2,7 @@ package us.codecraft.webmagic.model;
import junit.framework.Assert; import junit.framework.Assert;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.MockDownloader; import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo; import us.codecraft.webmagic.example.GithubRepo;
...@@ -22,6 +22,6 @@ public class GithubRepoTest { ...@@ -22,6 +22,6 @@ public class GithubRepoTest {
Assert.assertEquals(86, o.getStar()); Assert.assertEquals(86, o.getStar());
Assert.assertEquals(70, o.getFork()); Assert.assertEquals(70, o.getFork());
} }
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); }, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
} }
} }
...@@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor; ...@@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor;
import junit.framework.Assert; import junit.framework.Assert;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.*; import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
...@@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor { ...@@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor {
Assert.assertEquals("78",((String)resultItems.get("star")).trim()); Assert.assertEquals("78",((String)resultItems.get("star")).trim());
Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
} }
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
} }
} }
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment