Commit 6428e205 authored by yihua.huang

add id

parent 632ca0ef
...@@ -12,13 +12,18 @@ public class Site { ...@@ -12,13 +12,18 @@ public class Site {
private String domain; private String domain;
/**
* Used to identify a task.
*/
private String identifier;
private String userAgent; private String userAgent;
private Map<String,String> cookies = new LinkedHashMap<String, String>(); private Map<String, String> cookies = new LinkedHashMap<String, String>();
private String encoding; private String encoding;
private List<String> startUrls; private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000; private int sleepTime = 3000;
...@@ -34,8 +39,8 @@ public class Site { ...@@ -34,8 +39,8 @@ public class Site {
return new Site(); return new Site();
} }
public Site setCookie(String name,String value) { public Site setCookie(String name, String value) {
cookies.put(name,value); cookies.put(name, value);
return this; return this;
} }
...@@ -44,7 +49,7 @@ public class Site { ...@@ -44,7 +49,7 @@ public class Site {
return this; return this;
} }
public Map<String,String> getCookies() { public Map<String, String> getCookies() {
return cookies; return cookies;
} }
...@@ -61,6 +66,15 @@ public class Site { ...@@ -61,6 +66,15 @@ public class Site {
return this; return this;
} }
/**
 * Returns the identifier currently stored on this site.
 *
 * @return the value of the {@code identifier} field, as last assigned
 */
public String getIdentifier() {
    return this.identifier;
}
/**
 * Stores the given identifier on this site.
 *
 * @param identifier the value to assign to the {@code identifier} field
 * @return this instance, so that further calls can be chained
 */
public Site setIdentifier(String identifier) {
this.identifier = identifier;
return this;
}
public String getEncoding() { public String getEncoding() {
return encoding; return encoding;
} }
......
...@@ -85,7 +85,6 @@ public class Spider implements Runnable { ...@@ -85,7 +85,6 @@ public class Spider implements Runnable {
Thread.sleep(time); Thread.sleep(time);
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
;
} }
} }
......
...@@ -33,7 +33,7 @@ public class FilePipeline implements Pipeline { ...@@ -33,7 +33,7 @@ public class FilePipeline implements Pipeline {
public void process(Page page, Site site) { public void process(Page page, Site site) {
String domain = site.getDomain(); String domain = site.getDomain();
domain = UrlUtils.getDomain(domain); domain = UrlUtils.getDomain(domain);
String path = this.path + "" + domain + "/"; String path = this.path + "" + domain + "#" + site.getIdentifier() + "/";
File file = new File(path); File file = new File(path);
if (!file.exists()) { if (!file.exists()) {
file.mkdirs(); file.mkdirs();
...@@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline { ...@@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline {
} }
printWriter.close(); printWriter.close();
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. e.printStackTrace();
} }
} }
......
...@@ -60,7 +60,7 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -60,7 +60,7 @@ public class FileCacheQueueSchedular implements Schedular {
private void init() { private void init() {
File file = new File(filePath); File file = new File(filePath);
if (!file.exists()){ if (!file.exists()) {
file.mkdirs(); file.mkdirs();
} }
readFile(); readFile();
...@@ -81,8 +81,8 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -81,8 +81,8 @@ public class FileCacheQueueSchedular implements Schedular {
private void initWriter() { private void initWriter() {
try { try {
fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true)); fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true));
fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false)); fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false));
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("init cache schedular error", e); throw new RuntimeException("init cache schedular error", e);
} }
...@@ -100,7 +100,7 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -100,7 +100,7 @@ public class FileCacheQueueSchedular implements Schedular {
private void readUrlFile() throws IOException { private void readUrlFile() throws IOException {
String line; String line;
BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName)); BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
int lineReaded = 0; int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) { while ((line = fileUrlReader.readLine()) != null) {
urls.add(line.trim()); urls.add(line.trim());
...@@ -112,7 +112,7 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -112,7 +112,7 @@ public class FileCacheQueueSchedular implements Schedular {
} }
private void readCursorFile() throws IOException { private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor)); BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
String line = null; String line = null;
//read the last number //read the last number
while ((line = fileCursorReader.readLine()) != null) { while ((line = fileCursorReader.readLine()) != null) {
...@@ -120,8 +120,12 @@ public class FileCacheQueueSchedular implements Schedular { ...@@ -120,8 +120,12 @@ public class FileCacheQueueSchedular implements Schedular {
} }
} }
/**
 * Builds the name of a cache file for the current site.
 * The result is {@code filePath}, the site's domain, a {@code "#"}
 * separator, the site's identifier and the given suffix, joined in that
 * order with nothing else in between.
 *
 * A null identifier is rendered as the text "null", exactly as plain
 * string concatenation would render it.
 *
 * @param filename suffix appended after the identifier
 * @return the concatenated file name
 */
private String getFileName(String filename) {
    StringBuilder name = new StringBuilder();
    name.append(filePath);
    name.append(site.getDomain());
    name.append("#");
    name.append(site.getIdentifier());
    name.append(filename);
    return name.toString();
}
@Override @Override
public synchronized void push(Request request,Site site) { public synchronized void push(Request request, Site site) {
if (!inited.get()) { if (!inited.get()) {
init(); init();
} }
......
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
...@@ -17,7 +16,6 @@ import java.io.IOException; ...@@ -17,7 +16,6 @@ import java.io.IOException;
*/ */
public class DiaoyuwengProcessorTest { public class DiaoyuwengProcessorTest {
@Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment