Commit 6428e205 authored by yihua.huang

add id

parent 632ca0ef
......@@ -12,13 +12,18 @@ public class Site {
private String domain;
/**
* used to identify a task
*/
private String identifier;
private String userAgent;
private Map<String,String> cookies = new LinkedHashMap<String, String>();
private Map<String, String> cookies = new LinkedHashMap<String, String>();
private String encoding;
private List<String> startUrls;
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
......@@ -34,8 +39,8 @@ public class Site {
return new Site();
}
public Site setCookie(String name,String value) {
cookies.put(name,value);
public Site setCookie(String name, String value) {
cookies.put(name, value);
return this;
}
......@@ -44,7 +49,7 @@ public class Site {
return this;
}
public Map<String,String> getCookies() {
public Map<String, String> getCookies() {
return cookies;
}
......@@ -61,6 +66,15 @@ public class Site {
return this;
}
/**
 * Returns the identifier used to identify a task.
 * NOTE(review): the field declaration shows no initializer, so this is
 * presumably null until setIdentifier is called - confirm.
 *
 * @return the current value of the identifier field
 */
public String getIdentifier() {
return identifier;
}
/**
 * Sets the identifier used to identify a task.
 *
 * @param identifier the value to store in the identifier field
 * @return this Site, so that setter calls can be chained
 */
public Site setIdentifier(String identifier) {
this.identifier = identifier;
return this;
}
/**
 * Returns the encoding configured for this site.
 *
 * @return the current value of the encoding field
 */
public String getEncoding() {
return encoding;
}
......
......@@ -85,7 +85,6 @@ public class Spider implements Runnable {
Thread.sleep(time);
} catch (InterruptedException e) {
e.printStackTrace();
;
}
}
......
......@@ -33,7 +33,7 @@ public class FilePipeline implements Pipeline {
public void process(Page page, Site site) {
String domain = site.getDomain();
domain = UrlUtils.getDomain(domain);
String path = this.path + "" + domain + "/";
String path = this.path + "" + domain + "#" + site.getIdentifier() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
......@@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline {
}
printWriter.close();
} catch (IOException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
e.printStackTrace();
}
}
......
......@@ -60,7 +60,7 @@ public class FileCacheQueueSchedular implements Schedular {
private void init() {
File file = new File(filePath);
if (!file.exists()){
if (!file.exists()) {
file.mkdirs();
}
readFile();
......@@ -81,8 +81,8 @@ public class FileCacheQueueSchedular implements Schedular {
private void initWriter() {
try {
fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true));
fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false));
fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true));
fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false));
} catch (IOException e) {
throw new RuntimeException("init cache schedular error", e);
}
......@@ -100,7 +100,7 @@ public class FileCacheQueueSchedular implements Schedular {
private void readUrlFile() throws IOException {
String line;
BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName));
BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
urls.add(line.trim());
......@@ -112,7 +112,7 @@ public class FileCacheQueueSchedular implements Schedular {
}
private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor));
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
String line = null;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
......@@ -120,8 +120,12 @@ public class FileCacheQueueSchedular implements Schedular {
}
}
/**
 * Builds the path of a cache file for the current site by concatenating
 * filePath, the site's domain, a "#" separator, the site's identifier and
 * the given file name suffix.
 * NOTE(review): if the site's identifier is null, string concatenation
 * renders it as the literal text "null" - confirm that this is acceptable.
 *
 * @param filename the suffix appended after the domain and identifier
 * @return the concatenated file path
 */
private String getFileName(String filename) {
return filePath + site.getDomain() + "#" + site.getIdentifier() + filename;
}
@Override
public synchronized void push(Request request,Site site) {
public synchronized void push(Request request, Site site) {
if (!inited.get()) {
init();
}
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
......@@ -17,7 +16,6 @@ import java.io.IOException;
*/
public class DiaoyuwengProcessorTest {
@Ignore
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment