Commit b75e64a6 authored by yihua.huang

t push origin masterMerge branch 'yxssfxwzy-proxy'

parents eb89d665 074d767f
...@@ -424,6 +424,8 @@ public class Spider implements Runnable, Task { ...@@ -424,6 +424,8 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this); pipeline.process(page.getResultItems(), this);
} }
} }
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime()); sleep(site.getSleepTime());
} }
......
...@@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit; ...@@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
/** /**
* >>>>Proxy Status * >>>> Proxy lifecycle
+----------+ +-----+ +----------+ +-----+
| last use | | new | | last use | | new |
+-----+----+ +---+-+ +-----+----+ +---+-+
...@@ -44,13 +45,22 @@ import org.apache.http.HttpHost; ...@@ -44,13 +45,22 @@ import org.apache.http.HttpHost;
| |+-------------------+ | |+-------------------+
+--------+ +--------+
*/ */
/**
* Object has these status of lifecycle above.<br>
*
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see ProxyPool
*/
public class Proxy implements Delayed, Serializable { public class Proxy implements Delayed, Serializable {
private static final long serialVersionUID = 228939737383625551L; private static final long serialVersionUID = 228939737383625551L;
public static final int ERROR_403 = 403; public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404; public static final int ERROR_404 = 404;
public static final int ERROR_BANNED = 10000; public static final int ERROR_BANNED = 10000;// banned by website
public static final int ERROR_Proxy = 10001; public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200; public static final int SUCCESS = 200;
private final HttpHost httpHost; private final HttpHost httpHost;
...@@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable { ...@@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable {
private Long canReuseTime = 0L; private Long canReuseTime = 0L;
private Long lastBorrowTime = System.currentTimeMillis(); private Long lastBorrowTime = System.currentTimeMillis();
private Long responseTime = 0L; private Long responseTime = 0L;
private Long idleTime = 0L;
private int failedNum = 0; private int failedNum = 0;
private int successNum = 0; private int successNum = 0;
...@@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable { ...@@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable {
@Override @Override
public long getDelay(TimeUnit unit) { public long getDelay(TimeUnit unit) {
return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS); return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
} }
@Override @Override
......
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost; import java.io.File;
import org.slf4j.Logger; import java.io.FileInputStream;
import org.slf4j.LoggerFactory; import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.*; import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.*; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Timer;
import java.util.TimerTask;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue; import java.util.concurrent.DelayQueue;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.ProxyUtils;
/** /**
* ClassName:ProxyPool * Pooled Proxy Object
* *
* @see * @author yxssfxwzy@sina.com <br>
* @Function: TODO ADD FUNCTION * @since 0.5.1
* @author ch * @see Proxy
* @version Ver 1.0
* @Date 2014-2-14 下午01:10:04
*/ */
public class ProxyPool { public class ProxyPool {
...@@ -31,10 +44,14 @@ public class ProxyPool { ...@@ -31,10 +44,14 @@ public class ProxyPool {
private int reuseInterval = 1500;// ms private int reuseInterval = 1500;// ms
private int reviveTime = 2 * 60 * 60 * 1000;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms
private int saveProxyInterval = 10 * 60 * 1000;// ms
private boolean isEnable = false; private boolean isEnable = false;
private boolean validateWhenInit = false; private boolean validateWhenInit = false;
private String proxyFile = "data/lastUse.proxy"; // private boolean isUseLastProxy = true;
private String proxyFilePath = "/data/webmagic/lastUse.proxy";
private FilePersistentBase fBase = new FilePersistentBase();
private Timer timer = new Timer(true); private Timer timer = new Timer(true);
private TimerTask saveProxyTask = new TimerTask() { private TimerTask saveProxyTask = new TimerTask() {
...@@ -47,13 +64,46 @@ public class ProxyPool { ...@@ -47,13 +64,46 @@ public class ProxyPool {
}; };
public ProxyPool() { public ProxyPool() {
this(null, true);
} }
public ProxyPool(List<String[]> httpProxyList) { public ProxyPool(List<String[]> httpProxyList) {
readProxyList(); this(httpProxyList, true);
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); }
timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000);
/**
 * Creates a pool from an optional seed list and, if requested, from the proxies
 * persisted by a previous run.
 *
 * @param httpProxyList  seed proxies as {host, port} string pairs; may be
 *                       {@code null}, in which case no seed proxies are added
 * @param isUseLastProxy when {@code true}, the persisted proxy file is read and
 *                       the periodic save task is scheduled
 */
public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
    if (httpProxyList != null) {
        addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
    }
    if (isUseLastProxy) {
        // Only fall back to the temp-dir location when the configured file is
        // missing. The previous code called setFilePath() a second time
        // unconditionally, which made this check dead and always discarded the
        // configured proxyFilePath.
        if (!new File(proxyFilePath).exists()) {
            setFilePath();
        }
        readProxyList();
        // Delay 0: the first save runs right away, then every saveProxyInterval ms.
        timer.schedule(saveProxyTask, 0, saveProxyInterval);
    }
}
private void setFilePath() {
String tmpDir = System.getProperty("java.io.tmpdir");
String path = tmpDir + "webmagic\\lastUse.proxy";
if (tmpDir != null && new File(tmpDir).isDirectory()) {
fBase.setPath(tmpDir + "webmagic");
File f = fBase.getFile(path);
if (!f.exists()) {
try {
f.createNewFile();
} catch (IOException e) {
logger.error("proxy file create error", e);
}
}
} else {
logger.error("java tmp dir not exists");
}
this.proxyFilePath = path;
} }
private void saveProxyList() { private void saveProxyList() {
...@@ -61,7 +111,7 @@ public class ProxyPool { ...@@ -61,7 +111,7 @@ public class ProxyPool {
return; return;
} }
try { try {
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile)); ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
os.writeObject(prepareForSaving()); os.writeObject(prepareForSaving());
os.close(); os.close();
logger.info("save proxy"); logger.info("save proxy");
...@@ -84,15 +134,15 @@ public class ProxyPool { ...@@ -84,15 +134,15 @@ public class ProxyPool {
private void readProxyList() { private void readProxyList() {
try { try {
ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile)); ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
addProxy((Map<String, Proxy>) is.readObject()); addProxy((Map<String, Proxy>) is.readObject());
is.close(); is.close();
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
logger.error("proxy file not found", e); logger.info("last use proxy file not found", e);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); // e.printStackTrace();
} catch (ClassNotFoundException e) { } catch (ClassNotFoundException e) {
e.printStackTrace(); // e.printStackTrace();
} }
} }
...@@ -103,7 +153,7 @@ public class ProxyPool { ...@@ -103,7 +153,7 @@ public class ProxyPool {
if (allProxy.containsKey(entry.getKey())) { if (allProxy.containsKey(entry.getKey())) {
continue; continue;
} }
if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) { if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
entry.getValue().setFailedNum(0); entry.getValue().setFailedNum(0);
entry.getValue().setReuseTimeInterval(reuseInterval); entry.getValue().setReuseTimeInterval(reuseInterval);
proxyQueue.add(entry.getValue()); proxyQueue.add(entry.getValue());
...@@ -124,7 +174,7 @@ public class ProxyPool { ...@@ -124,7 +174,7 @@ public class ProxyPool {
continue; continue;
} }
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
if (!validateWhenInit || ProxyUtil.validateProxy(item)) { if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval); Proxy p = new Proxy(item, reuseInterval);
proxyQueue.add(p); proxyQueue.add(p);
allProxy.put(s[0], p); allProxy.put(s[0], p);
...@@ -173,7 +223,7 @@ public class ProxyPool { ...@@ -173,7 +223,7 @@ public class ProxyPool {
p.successNumIncrement(1); p.successNumIncrement(1);
break; break;
case Proxy.ERROR_403: case Proxy.ERROR_403:
// banned,try larger interval // banned,try longer interval
p.fail(Proxy.ERROR_403); p.fail(Proxy.ERROR_403);
p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
...@@ -185,7 +235,7 @@ public class ProxyPool { ...@@ -185,7 +235,7 @@ public class ProxyPool {
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break; break;
case Proxy.ERROR_404: case Proxy.ERROR_404:
//p.fail(Proxy.ERROR_404); // p.fail(Proxy.ERROR_404);
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
break; break;
default: default:
...@@ -193,14 +243,12 @@ public class ProxyPool { ...@@ -193,14 +243,12 @@ public class ProxyPool {
break; break;
} }
if (p.getFailedNum() > 20) { if (p.getFailedNum() > 20) {
// allProxy.remove(host.getAddress().getHostAddress());
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
} }
if (p.getFailedNum()%5==0) { if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
if (!ProxyUtil.validateProxy(host)) { if (!ProxyUtils.validateProxy(host)) {
// allProxy.remove(host.getAddress().getHostAddress());
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
...@@ -219,7 +267,6 @@ public class ProxyPool { ...@@ -219,7 +267,6 @@ public class ProxyPool {
re += entry.getValue().toString() + "\n"; re += entry.getValue().toString() + "\n";
} }
return re; return re;
} }
public int getIdleNum() { public int getIdleNum() {
...@@ -234,57 +281,44 @@ public class ProxyPool { ...@@ -234,57 +281,44 @@ public class ProxyPool {
this.reuseInterval = reuseInterval; this.reuseInterval = reuseInterval;
} }
public static List<String[]> getProxyList() { public void enable(boolean isEnable) {
List<String[]> proxyList = new ArrayList<String[]>(); this.isEnable = isEnable;
BufferedReader br = null; }
try {
br = new BufferedReader(new FileReader(new File("proxy.txt")));
String line = ""; public boolean isEnable() {
while ((line = br.readLine()) != null) { return isEnable;
proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return proxyList;
} }
public static void main(String[] args) throws IOException { public int getReviveTime() {
ProxyPool proxyPool = new ProxyPool(getProxyList()); return reviveTime;
proxyPool.setReuseInterval(10000); }
// proxyPool.saveProxyList();
while (true) {
List<HttpHost> httphostList = new ArrayList<HttpHost>();
System.in.read();
int i = 0;
while (proxyPool.getIdleNum() > 2) {
HttpHost httphost = proxyPool.getProxy();
httphostList.add(httphost);
// proxyPool.proxyPool.use(httphost);
proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
i++;
}
System.out.println(proxyPool.allProxyStatus());
System.in.read();
for (i = 0; i < httphostList.size(); i++) {
proxyPool.returnProxy(httphostList.get(i), 200);
proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
}
System.out.println(proxyPool.allProxyStatus());
System.in.read();
}
public void setReviveTime(int reviveTime) {
this.reviveTime = reviveTime;
} }
public void enable(boolean isEnable) { public boolean isValidateWhenInit() {
this.isEnable = isEnable; return validateWhenInit;
} }
public boolean isEnable() { public void validateWhenInit(boolean validateWhenInit) {
return isEnable; this.validateWhenInit = validateWhenInit;
}
public int getSaveProxyInterval() {
return saveProxyInterval;
} }
public void setSaveProxyInterval(int saveProxyInterval) {
this.saveProxyInterval = saveProxyInterval;
}
public String getProxyFilePath() {
return proxyFilePath;
}
public void setProxyFilePath(String proxyFilePath) {
this.proxyFilePath = proxyFilePath;
}
} }
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.utils;
import java.io.IOException; import java.io.IOException;
import java.net.Inet6Address; import java.net.Inet6Address;
...@@ -7,36 +7,54 @@ import java.net.InetSocketAddress; ...@@ -7,36 +7,54 @@ import java.net.InetSocketAddress;
import java.net.NetworkInterface; import java.net.NetworkInterface;
import java.net.Socket; import java.net.Socket;
import java.net.SocketException; import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.regex.Pattern;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
/** /**
* ClassName:ProxyUtil * Pooled Proxy Object
* *
* @see * @author yxssfxwzy@sina.com <br>
* @author ch * @since 0.5.1
* @version Ver 1.0
* @Date 2014-2-16 下午04:20:07
*/ */
public class ProxyUtil {
// TODO 改为单例 public class ProxyUtils {
private static InetAddress localAddr; private static InetAddress localAddr;
private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class); private static String networkInterface = "eth7";
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
static { static {
init(); init();
} }
private static void init() { private static void init() {
// first way to get local IP
try {
localAddr = InetAddress.getLocalHost();
logger.info("local IP:" + localAddr.getHostAddress());
} catch (UnknownHostException e) {
logger.info("try again\n");
}
if (localAddr != null) {
return;
}
// other way to get local IP
Enumeration<InetAddress> localAddrs; Enumeration<InetAddress> localAddrs;
try { try {
NetworkInterface ni = NetworkInterface.getByName("eth7"); // modify your network interface name
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
if (ni == null) { if (ni == null) {
logger.error("choose NetworkInterface\n" + getNetworkInterface()); return;
} }
localAddrs = ni.getInetAddresses(); localAddrs = ni.getInetAddresses();
if (localAddrs == null || !localAddrs.hasMoreElements()) {
logger.error("choose NetworkInterface\n" + getNetworkInterface());
return;
}
while (localAddrs.hasMoreElements()) { while (localAddrs.hasMoreElements()) {
InetAddress tmp = localAddrs.nextElement(); InetAddress tmp = localAddrs.nextElement();
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) { if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
...@@ -49,12 +67,11 @@ public class ProxyUtil { ...@@ -49,12 +67,11 @@ public class ProxyUtil {
logger.error("Failure when init ProxyUtil", e); logger.error("Failure when init ProxyUtil", e);
logger.error("choose NetworkInterface\n" + getNetworkInterface()); logger.error("choose NetworkInterface\n" + getNetworkInterface());
} }
} }
public static boolean validateProxy(HttpHost p) { public static boolean validateProxy(HttpHost p) {
if (localAddr == null) { if (localAddr == null) {
logger.error("cannot get local ip"); logger.error("cannot get local IP");
return false; return false;
} }
boolean isReachable = false; boolean isReachable = false;
...@@ -81,7 +98,8 @@ public class ProxyUtil { ...@@ -81,7 +98,8 @@ public class ProxyUtil {
} }
private static String getNetworkInterface() { private static String getNetworkInterface() {
String networkInterfaceName = "";
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
Enumeration<NetworkInterface> enumeration = null; Enumeration<NetworkInterface> enumeration = null;
try { try {
enumeration = NetworkInterface.getNetworkInterfaces(); enumeration = NetworkInterface.getNetworkInterfaces();
...@@ -90,10 +108,14 @@ public class ProxyUtil { ...@@ -90,10 +108,14 @@ public class ProxyUtil {
} }
while (enumeration.hasMoreElements()) { while (enumeration.hasMoreElements()) {
NetworkInterface networkInterface = enumeration.nextElement(); NetworkInterface networkInterface = enumeration.nextElement();
networkInterfaceName += networkInterface.toString() + '\n';
Enumeration<InetAddress> addr = networkInterface.getInetAddresses(); Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
while (addr.hasMoreElements()) { while (addr.hasMoreElements()) {
networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n"; String s = addr.nextElement().getHostAddress();
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
}
} }
} }
return networkInterfaceName; return networkInterfaceName;
......
package us.codecraft.webmagic.proxy;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
import us.codecraft.webmagic.Request;
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
*/
public class ProxyTest {

    /** Seed proxies shared by all tests, as {host, port} string pairs. */
    private static List<String[]> httpProxyList = new ArrayList<String[]>();

    /** Fills the shared seed list from four placeholder "host:port" entries. */
    @BeforeClass
    public static void before() {
        String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
        for (String line : source) {
            // Split once per entry instead of once per field.
            String[] hostAndPort = line.split(":");
            httpProxyList.add(new String[] { hostAndPort[0], hostAndPort[1] });
        }
    }

    @Test
    public void testAddProxy() {
        // TODO: no assertions yet; kept so the test name stays discoverable.
    }

    /**
     * Borrows every idle proxy, hands each to a worker thread, returns them all
     * as successful, and repeats the cycle once more.
     */
    @Test
    public void testProxy() {
        ProxyPool proxyPool = new ProxyPool(httpProxyList);
        proxyPool.setReuseInterval(500);
        assertThat(proxyPool.getIdleNum()).isEqualTo(4);
        assertThat(new File(proxyPool.getProxyFilePath()).exists()).isEqualTo(true);
        for (int i = 0; i < 2; i++) {
            List<Fetch> fetchList = new ArrayList<Fetch>();
            while (proxyPool.getIdleNum() != 0) {
                HttpHost httphost = proxyPool.getProxy();
                System.out.println(httphost.getHostName() + ":" + httphost.getPort());
                Fetch tmp = new Fetch(httphost);
                tmp.start();
                fetchList.add(tmp);
            }
            for (Fetch fetch : fetchList) {
                proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
            }
            System.out.println(proxyPool.allProxyStatus());
        }
    }

    /**
     * Worker thread that simulates fetching a page through the given proxy.
     * Declared static because it never touches the enclosing test instance.
     */
    static class Fetch extends Thread {

        final HttpHost hp;

        public Fetch(HttpHost hp) {
            this.hp = hp;
        }

        @Override
        public void run() {
            try {
                System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
                sleep(500);
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of swallowing the interruption.
                Thread.currentThread().interrupt();
            }
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment