在进入今天的正题之前,先来简单介绍下Zookeeper:
Zookeeper是一个分布式应用程序协调服务,用于保证数据的一致性,其提供的功能包括:配置维护、命名服务、分布式同步、组服务等。
watch监控机制是zookeeper的关键技术之一,本文将通过zk的部分源码来简单了解下watch机制的实现原理。
watch监控机制的实现原理
当今时代,发布订阅场景到处可见,像微信中的公众号消息订阅,或者网购场景下库存消息的订阅通知等等,这些都是属于发布订阅的场景。
watch监控机制是zk的一个关键技术,zk通过它来实现发布订阅的功能,通过watch我们可以联想到设计模式中的观察者模式,二者确实有点类似,你可以将其看成是分布式场景下的观察者模式。
客户端watch的注册和回调
客户端watch注册实现过程:
发送一个带有watch事件的请求——>DataWatchRegistration保存watch事件——>将请求封装成Packet并放入一个队列等待发送——>调用SendThread中的readResponse——>ZKWatchManager将该watch事件进行存储
//Zookeeper.java
/**
 * Reads the data stored at {@code path}, optionally registering a data watch.
 *
 * @param path    client-side node path; validated before anything is sent
 * @param watcher watcher to register for this path, or {@code null} for no watch
 * @param stat    if non-null, receives a copy of the stat returned by the server
 * @return the data bytes carried by the server response
 * @throws KeeperException      if the reply header carries a non-zero error code
 * @throws InterruptedException if the calling thread is interrupted while waiting for the reply
 */
public byte[] getData(String path, Watcher watcher, Stat stat) throws KeeperException, InterruptedException {
    PathUtils.validatePath(path);

    // Register the watch: the registration object travels with the request.
    final boolean wantsWatch = watcher != null;
    final ZooKeeper.WatchRegistration wcb =
            wantsWatch ? new ZooKeeper.DataWatchRegistration(watcher, path) : null;

    final String serverPath = this.prependChroot(path);

    final RequestHeader h = new RequestHeader();
    // Request type 4 (presumably the get-data opcode; confirm against ZooDefs.OpCode).
    h.setType(4);

    final GetDataRequest request = new GetDataRequest();
    request.setPath(serverPath);
    request.setWatch(wantsWatch);

    final GetDataResponse response = new GetDataResponse();
    final ReplyHeader r = this.cnxn.submitRequest(h, request, response, wcb);

    if (r.getErr() != 0) {
        throw KeeperException.create(Code.get(r.getErr()), path);
    }
    if (stat != null) {
        DataTree.copyStat(response.getStat(), stat);
    }
    return response.getData();
}
/**
 * Queues a request and blocks the caller until the resulting packet is marked finished.
 *
 * @param h                   request header
 * @param request             request record
 * @param response            record the reply is to be read into
 * @param watchRegistration   watch registration to attach to the packet, may be null
 * @param watchDeregistration watch deregistration to attach to the packet, may be null
 * @return the reply header that was handed to the queued packet
 * @throws InterruptedException if interrupted while waiting on the packet
 */
public ReplyHeader submitRequest(RequestHeader h, Record request, Record response, WatchRegistration watchRegistration, WatchDeregistration watchDeregistration) throws InterruptedException {
    final ReplyHeader reply = new ReplyHeader();
    final ClientCnxn.Packet queued = this.queuePacket(
            h, reply, request, response,
            (AsyncCallback) null, (String) null, (String) null, (Object) null,
            watchRegistration, watchDeregistration);

    // Wait on the packet's monitor; the loop re-checks the flag after every wake-up.
    synchronized (queued) {
        while (!queued.finished) {
            queued.wait();
        }
    }
    return reply;
}
/**
 * Wraps a request in a {@code Packet} and places it on the outgoing queue.
 *
 * <p>If the connection state is not alive, or the connection is already closing, the packet is
 * passed to {@code conLossPacket} instead of being queued.
 *
 * @return the packet that was created for this request
 */
public ClientCnxn.Packet queuePacket(RequestHeader h, ReplyHeader r, Record request, Record response, AsyncCallback cb, String clientPath, String serverPath, Object ctx, WatchRegistration watchRegistration, WatchDeregistration watchDeregistration) {
    final ClientCnxn.Packet packet = new ClientCnxn.Packet(h, r, request, response, watchRegistration);
    packet.cb = cb;
    packet.ctx = ctx;
    packet.clientPath = clientPath;
    packet.serverPath = serverPath;
    packet.watchDeregistration = watchDeregistration;

    synchronized (this.state) {
        if (!this.state.isAlive() || this.closing) {
            this.conLossPacket(packet);
        } else {
            // A request of type -11 flips the connection into the closing state
            // before the packet itself is queued.
            if (h.getType() == -11) {
                this.closing = true;
            }
            this.outgoingQueue.add(packet);
        }
    }

    // Tell the send thread's socket that a packet has been added.
    this.sendThread.getClientCnxnSocket().packetAdded();
    return packet;
}
class SendThread extends ZooKeeperThread {
.....
/**
 * Decodes one server reply from {@code incomingBuffer} and dispatches it by the xid in its header.
 *
 * <ul>
 *   <li>xid -2: ping response; only logged.</li>
 *   <li>xid -4: auth reply; on AUTHFAILED the client state becomes AUTH_FAILED and an
 *       AuthFailed event is queued.</li>
 *   <li>xid -1: notification; the event path is made relative to the chroot and the event is
 *       queued on the event thread.</li>
 *   <li>while {@code tunnelAuthInProgress()}: the payload is read as a SASL token and passed to
 *       the SASL client.</li>
 *   <li>otherwise: the reply is matched against the head of the pending queue and the packet is
 *       completed through {@code finishPacket}.</li>
 * </ul>
 *
 * @throws IOException if the pending queue is empty or the reply xid does not match the
 *                     xid of the packet at the head of the queue
 */
void readResponse(ByteBuffer incomingBuffer) throws IOException {
    final BinaryInputArchive archive =
            BinaryInputArchive.getArchive(new ByteBufferInputStream(incomingBuffer));
    final ReplyHeader replyHdr = new ReplyHeader();
    replyHdr.deserialize(archive, "header");
    final int xid = replyHdr.getXid();

    // Ping response.
    if (xid == -2) {
        if (ClientCnxn.LOG.isDebugEnabled()) {
            ClientCnxn.LOG.debug("Got ping response for sessionid: 0x" + Long.toHexString(ClientCnxn.this.sessionId) + " after " + (System.nanoTime() - this.lastPingSentNs) / 1000000L + "ms");
        }
        return;
    }

    // Auth reply.
    if (xid == -4) {
        if (replyHdr.getErr() == Code.AUTHFAILED.intValue()) {
            ClientCnxn.this.state = States.AUTH_FAILED;
            ClientCnxn.this.eventThread.queueEvent(new WatchedEvent(EventType.None, KeeperState.AuthFailed, (String) null));
        }
        if (ClientCnxn.LOG.isDebugEnabled()) {
            ClientCnxn.LOG.debug("Got auth sessionid:0x" + Long.toHexString(ClientCnxn.this.sessionId));
        }
        return;
    }

    // Notification pushed by the server.
    if (xid == -1) {
        if (ClientCnxn.LOG.isDebugEnabled()) {
            ClientCnxn.LOG.debug("Got notification sessionid:0x" + Long.toHexString(ClientCnxn.this.sessionId));
        }
        final WatcherEvent event = new WatcherEvent();
        event.deserialize(archive, "response");

        // Translate the server path back into a client path by stripping the chroot prefix.
        final String chroot = ClientCnxn.this.chrootPath;
        if (chroot != null) {
            final String serverPath = event.getPath();
            if (serverPath.compareTo(chroot) == 0) {
                event.setPath("/");
            } else if (serverPath.length() > chroot.length()) {
                event.setPath(serverPath.substring(chroot.length()));
            } else {
                ClientCnxn.LOG.warn("Got server path " + event.getPath() + " which is too short for chroot path " + ClientCnxn.this.chrootPath);
            }
        }

        final WatchedEvent we = new WatchedEvent(event);
        if (ClientCnxn.LOG.isDebugEnabled()) {
            ClientCnxn.LOG.debug("Got " + we + " for sessionid 0x" + Long.toHexString(ClientCnxn.this.sessionId));
        }
        ClientCnxn.this.eventThread.queueEvent(we);
        return;
    }

    // SASL exchange still running: the payload is a token for the SASL client.
    if (this.tunnelAuthInProgress()) {
        final GetSASLRequest request = new GetSASLRequest();
        request.deserialize(archive, "token");
        ClientCnxn.this.zooKeeperSaslClient.respondToServer(request.getToken(), ClientCnxn.this);
        return;
    }

    // Ordinary reply: it must belong to the packet at the head of the pending queue.
    final ClientCnxn.Packet packet;
    synchronized (ClientCnxn.this.pendingQueue) {
        if (ClientCnxn.this.pendingQueue.size() == 0) {
            throw new IOException("Nothing in the queue, but got " + replyHdr.getXid());
        }
        packet = (ClientCnxn.Packet) ClientCnxn.this.pendingQueue.remove();
    }

    try {
        if (packet.requestHeader.getXid() != replyHdr.getXid()) {
            packet.replyHeader.setErr(Code.CONNECTIONLOSS.intValue());
            throw new IOException("Xid out of order. Got Xid " + replyHdr.getXid() + " with err " + replyHdr.getErr() + " expected Xid " + packet.requestHeader.getXid() + " for a packet with details: " + packet);
        }

        packet.replyHeader.setXid(replyHdr.getXid());
        packet.replyHeader.setErr(replyHdr.getErr());
        packet.replyHeader.setZxid(replyHdr.getZxid());
        if (replyHdr.getZxid() > 0L) {
            ClientCnxn.this.lastZxid = replyHdr.getZxid();
        }

        // The response body is only decoded for successful replies.
        if (packet.response != null && replyHdr.getErr() == 0) {
            packet.response.deserialize(archive, "response");
        }

        if (ClientCnxn.LOG.isDebugEnabled()) {
            ClientCnxn.LOG.debug("Reading reply sessionid:0x" + Long.toHexString(ClientCnxn.this.sessionId) + ", packet:: " + packet);
        }
    } finally {
        // Runs on the mismatch path too, so the packet is always completed.
        ClientCnxn.this.finishPacket(packet);
    }
}
/**
 * Completes a packet: applies its watch registration / deregistration using the reply's error
 * code, then either wakes the waiting caller or hands the packet to the event thread.
 *
 * @param p the packet whose reply header has already been filled in
 */
private void finishPacket(ClientCnxn.Packet p) {
    final int err = p.replyHeader.getErr();

    if (p.watchRegistration != null) {
        p.watchRegistration.register(err);
    }

    if (p.watchDeregistration != null) {
        try {
            final Map<EventType, Set<Watcher>> removed = p.watchDeregistration.unregister(err);
            for (Entry<EventType, Set<Watcher>> entry : removed.entrySet()) {
                final Set<Watcher> watchers = entry.getValue();
                if (watchers.size() > 0) {
                    this.queueEvent(p.watchDeregistration.getClientPath(), err, watchers, entry.getKey());
                    // Once at least one watcher set was removed, the reply is reported as OK.
                    p.replyHeader.setErr(Code.OK.intValue());
                }
            }
        } catch (NoWatcherException noWatcher) {
            LOG.error("Failed to find watcher!", noWatcher);
            p.replyHeader.setErr(noWatcher.code().intValue());
        } catch (KeeperException keeperFailure) {
            LOG.error("Exception when removing watcher", keeperFailure);
            p.replyHeader.setErr(keeperFailure.code().intValue());
        }
    }

    if (p.cb != null) {
        // A callback is attached: mark finished and let the event thread handle the packet.
        p.finished = true;
        this.eventThread.queuePacket(p);
        return;
    }

    // No callback: a thread may be waiting on the packet's monitor, so wake it up.
    synchronized (p) {
        p.finished = true;
        p.notifyAll();
    }
}
客户端回调处理过程:
在SendThread.readResponse()中,xid为-1的响应会被当作watch通知处理——>反序列化出WatcherEvent并按chroot修正路径——>调用eventThread.queueEvent()将事件放入队列,等待回调
class SendThread extends ZooKeeperThread {
.....
// Excerpt of readResponse focused on the notification branch; lines consisting only of
// dots ("....") mark code that was left out of this listing.
void readResponse(ByteBuffer incomingBuffer) throws IOException {
ByteBufferInputStream bbis = new ByteBufferInputStream(incomingBuffer);
BinaryInputArchive bbia = BinaryInputArchive.getArchive(bbis);
ReplyHeader replyHdr = new ReplyHeader();
// The reply header is read first; its xid decides which branch handles the rest.
replyHdr.deserialize(bbia, "header");
if (replyHdr.getXid() == -2) {
....
} else if (replyHdr.getXid() == -1) {
// xid -1: the payload is a notification (see the debug message below).
if (ClientCnxn.LOG.isDebugEnabled()) {
ClientCnxn.LOG.debug("Got notification sessionid:0x" + Long.toHexString(ClientCnxn.this.sessionId));
}
WatcherEvent event = new WatcherEvent();
event.deserialize(bbia, "response");
// When a chroot is configured, rewrite the server path into a client-relative path.
if (ClientCnxn.this.chrootPath != null) {
String serverPath = event.getPath();
if (serverPath.compareTo(ClientCnxn.this.chrootPath) == 0) {
// The event is for the chroot node itself, which the client sees as "/".
event.setPath("/");
} else if (serverPath.length() > ClientCnxn.this.chrootPath.length()) {
// Strip the chroot prefix from the front of the server path.
event.setPath(serverPath.substring(ClientCnxn.this.chrootPath.length()));
} else {
// Path is shorter than the chroot: leave it untouched and only warn.
ClientCnxn.LOG.warn("Got server path " + event.getPath() + " which is too short for chroot path " + ClientCnxn.this.chrootPath);
}
}
WatchedEvent we = new WatchedEvent(event);
if (ClientCnxn.LOG.isDebugEnabled()) {
ClientCnxn.LOG.debug("Got " + we + " for sessionid 0x" + Long.toHexString(ClientCnxn.this.sessionId));
}
// Hand the event to the event thread's queue.
ClientCnxn.this.eventThread.queueEvent(we);
}
....
}
服务端watch的注册和触发
服务端watch注册实现过程:
通过FinalRequestProcessor中的processRequest解析请求——>判断请求是否带有watch注册事件(getDataRequest.getWatch())——>若有watch,则调用zks.getZKDatabase().getData并传入当前连接cnxn,将事件注册到WatchManager
//FinalRequestProcessor.java
// Excerpt of FinalRequestProcessor.processRequest: applies the request through
// zks.processTxn and, when the request has a connection attached, builds and sends the
// reply. Lines consisting only of dots ("...", "....") mark code left out of this listing,
// so the excerpt does not compile as shown.
public void processRequest(Request request) {
....
ProcessTxnResult rc = null;
synchronized(this.zks.outstandingChanges) {
rc = this.zks.processTxn(request);
if (request.getHdr() != null) {
TxnHeader hdr = request.getHdr();
Record txn = request.getTxn();
long zxid = hdr.getZxid();
// Drop every outstanding change whose zxid is not newer than this transaction's zxid.
while(!this.zks.outstandingChanges.isEmpty() && ((ChangeRecord)this.zks.outstandingChanges.get(0)).zxid <= zxid) {
ChangeRecord cr = (ChangeRecord)this.zks.outstandingChanges.remove(0);
if (cr.zxid < zxid) {
LOG.warn("Zxid outstanding " + cr.zxid + " is less than current " + zxid);
}
// Only remove the per-path entry if it still points at this exact record.
if (this.zks.outstandingChangesForPath.get(cr.path) == cr) {
this.zks.outstandingChangesForPath.remove(cr.path);
}
}
}
if (request.isQuorum()) {
this.zks.getZKDatabase().addCommittedProposal(request);
}
}
// Continue to the reply path unless this is a type -11 request whose connection was
// closed by the client and one of the two closeSession calls returned true.
if (request.type != -11 || !this.connClosedByClient(request) || !this.closeSession(this.zks.serverCnxnFactory, request.sessionId) && !this.closeSession(this.zks.secureServerCnxnFactory, request.sessionId)) {
if (request.cnxn != null) {
ServerCnxn cnxn = request.cnxn;
String lastOp = "NA";
this.zks.decInProcess();
Code err = Code.OK;
Object rsp = null;
try {
// A transaction header of type -1 carries an error: rethrow the stored exception,
// or build one from the ErrorTxn's error code.
if (request.getHdr() != null && request.getHdr().getType() == -1) {
if (request.getException() != null) {
throw request.getException();
}
throw KeeperException.create(Code.get(((ErrorTxn)request.getTxn()).getErr()));
}
KeeperException ke = request.getException();
// Request type 14 is exempt from the rethrow (presumably the multi opcode; confirm).
if (ke != null && request.type != 14) {
throw ke;
}
if (LOG.isDebugEnabled()) {
LOG.debug("{}", request);
}
boolean removed;
String msg;
WatcherType type;
Stat stat;
DataNode n;
List children;
// NOTE(review): "stat" is declared a second time here; this looks like a decompiler
// artifact and would not compile as written.
Stat stat;
label170:
switch(request.type) {
...
// Type 4: read a node's data (lastOp "GETD").
case 4:
lastOp = "GETD";
GetDataRequest getDataRequest = new GetDataRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, getDataRequest);
n = this.zks.getZKDatabase().getNode(getDataRequest.getPath());
if (n == null) {
throw new NoNodeException();
}
// ACL check with permission value 1 (presumably READ; confirm against ZooDefs.Perms).
PrepRequestProcessor.checkACL(this.zks, this.zks.getZKDatabase().aclForNode(n), 1, request.authInfo);
stat = new Stat();
// The connection is passed as the third argument only when the request asked for a
// watch; otherwise null is passed.
byte[] b = this.zks.getZKDatabase().getData(getDataRequest.getPath(), stat, getDataRequest.getWatch() ? cnxn : null);
rsp = new GetDataResponse(b, stat);
break;
....
} catch (SessionMovedException var15) {
....
}
// Build the reply header from the client's cxid, the last processed zxid and the error code.
long lastZxid = this.zks.getZKDatabase().getDataTreeLastProcessedZxid();
ReplyHeader hdr = new ReplyHeader(request.cxid, lastZxid, err.intValue());
this.zks.serverStats().updateLatency(request.createTime);
cnxn.updateStatsForResponse((long)request.cxid, lastZxid, lastOp, request.createTime, Time.currentElapsedTime());
try {
cnxn.sendResponse(hdr, (Record)rsp, "response");
// For a type -11 request, also send the close-session message after the reply.
if (request.type == -11) {
cnxn.sendCloseSession();
}
} catch (IOException var14) {
// A failure to send is logged and not rethrown.
LOG.error("FIXMSG", var14);
}
}
}
}
服务端watch事件的触发过程:
setData()对节点数据变更——>调用WatchManager.triggerWatch触发事件——>触发时将该路径上的watch从watchTable中移除(watch是一次性的)
//DataTree.java
/**
 * Replaces the data of the node at {@code path}, updates its stat, adjusts the byte count of
 * the enclosing quota prefix (if any) and fires the data watches for the path.
 *
 * @param path    path of the node to update
 * @param data    new data, may be null
 * @param version version to store in the node's stat
 * @param zxid    zxid to store as the node's mzxid
 * @param time    time to store as the node's mtime
 * @return a copy of the node's stat taken after the update
 * @throws NoNodeException if no node exists at {@code path}
 */
public Stat setData(String path, byte[] data, int version, long zxid, long time) throws NoNodeException {
    Stat s = new Stat();
    DataNode n = (DataNode) this.nodes.get(path);
    if (n == null) {
        throw new NoNodeException();
    }

    // Declared once: the listing previously declared this local twice, which does not compile.
    final byte[] lastdata;
    synchronized (n) {
        lastdata = n.data;
        n.data = data;
        n.stat.setMtime(time);
        n.stat.setMzxid(zxid);
        n.stat.setVersion(version);
        n.copyStat(s);
    }

    // Report the size difference between new and old data to the quota prefix, if one applies.
    String lastPrefix = this.getMaxPrefixWithQuota(path);
    if (lastPrefix != null) {
        int newLength = data == null ? 0 : data.length;
        int oldLength = lastdata == null ? 0 : lastdata.length;
        this.updateBytes(lastPrefix, (long) (newLength - oldLength));
    }

    this.dataWatches.triggerWatch(path, EventType.NodeDataChanged);
    return s;
}
//WatchManager.java
/**
 * Removes every watcher registered on {@code path} and delivers an event of the given type to
 * each of them, skipping watchers contained in {@code supress}.
 *
 * @param path    path whose watchers are triggered
 * @param type    event type to deliver
 * @param supress watchers that must not be notified, may be null
 * @return the set of watchers that was removed for the path (including suppressed ones),
 *         or {@code null} when the path had no watchers
 */
Set<Watcher> triggerWatch(String path, EventType type, Set<Watcher> supress) {
    final WatchedEvent e = new WatchedEvent(type, KeeperState.SyncConnected, path);

    final HashSet<Watcher> watchers;
    synchronized (this) {
        // Removing the entry up front is what makes each watch fire only once.
        watchers = (HashSet<Watcher>) this.watchTable.remove(path);
        if (watchers == null || watchers.isEmpty()) {
            if (LOG.isTraceEnabled()) {
                ZooTrace.logTraceMessage(LOG, 64L, "No watchers for " + path);
            }
            return null;
        }
        // Keep the reverse index (watcher -> paths) consistent with the removal above.
        for (Watcher w : watchers) {
            final HashSet<String> paths = (HashSet<String>) this.watch2Paths.get(w);
            if (paths != null) {
                paths.remove(path);
            }
        }
    }

    // Callbacks run after the synchronized block has been left.
    for (Watcher w : watchers) {
        if (supress != null && supress.contains(w)) {
            continue;
        }
        w.process(e);
    }
    return watchers;
}
//DataTree.java
/**
 * Deletes the node at {@code path}: removes it from the node map and from its parent, cleans up
 * the container / TTL / ephemeral bookkeeping, adjusts quota counters and fires the watches.
 *
 * @param path path of the node to delete
 * @param zxid zxid stored as the parent's pzxid
 * @throws NoNodeException if the node or its parent does not exist
 */
public void deleteNode(String path, long zxid) throws NoNodeException {
    final int lastSlash = path.lastIndexOf('/');
    final String parentName = path.substring(0, lastSlash);
    final String childName = path.substring(lastSlash + 1);

    final DataNode node = (DataNode) this.nodes.get(path);
    if (node == null) {
        throw new NoNodeException();
    }
    this.nodes.remove(path);
    synchronized (node) {
        this.aclCache.removeUsage(node.acl);
    }

    final DataNode parent = (DataNode) this.nodes.get(parentName);
    if (parent == null) {
        throw new NoNodeException();
    }

    synchronized (parent) {
        parent.removeChild(childName);
        parent.stat.setPzxid(zxid);

        // Remove the path from whichever bookkeeping set matches the node's owner type.
        final long eowner = node.stat.getEphemeralOwner();
        final EphemeralType ephemeralType = EphemeralType.get(eowner);
        if (ephemeralType == EphemeralType.CONTAINER) {
            this.containers.remove(path);
        } else if (ephemeralType == EphemeralType.TTL) {
            this.ttls.remove(path);
        } else if (eowner != 0L) {
            final HashSet<String> nodes = (HashSet<String>) this.ephemerals.get(eowner);
            if (nodes != null) {
                synchronized (nodes) {
                    nodes.remove(path);
                }
            }
        }
    }

    // Deleting a "zookeeper_limits" node under /zookeeper also removes its path from pTrie.
    // NOTE(review): substring() assumes parentName is at least as long as "/zookeeper/quota".
    if (parentName.startsWith("/zookeeper") && "zookeeper_limits".equals(childName)) {
        this.pTrie.deletePath(parentName.substring("/zookeeper/quota".length()));
    }

    final String lastPrefix = this.getMaxPrefixWithQuota(path);
    if (lastPrefix != null) {
        this.updateCount(lastPrefix, -1);
        // Single int declaration: the listing previously had "int bytes = false; int bytes;",
        // which is both a type error and a duplicate declaration.
        final int bytes;
        synchronized (node) {
            bytes = node.data == null ? 0 : -node.data.length;
        }
        this.updateBytes(lastPrefix, (long) bytes);
    }

    if (LOG.isTraceEnabled()) {
        ZooTrace.logTraceMessage(LOG, 64L, "dataWatches.triggerWatch " + path);
        ZooTrace.logTraceMessage(LOG, 64L, "childWatches.triggerWatch " + parentName);
    }

    // Watchers already notified through the data watch are suppressed in the child-watch pass
    // for the same path; the parent then gets a NodeChildrenChanged event.
    final Set<Watcher> processed = this.dataWatches.triggerWatch(path, EventType.NodeDeleted);
    this.childWatches.triggerWatch(path, EventType.NodeDeleted, processed);
    this.childWatches.triggerWatch("".equals(parentName) ? "/" : parentName, EventType.NodeChildrenChanged);
}
总结
客户端和服务端各自处理watch事件,并将所需信息分别保存在两端,这样一来,能够减少彼此之间的通信内容和频率,大大提升了服务的处理性能。