Redis watchメカニズムの解析
21383 ワード
Redis watchメカニズムの解析
我々はredisのwatchとmultiを用いていくつかの同時操作を処理し,redisのwatch+multiは実際には楽観的なロックであり,今日はその実現メカニズムを分析する.
共通のコードセグメント
フローチャート
クライアントAとクライアントBが同時にこのコードを実行する場合、トランザクションの実行がシリアルであるため、AクライアントがBより先に実行されると仮定すると、A実行が完了すると、クライアントAはwatchがこのkeyを持つリストから削除され、リスト内のすべてのクライアントがCLIENT_に設定されるDIRTY_CAS、その後Bが実行されると、トランザクションはBの状態がCLIENT_であることを発見するDIRTY_CASは、トランザクションを終了し、失敗を返します.
きおく redisハッシュテーブル+チェーンテーブルでwatchキーを格納したクライアント: ハッシュテーブルkeyはredisのkeyであり、ハッシュテーブルのvalueはクライアントからなるチェーンテーブルである
位置決めkeyの複雑度O(1)、クライアントの検索と処理の複雑度O(n)
Key1 => (client1->client2->client3...)
Key2 => (client1->client2->client3...)各クライアントも、watch済みkey を格納するためのチェーンテーブルを維持する.
関連ソース
関連ファイル
multi.h
multi.c
db.c
t_string.c
watch
multi
exec
set,hsetはsetコマンドで?
我々はredisのwatchとmultiを用いていくつかの同時操作を処理し,redisのwatch+multiは実際には楽観的なロックであり,今日はその実現メカニズムを分析する.
共通のコードセグメント
$key = 'xxxx';
$redis->watch($key);
$redis->multi();
// key
$redis->set($key);
$flag = $redis->exec();
// false
if ($flag === false) {
} else {
}
フローチャート
クライアントAとクライアントBが同時にこのコードを実行する場合、トランザクションの実行がシリアルであるため、AクライアントがBより先に実行されると仮定すると、A実行が完了すると、クライアントAはwatchがこのkeyを持つリストから削除され、リスト内のすべてのクライアントがCLIENT_に設定されるDIRTY_CAS、その後Bが実行されると、トランザクションはBの状態がCLIENT_であることを発見するDIRTY_CASは、トランザクションを終了し、失敗を返します.
きおく
位置決めkeyの複雑度O(1)、クライアントの検索と処理の複雑度O(n)
Key1 => (client1->client2->client3...)
Key2 => (client1->client2->client3...)
関連ソース
関連ファイル
multi.h
multi.c
db.c
t_string.c
watch
/* watch */
void watchCommand(client *c) {
int j;
if (c->flags & CLIENT_MULTI) {
addReplyError(c,"WATCH inside MULTI is not allowed");
return;
}
for (j = 1; j < c->argc; j++)
watchForKey(c,c->argv[j]);
addReply(c,shared.ok);
}
typedef struct watchedKey {
robj *key;
redisDb *db;
} watchedKey;
/* watch key */
void watchForKey(client *c, robj *key) {
list *clients = NULL;
listIter li;
listNode *ln;
watchedKey *wk;
/* key watch watch */
//
listRewind(c->watched_keys,&li);
// watch key
while((ln = listNext(&li))) {
wk = listNodeValue(ln);
// key,
if (wk->db == c->db && equalStringObjects(key,wk->key))
return; /* Key already watched */
}
/* watch, */
// hash key
clients = dictFetchValue(c->db->watched_keys,key);
// ,
if (!clients) {
clients = listCreate();
dictAdd(c->db->watched_keys,key,clients);
incrRefCount(key);
}
//
listAddNodeTail(clients,c);
/* watch_keys */
wk = zmalloc(sizeof(*wk));
wk->key = key;
wk->db = c->db;
incrRefCount(key);
listAddNodeTail(c->watched_keys,wk);
}
multi
/* multi */
void multiCommand(client *c) {
// , multi
if (c->flags & CLIENT_MULTI) {
addReplyError(c,"MULTI calls can not be nested");
return;
}
// CLIENT_MULTI
c->flags |= CLIENT_MULTI;
addReply(c,shared.ok);
}
/* */
void initClientMultiState(client *c) {
c->mstate.commands = NULL;
c->mstate.count = 0;
}
exec
/* exec */
void execCommand(client *c) {
int j;
robj **orig_argv;
int orig_argc;
struct redisCommand *orig_cmd;
int must_propagate = 0; /* Need to propagate MULTI/EXEC to AOF / slaves? */
int was_master = server.masterhost == NULL;
// multi,
if (!(c->flags & CLIENT_MULTI)) {
addReplyError(c,"EXEC without MULTI");
return;
}
/*
*
* ,
* 1. CLIENT_DIRTY_CAS => watch key touch
* 2. CLIENT_DIRTY_EXEC =>
*/
/* Check if we need to abort the EXEC because:
* 1) Some WATCHed key was touched.
* 2) There was a previous error while queueing commands.
* A failed EXEC in the first case returns a multi bulk nil object
* (technically it is not an error but a special behavior), while
* in the second an EXECABORT error is returned. */
if (c->flags & (CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC)) {
addReply(c, c->flags & CLIENT_DIRTY_EXEC ? shared.execaborterr :
shared.nullmultibulk);
//
discardTransaction(c);
goto handle_monitor;
}
/* */
// watch key, hash node
unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
orig_argv = c->argv;
orig_argc = c->argc;
orig_cmd = c->cmd;
addReplyMultiBulkLen(c,c->mstate.count);
//
for (j = 0; j < c->mstate.count; j++) {
c->argc = c->mstate.commands[j].argc;
c->argv = c->mstate.commands[j].argv;
c->cmd = c->mstate.commands[j].cmd;
/* Propagate a MULTI request once we encounter the first command which
* is not readonly nor an administrative one.
* This way we'll deliver the MULTI/..../EXEC block as a whole and
* both the AOF and the replication link will have the same consistency
* and atomicity guarantees. */
if (!must_propagate && !(c->cmd->flags & (CMD_READONLY|CMD_ADMIN))) {
execCommandPropagateMulti(c);
must_propagate = 1;
}
// call
// , , hash watch key CLIENT_DIRTY_CAS
call(c,CMD_CALL_FULL);
/* Commands may alter argc/argv, restore mstate. */
c->mstate.commands[j].argc = c->argc;
c->mstate.commands[j].argv = c->argv;
c->mstate.commands[j].cmd = c->cmd;
}
c->argv = orig_argv;
c->argc = orig_argc;
c->cmd = orig_cmd;
discardTransaction(c);
/* Make sure the EXEC command will be propagated as well if MULTI
* was already propagated. */
if (must_propagate) {
int is_master = server.masterhost == NULL;
server.dirty++;
/* If inside the MULTI/EXEC block this instance was suddenly
* switched from master to slave (using the SLAVEOF command), the
* initial MULTI was propagated into the replication backlog, but the
* rest was not. We need to make sure to at least terminate the
* backlog with the final EXEC. */
if (server.repl_backlog && was_master && !is_master) {
char *execcmd = "*1\r
$4\r
EXEC\r
";
feedReplicationBacklog(execcmd,strlen(execcmd));
}
}
handle_monitor:
/* Send EXEC to clients waiting data from MONITOR. We do it here
* since the natural order of commands execution is actually:
* MUTLI, EXEC, ... commands inside transaction ...
* Instead EXEC is flagged as CMD_SKIP_MONITOR in the command
* table, and we do it here with correct ordering. */
if (listLength(server.monitors) && !server.loading)
replicationFeedMonitors(c,server.monitors,c->db->id,c->argv,c->argc);
}
/* */
void discardTransaction(client *c) {
freeClientMultiState(c);
initClientMultiState(c);
c->flags &= ~(CLIENT_MULTI|CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC);
unwatchAllKeys(c);
}
/* Unwatch all the keys watched by this client. To clean the EXEC dirty
* flag is up to the caller. */
void unwatchAllKeys(client *c) {
listIter li;
listNode *ln;
if (listLength(c->watched_keys) == 0) return;
listRewind(c->watched_keys,&li);
while((ln = listNext(&li))) {
list *clients;
watchedKey *wk;
/* Lookup the watched key -> clients list and remove the client
* from the list */
wk = listNodeValue(ln);
clients = dictFetchValue(wk->db->watched_keys, wk->key);
serverAssertWithInfo(c,NULL,clients != NULL);
listDelNode(clients,listSearchKey(clients,c));
/* Kill the entry at all if this was the only client */
if (listLength(clients) == 0)
dictDelete(wk->db->watched_keys, wk->key);
/* Remove this watched key from the client->watched list */
listDelNode(c->watched_keys,ln);
decrRefCount(wk->key);
zfree(wk);
}
}
set,hsetはsetコマンドで?
void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire, int unit, robj *ok_reply, robj *abort_reply) {
long long milliseconds = 0; /* initialized to avoid any harmness warning */
if (expire) {
if (getLongLongFromObjectOrReply(c, expire, &milliseconds, NULL) != C_OK)
return;
if (milliseconds <= 0) {
addReplyErrorFormat(c,"invalid expire time in %s",c->cmd->name);
return;
}
if (unit == UNIT_SECONDS) milliseconds *= 1000;
}
if ((flags & OBJ_SET_NX && lookupKeyWrite(c->db,key) != NULL) ||
(flags & OBJ_SET_XX && lookupKeyWrite(c->db,key) == NULL))
{
addReply(c, abort_reply ? abort_reply : shared.nullbulk);
return;
}
// ? string
setKey(c->db,key,val);
server.dirty++;
if (expire) setExpire(c,c->db,key,mstime()+milliseconds);
notifyKeyspaceEvent(NOTIFY_STRING,"set",key,c->db->id);
if (expire) notifyKeyspaceEvent(NOTIFY_GENERIC,
"expire",key,c->db->id);
addReply(c, ok_reply ? ok_reply : shared.ok);
}
/* SET key value [NX] [XX] [EX ] [PX ] */
void setCommand(client *c) {
int j;
robj *expire = NULL;
int unit = UNIT_SECONDS;
int flags = OBJ_SET_NO_FLAGS;
for (j = 3; j < c->argc; j++) {
char *a = c->argv[j]->ptr;
robj *next = (j == c->argc-1) ? NULL : c->argv[j+1];
if ((a[0] == 'n' || a[0] == 'N') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_XX))
{
flags |= OBJ_SET_NX;
} else if ((a[0] == 'x' || a[0] == 'X') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_NX))
{
flags |= OBJ_SET_XX;
} else if ((a[0] == 'e' || a[0] == 'E') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_PX) && next)
{
flags |= OBJ_SET_EX;
unit = UNIT_SECONDS;
expire = next;
j++;
} else if ((a[0] == 'p' || a[0] == 'P') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_EX) && next)
{
flags |= OBJ_SET_PX;
unit = UNIT_MILLISECONDS;
expire = next;
j++;
} else {
addReply(c,shared.syntaxerr);
return;
}
}
c->argv[2] = tryObjectEncoding(c->argv[2]);
setGenericCommand(c,flags,c->argv[1],c->argv[2],expire,unit,NULL,NULL);
}
/* High level Set operation. This function can be used in order to set
* a key, whatever it was existing or not, to a new object.
*
* 1) The ref count of the value object is incremented.
* 2) clients WATCHing for the destination key notified.
* 3) The expire time of the key is reset (the key is made persistent).
*
* All the new keys in the database should be craeted via this interface. */
void setKey(redisDb *db, robj *key, robj *val) {
if (lookupKeyWrite(db,key) == NULL) {
dbAdd(db,key,val);
} else {
dbOverwrite(db,key,val);
}
incrRefCount(val);
removeExpire(db,key);
// ? hash watch key CLIENT_DIRTY_CAS
// 1, set 1, 。 。
signalModifiedKey(db,key);
}
void signalModifiedKey(redisDb *db, robj *key) {
touchWatchedKey(db,key);
}
/* hash CLIENT_DIRTY_CAS */
void touchWatchedKey(redisDb *db, robj *key) {
list *clients;
listIter li;
listNode *ln;
if (dictSize(db->watched_keys) == 0) return;
clients = dictFetchValue(db->watched_keys, key);
if (!clients) return;
/* Mark all the clients watching this key as CLIENT_DIRTY_CAS */
/* Check if we are already watching for this key */
listRewind(clients,&li);
while((ln = listNext(&li))) {
client *c = listNodeValue(ln);
c->flags |= CLIENT_DIRTY_CAS;
}
}