Redis watchメカニズムの解析

21383 ワード

Redis watchメカニズムの解析
我々はredisのwatchとmultiを用いていくつかの同時操作を処理し,redisのwatch+multiは実際には楽観的なロックであり,今日はその実現メカニズムを分析する.
共通のコードセグメント
$key = 'xxxx';
$redis->watch($key);
$redis->multi();
//    key
$redis->set($key);
$flag = $redis->exec();

//           false
if ($flag === false) {
    
} else {
    
}

フローチャート
クライアントAとクライアントBが同時にこのコードを実行する場合、トランザクションの実行がシリアルであるため、AクライアントがBより先に実行されると仮定すると、A実行が完了すると、クライアントAはwatchがこのkeyを持つリストから削除され、リスト内のすべてのクライアントがCLIENT_に設定されるDIRTY_CAS、その後Bが実行されると、トランザクションはBの状態がCLIENT_であることを発見するDIRTY_CASは、トランザクションを終了し、失敗を返します.
きおく
  • redisハッシュテーブル+チェーンテーブルでwatchキーを格納したクライアント:
  • ハッシュテーブルkeyはredisのkeyであり、ハッシュテーブルのvalueはクライアントからなるチェーンテーブルである
    位置決めkeyの複雑度O(1)、クライアントの検索と処理の複雑度O(n)
    Key1 => (client1->client2->client3...)
    Key2 => (client1->client2->client3...)
  • 各クライアントも、watch済みkey
  • を格納するためのチェーンテーブルを維持する.
    関連ソース
    関連ファイル
    multi.h
    multi.c
    db.c
    t_string.c
    watch
    /* watch   */
    void watchCommand(client *c) {
        int j;
    
        if (c->flags & CLIENT_MULTI) {
            addReplyError(c,"WATCH inside MULTI is not allowed");
            return;
        }
        for (j = 1; j < c->argc; j++)
            watchForKey(c,c->argv[j]);
        addReply(c,shared.ok);
    }
    
    typedef struct watchedKey {
        robj *key;
        redisDb *db;
    } watchedKey;
    
    /* watch  key */
    void watchForKey(client *c, robj *key) {
        list *clients = NULL;
        listIter li;
        listNode *ln;
        watchedKey *wk;
    
        /*   key    watch     watch      */
        //        
        listRewind(c->watched_keys,&li);
        //        watch key
        while((ln = listNext(&li))) {
            wk = listNodeValue(ln);
            //         key,    
            if (wk->db == c->db && equalStringObjects(key,wk->key))
                return; /* Key already watched */
        }
        /*    watch,       */
        //   hash    key      
        clients = dictFetchValue(c->db->watched_keys,key);
        //      ,           
        if (!clients) {
            clients = listCreate();
            dictAdd(c->db->watched_keys,key,clients);
            incrRefCount(key);
        }
        //             
        listAddNodeTail(clients,c);
        /*        watch_keys    */
        wk = zmalloc(sizeof(*wk));
        wk->key = key;
        wk->db = c->db;
        incrRefCount(key);
        listAddNodeTail(c->watched_keys,wk);
    }
    

    multi
    /* multi    */
    void multiCommand(client *c) {
        //              ,      multi    
        if (c->flags & CLIENT_MULTI) {
            addReplyError(c,"MULTI calls can not be nested");
            return;
        }
        //          CLIENT_MULTI
        c->flags |= CLIENT_MULTI;
        addReply(c,shared.ok);
    }
    
    /*                   */
    void initClientMultiState(client *c) {
        c->mstate.commands = NULL;
        c->mstate.count = 0;
    }
    

    exec
    /* exec    */
    void execCommand(client *c) {
        int j;
        robj **orig_argv;
        int orig_argc;
        struct redisCommand *orig_cmd;
        int must_propagate = 0; /* Need to propagate MULTI/EXEC to AOF / slaves? */
        int was_master = server.masterhost == NULL;
    	
        //    multi,   
        if (!(c->flags & CLIENT_MULTI)) {
            addReplyError(c,"EXEC without MULTI");
            return;
        }
    	
        /*
         *   
         *                      ,            
         * 1. CLIENT_DIRTY_CAS =>    watch key touch 
         * 2. CLIENT_DIRTY_EXEC =>              
         */
        
        /* Check if we need to abort the EXEC because:
         * 1) Some WATCHed key was touched.
         * 2) There was a previous error while queueing commands.
         * A failed EXEC in the first case returns a multi bulk nil object
         * (technically it is not an error but a special behavior), while
         * in the second an EXECABORT error is returned. */
        if (c->flags & (CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC)) {
            addReply(c, c->flags & CLIENT_DIRTY_EXEC ? shared.execaborterr :
                                                      shared.nullmultibulk);
            // 
            discardTransaction(c);
            goto handle_monitor;
        }
    
        /*          */
        //            watch  key, hash     node
        unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
        orig_argv = c->argv;
        orig_argc = c->argc;
        orig_cmd = c->cmd;
        addReplyMultiBulkLen(c,c->mstate.count);
        //         
        for (j = 0; j < c->mstate.count; j++) {
            c->argc = c->mstate.commands[j].argc;
            c->argv = c->mstate.commands[j].argv;
            c->cmd = c->mstate.commands[j].cmd;
    
            /* Propagate a MULTI request once we encounter the first command which
             * is not readonly nor an administrative one.
             * This way we'll deliver the MULTI/..../EXEC block as a whole and
             * both the AOF and the replication link will have the same consistency
             * and atomicity guarantees. */
            if (!must_propagate && !(c->cmd->flags & (CMD_READONLY|CMD_ADMIN))) {
                execCommandPropagateMulti(c);
                must_propagate = 1;
            }
    		//    call     
            //              ,        ,   hash  watch key         CLIENT_DIRTY_CAS
            call(c,CMD_CALL_FULL);
    
            /* Commands may alter argc/argv, restore mstate. */
            c->mstate.commands[j].argc = c->argc;
            c->mstate.commands[j].argv = c->argv;
            c->mstate.commands[j].cmd = c->cmd;
        }
        c->argv = orig_argv;
        c->argc = orig_argc;
        c->cmd = orig_cmd;
        discardTransaction(c);
    
        /* Make sure the EXEC command will be propagated as well if MULTI
         * was already propagated. */
        if (must_propagate) {
            int is_master = server.masterhost == NULL;
            server.dirty++;
            /* If inside the MULTI/EXEC block this instance was suddenly
             * switched from master to slave (using the SLAVEOF command), the
             * initial MULTI was propagated into the replication backlog, but the
             * rest was not. We need to make sure to at least terminate the
             * backlog with the final EXEC. */
            if (server.repl_backlog && was_master && !is_master) {
                char *execcmd = "*1\r
    $4\r
    EXEC\r
    "
    ; feedReplicationBacklog(execcmd,strlen(execcmd)); } } handle_monitor: /* Send EXEC to clients waiting data from MONITOR. We do it here * since the natural order of commands execution is actually: * MUTLI, EXEC, ... commands inside transaction ... * Instead EXEC is flagged as CMD_SKIP_MONITOR in the command * table, and we do it here with correct ordering. */ if (listLength(server.monitors) && !server.loading) replicationFeedMonitors(c,server.monitors,c->db->id,c->argv,c->argc); } /* */ void discardTransaction(client *c) { freeClientMultiState(c); initClientMultiState(c); c->flags &= ~(CLIENT_MULTI|CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC); unwatchAllKeys(c); } /* Unwatch all the keys watched by this client. To clean the EXEC dirty * flag is up to the caller. */ void unwatchAllKeys(client *c) { listIter li; listNode *ln; if (listLength(c->watched_keys) == 0) return; listRewind(c->watched_keys,&li); while((ln = listNext(&li))) { list *clients; watchedKey *wk; /* Lookup the watched key -> clients list and remove the client * from the list */ wk = listNodeValue(ln); clients = dictFetchValue(wk->db->watched_keys, wk->key); serverAssertWithInfo(c,NULL,clients != NULL); listDelNode(clients,listSearchKey(clients,c)); /* Kill the entry at all if this was the only client */ if (listLength(clients) == 0) dictDelete(wk->db->watched_keys, wk->key); /* Remove this watched key from the client->watched list */ listDelNode(c->watched_keys,ln); decrRefCount(wk->key); zfree(wk); } }

    set,hsetはsetコマンドで?
    void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire, int unit, robj *ok_reply, robj *abort_reply) {
        long long milliseconds = 0; /* initialized to avoid any harmness warning */
    
        if (expire) {
            if (getLongLongFromObjectOrReply(c, expire, &milliseconds, NULL) != C_OK)
                return;
            if (milliseconds <= 0) {
                addReplyErrorFormat(c,"invalid expire time in %s",c->cmd->name);
                return;
            }
            if (unit == UNIT_SECONDS) milliseconds *= 1000;
        }
    
        if ((flags & OBJ_SET_NX && lookupKeyWrite(c->db,key) != NULL) ||
            (flags & OBJ_SET_XX && lookupKeyWrite(c->db,key) == NULL))
        {
            addReply(c, abort_reply ? abort_reply : shared.nullbulk);
            return;
        }
        //    ?   string  
        setKey(c->db,key,val);
        server.dirty++;
        if (expire) setExpire(c,c->db,key,mstime()+milliseconds);
        notifyKeyspaceEvent(NOTIFY_STRING,"set",key,c->db->id);
        if (expire) notifyKeyspaceEvent(NOTIFY_GENERIC,
            "expire",key,c->db->id);
        addReply(c, ok_reply ? ok_reply : shared.ok);
    }
    
    /* SET key value [NX] [XX] [EX ] [PX ] */
    void setCommand(client *c) {
        int j;
        robj *expire = NULL;
        int unit = UNIT_SECONDS;
        int flags = OBJ_SET_NO_FLAGS;
    
        for (j = 3; j < c->argc; j++) {
            char *a = c->argv[j]->ptr;
            robj *next = (j == c->argc-1) ? NULL : c->argv[j+1];
    
            if ((a[0] == 'n' || a[0] == 'N') &&
                (a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
                !(flags & OBJ_SET_XX))
            {
                flags |= OBJ_SET_NX;
            } else if ((a[0] == 'x' || a[0] == 'X') &&
                       (a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
                       !(flags & OBJ_SET_NX))
            {
                flags |= OBJ_SET_XX;
            } else if ((a[0] == 'e' || a[0] == 'E') &&
                       (a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
                       !(flags & OBJ_SET_PX) && next)
            {
                flags |= OBJ_SET_EX;
                unit = UNIT_SECONDS;
                expire = next;
                j++;
            } else if ((a[0] == 'p' || a[0] == 'P') &&
                       (a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
                       !(flags & OBJ_SET_EX) && next)
            {
                flags |= OBJ_SET_PX;
                unit = UNIT_MILLISECONDS;
                expire = next;
                j++;
            } else {
                addReply(c,shared.syntaxerr);
                return;
            }
        }
    
        c->argv[2] = tryObjectEncoding(c->argv[2]);
        setGenericCommand(c,flags,c->argv[1],c->argv[2],expire,unit,NULL,NULL);
    }
    
    /* High level Set operation. This function can be used in order to set
     * a key, whatever it was existing or not, to a new object.
     *
     * 1) The ref count of the value object is incremented.
     * 2) clients WATCHing for the destination key notified.
     * 3) The expire time of the key is reset (the key is made persistent).
     *
     * All the new keys in the database should be craeted via this interface. */
    void setKey(redisDb *db, robj *key, robj *val) {
        if (lookupKeyWrite(db,key) == NULL) {
            dbAdd(db,key,val);
        } else {
            dbOverwrite(db,key,val);
        }
        incrRefCount(val);
        removeExpire(db,key);
        //    ?   hash      watch  key         CLIENT_DIRTY_CAS
        //         1,  set 1,        。           。
        signalModifiedKey(db,key);
    }
    
    void signalModifiedKey(redisDb *db, robj *key) {
        touchWatchedKey(db,key);
    }
    
    /*   hash           CLIENT_DIRTY_CAS */
    void touchWatchedKey(redisDb *db, robj *key) {
        list *clients;
        listIter li;
        listNode *ln;
    
        if (dictSize(db->watched_keys) == 0) return;
        clients = dictFetchValue(db->watched_keys, key);
        if (!clients) return;
    
        /* Mark all the clients watching this key as CLIENT_DIRTY_CAS */
        /* Check if we are already watching for this key */
        listRewind(clients,&li);
        while((ln = listNext(&li))) {
            client *c = listNodeValue(ln);
    
            c->flags |= CLIENT_DIRTY_CAS;
        }
    }