ブロック方式


異なるブロック方式の処理
コンテンツベースブロック
CDC(content-defined chunking)アルゴリズムは、Rabin指紋などのデータ指紋を適用してファイルを長さの異なるブロックポリシーに分割する変長ブロックアルゴリズムである.定長ブロックアルゴリズムとは異なり、ファイル内容に基づいてブロック分割されるため、ブロックサイズは変化可能である.アルゴリズムの実行中、CDCは、48バイトなどの固定サイズのスライドウィンドウを使用してファイルデータに対してデータフィンガープリントを計算する.指紋がある条件を満たし、その値モデルが定めた整数が予め設定された数に等しい場合、ウィンドウ位置をブロックの境界とする.CDCアルゴリズムは,指紋条件が満たされず,ブロック境界が決定されず,データブロックが大きすぎるという病態現象を生じる可能性がある.実装では,データブロックのサイズを限定し,上下限を設定してこの問題を解決することができる.CDCアルゴリズムはファイル内容の変化に敏感ではなく,データの挿入や削除は検出の少ないデータブロックにのみ影響し,残りのデータブロックは影響を受けない.CDCアルゴリズムにも欠陥があり,データブロックサイズの決定が困難であり,粒度が細すぎるとオーバーヘッドが大きく,粒子が粗すぎるとdedup効果がよくない.どのように両者の間で折衷を比較するかは、難しい点です.DeduputilにおけるCDCブロックアルゴリズムコードは以下の通りである. 
リファレンスhttp://blog.csdn.net/liuaigui/article/details/5829083

/* * content-defined chunking:        * 1. BLOCK_MIN_SIZE <= block_size <= BLOCK_MAX_SIZE * 2. hash(block) % d == r */


static int file_chunk_cdc(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num,
    block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    char buf[BUF_MAX_SIZE] = {0};
    char buf_bz[BUF_MAX_SIZE] = {0};
    char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char win_buf[BLOCK_WIN_SIZE + 1] = {0};
    char md5_str[33] = {0};
    char adler_pre_char;
    unsigned char md5_checksum[32 + 1] = {0};
    unsigned int bpos = 0;
    unsigned int rwsize = 0, bzsize = 0;
    unsigned int exp_rwsize = BUF_MAX_SIZE;
    unsigned int head, tail;
    unsigned int block_sz = 0, old_block_sz = 0;
    unsigned int hkey = 0;
    int ret = 0;

    while(rwsize = read(fd, buf + bpos, exp_rwsize))
    {
        /* last chunk @@@@@@*/
        if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE)
            break;

        head = 0;
        tail = bpos + rwsize;
        /* avoid unnecessary computation and comparsion */
        if (block_sz < (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE))

            old_block_sz = block_sz;
            block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? 
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : block_sz + tail -head;  
            memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
            head += (block_sz - old_block_sz);
        }

        while ((head + BLOCK_WIN_SIZE) <= tail)
        {
            memcpy(win_buf, buf + head, BLOCK_WIN_SIZE);
            /* * Firstly, i think rabinhash is the best. However, it's performance is very bad. * After some testing, i found ELF_hash is better both on performance and dedup rate. * So, EFL_hash is default. Now, adler_hash as default. */
            if (g_rolling_hash)
            {
                hkey = (block_sz == (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? adler32_checksum(win_buf, BLOCK_WIN_SIZE) :
                    adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char, buf[head+BLOCK_WIN_SIZE-1]);
            }
            else 
                hkey = g_cdc_chunk_hashfunc(win_buf);

            /* get a normal chunk */
            if ((hkey % g_block_size) == CHUNK_CDC_R)
            {
                memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE);
                head += BLOCK_WIN_SIZE;
                block_sz += BLOCK_WIN_SIZE;
                if (block_sz >= BLOCK_MIN_SIZE)
                {
                    /* compress block is -z flag is given */
                    if (g_bz) 
                    {
                        bzsize = BUF_MAX_SIZE;
                        if (Z_OK != zlib_compress_block(block_buf, block_sz, buf_bz, &bzsize))
                        {
                            ret = -1;
                            goto _FILE_CHUNK_CDC_EXIT;
                        }
                        memcpy(block_buf, buf_bz, bzsize);
                        block_sz = bzsize;
                    }

                    md5(block_buf, block_sz, md5_checksum);
                    md5_2_str(md5_checksum);
                    if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz, 
                        md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
                    {
                        perror("dedup_reggile_block_process in file_chunk_cdc");
                        goto _FILE_CHUNK_CDC_EXIT;
                    }
                    block_sz = 0;
                }
            }
            else 
            {
                block_buf[block_sz++] = buf[head++];
                /* get an abnormal chunk */
                if (block_sz >= BLOCK_MAX_SIZE)
                {
                    /* compress block if -z flag is given */
                    if (g_bz)
                    {
                        bzsize = BUF_MAX_SIZE;
                        if (Z_OK != zlib_compress_block(block_buf, block_sz, buf_bz, &bzsize))
                        {
                            ret = -1;
                            goto _FILE_CHUNK_CDC_EXIT;
                        }
                        memcpy(block_buf, buf_bz, bzsize);
                        block_sz = bzsize;
                    }

                    md5(block_buf, block_sz, md5_checksum);
                    md5_2_str(md5_checksum);
                    if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz, 
                        md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
                    {
                        perror("dedup_reggile_block_process in file_chunk_cdc");
                        goto _FILE_CHUNK_CDC_EXIT;
                    }
                    block_sz = 0;
                }
            }

            /* avoid unnecessary computation and comparsion */
            if (block_sz == 0)
            {

                block_sz = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? 
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : tail - head;
                memcpy(block_buf, buf + head, block_sz);
                head = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? 
                    head + (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE) : tail;
            }

            adler_pre_char = buf[head -1];
        }

        /* read expected data from file to full up buf */
        bpos = tail - head;
        exp_rwsize = BUF_MAX_SIZE - bpos;
        adler_pre_char = buf[head -1];
        memmove(buf, buf + head, bpos);
    }
    /* last chunk */
    *last_block_len = ((rwsize + bpos + block_sz) >= 0) ? rwsize + bpos + block_sz : 0;
    if (*last_block_len > 0)
    {
        memcpy(last_block_buf, block_buf, block_sz);
        memcpy(last_block_buf + block_sz, buf, rwsize + bpos);
    }

_FILE_CHUNK_CDC_EXIT:
    return ret;
}

スライドブロック

/* * slideing block chunking, performance is a big issue due to too many hash lookup. * *    ,        ,         。 */
static int file_chunk_sb(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num,
         block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    char buf[BUF_MAX_SIZE] = {0};
    char buf_bz[BUF_MAX_SIZE] = {0};
    char win_buf[BLOCK_MAX_SIZE * 2] = {0};
    char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char adler_pre_char;
    unsigned char md5_checksum[32 + 1] = {0};
    unsigned char md5_checksum1[32 + 1] = {0};
    unsigned char crc_checksum[16] = {0};
    unsigned int bpos = 0;
    unsigned int slide_sz = 0;
    unsigned int rwsize = 0, bzsize = 0, bzsize_f = 0;
    unsigned int exp_rwsize = BUF_MAX_SIZE;
    unsigned int head, tail;
    unsigned int hkey = 0;
    unsigned int bflag = 0;
    int ret = 0;

    while(rwsize = read(fd, buf + bpos, exp_rwsize))
    {
        /* last chunk */
        if ((rwsize + bpos + slide_sz) < g_block_size)
            break;

        head = 0;
        tail = bpos + rwsize;
        while ((head + g_block_size) <= tail)
        {
            memcpy(win_buf, buf + head, g_block_size);
            hkey = (slide_sz == 0) ? adler32_checksum(win_buf, g_block_size) : 
                adler32_rolling_checksum(hkey, g_block_size, adler_pre_char, buf[head+g_block_size-1]);
            uint_2_str(hkey, crc_checksum);

            /* bflag: 0, both CRC and MD5 are not idenitical 1, both CRC and MD5 are identical 2, CRC is identical and MD5 is not */
            bflag = 0;

            /* this block maybe is duplicate */
            bzsize = g_block_size;
            if (hash_exist(g_sb_htable_crc, crc_checksum))
            {   
                bflag = 2;
                /* compress block if -z flag is given */
                if (g_bz) 
                {
                    bzsize = BUF_MAX_SIZE;
                    if (Z_OK != zlib_compress_block(win_buf, g_block_size, buf_bz, &bzsize))
                    {
                        ret = -1;
                        goto _FILE_CHUNK_SB_EXIT;
                    }
                    memcpy(win_buf, buf_bz, bzsize);
                }

                md5(win_buf, bzsize, md5_checksum);
                md5_2_str(md5_checksum);
                if (hash_exist(htable, md5_checksum))
                {
                    /* insert fragment */
                    if (slide_sz != 0)
                    {
                        /* compress block if -z flag is given */
                        if (g_bz)
                        {
                            bzsize_f = BUF_MAX_SIZE;
                            if (Z_OK != zlib_compress_block(block_buf, slide_sz, buf_bz, &bzsize_f))
                            {
                                ret = -1;
                                goto _FILE_CHUNK_SB_EXIT;
                            }
                            memcpy(block_buf, buf_bz, bzsize_f);
                            slide_sz = bzsize_f;
                        }

                        md5(block_buf, slide_sz, md5_checksum1);
                        md5_2_str(md5_checksum1);
                        if (0 != (ret = dedup_regfile_block_process(block_buf, slide_sz, md5_checksum1, 
                            fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
                        {
                            perror("dedup_regfile_block_process in file_chunk_sb");
                            goto _FILE_CHUNK_SB_EXIT;
                        }
                    }

                    /* insert fixed-size block */
                    if (0 != (ret = dedup_regfile_block_process(win_buf, bzsize, md5_checksum, 
                        fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
                    {
                        perror("dedup_regfile_block_process in file_chunk_sb");
                        goto _FILE_CHUNK_SB_EXIT;
                    }

                    head += g_block_size;
                    slide_sz = 0;
                    bflag = 1;
                }
            }

            /* this block is not duplicate */
            if (bflag != 1)
            {
                block_buf[slide_sz++] = buf[head++];
                if (slide_sz == g_block_size)
                {
                    bzsize = g_block_size;

                    /* calculate checksum and check in */
                    hkey = adler32_checksum(block_buf, bzsize);
                    uint_2_str(hkey, crc_checksum);
                    hash_checkin(g_sb_htable_crc, crc_checksum);

                    /* compress block if -z flag is given */
                    if (g_bz)
                    {
                        bzsize = BUF_MAX_SIZE;
                        if (Z_OK != zlib_compress_block(block_buf, g_block_size, buf_bz, &bzsize))
                        {
                            ret = -1;
                            goto _FILE_CHUNK_SB_EXIT;
                        }
                        memcpy(block_buf, buf_bz, bzsize);
                    }
                    md5(block_buf, bzsize, md5_checksum);
                    md5_2_str(md5_checksum);

                    if (0 != (ret = dedup_regfile_block_process(block_buf, bzsize, md5_checksum, 
                        fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
                    {
                        perror("dedup_regfile_block_process in file_chunk_sb");
                        goto _FILE_CHUNK_SB_EXIT;
                    }

                    slide_sz = 0;
                }
            }

            adler_pre_char = buf[head - 1];
        }

        /* read expected data from file to full up buf */
        bpos = tail - head;
        exp_rwsize = BUF_MAX_SIZE - bpos;
        adler_pre_char = buf[head - 1];
        memmove(buf, buf + head, bpos);
    }
    /* last chunk */
    *last_block_len = ((rwsize + bpos + slide_sz) > 0) ? rwsize + bpos + slide_sz : 0;
    if (*last_block_len > 0)
    {
        memcpy(last_block_buf, block_buf, slide_sz);
        memcpy(last_block_buf + slide_sz, buf, rwsize + bpos);
    }

_FILE_CHUNK_SB_EXIT:
    lseek(fd, 0, SEEK_SET);
    return ret;
}

こていぶんかつ
/* * fixed-sized file chunking *     */
static int file_chunk_fsp(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num, 
    block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    int ret = 0;
    unsigned int rwsize, bzsize;
    unsigned char md5_checksum[32 + 1] = {0};
    char *buf = NULL, *buf_bz = NULL;//buf_bz       

    buf = (char *)malloc(g_block_size * 2);//4K*2
    buf_bz = (char *)malloc(g_block_size * 2);//4K*2
    if (buf == NULL || buf_bz == NULL)
    {
        perror("malloc in file_chunk_fsp");
        return errno;
    }

    while (rwsize = read(fd, buf, g_block_size))//
    {
         /*if the last block           */
        if (rwsize != g_block_size)
            break;

        /* compress block if -z flag is given */
        /* if (g_bz) { bzsize = g_block_size * 2; if (Z_OK != zlib_compress_block(buf, rwsize, buf_bz, &bzsize)) { ret = -1; goto _FILE_CHUNK_FSP_EXIT; } memcpy(buf, buf_bz, bzsize); rwsize = bzsize; } */
        /* calculate md5 */
        md5(buf, rwsize, md5_checksum);
        md5_2_str(md5_checksum);
        if (0 != (ret = dedup_regfile_block_process(buf, rwsize, md5_checksum, fd_ldata, 
            fd_bdata, pos, block_num, metadata, htable)))
        {
            perror("dedup_regfile_block_process in file_chunk_fsp");
            goto _FILE_CHUNK_FSP_EXIT;
        }
    }
    *last_block_len = (rwsize > 0) ? rwsize : 0;
    if ((*last_block_len)) memcpy(last_block_buf, buf, *last_block_len);

_FILE_CHUNK_FSP_EXIT:
    if (buf) free(buf);
    return ret;
}