time 33 hashアルゴリズム
6265 ワード
php,apache,perl,bsddbはtime 33ハッシュを用いる.
最も単純なバージョン
uint32_t time33(char const *str, int len) { unsigned long hash = 0; for (int i = 0; i < len; i++) { hash = hash *33 + (unsigned long) str[i]; } return hash; }
このバージョンはtime 33のアルゴリズムの考え方を最も体現することができて、とても簡単です.
乗算操作をビット操作に置き換える
unsigned long time33(char const *str, int len)
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = ((hash <<5) + hash) + (unsigned long) str[i];
}
return hash;
}
59文字1000万回実行(gccは最適化をオンにしていません.最適化をオンにした後、2つの関数の実際のコードが同じになるためです)
1つ目:
real 0m4.389s user 0m4.388s sys 0m0.000s
2つ目:
real 0m4.137s user 0m4.120s sys 0m0.000s
gcc-O 2最適化後
real 0m1.367s user 0m1.360s sys 0m0.000s
phpバージョン
inline unsigned time33(char const*str, int len) { unsigned long hash = 5381; /* variant with the hash unrolled eight times */ for (; len >= 8; len -= 8) { hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; } switch (len) { case 7: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 6: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 5: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 4: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 3: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 2: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 1: hash = ((hash << 5) + hash) + *str++; break; case 0: break; } return hash; }
59文字、1000万回
real 0m1.088s user 0m1.068s sys 0m0.000s
速度アップは主にループ展開であり,短い文字ではこれは明らかではない.
phpバージョンのhash初期値は5381です.
Magic Constant 5381:
1. odd number
2. prime number
3. deficient number
4. 001/010/100/000/101 b
Apacheバージョン
unsigned long time33(char const *str, int *len) { unsigned long hash = 0;
const char *p=str; if (*len<=0) { for(p = str; *p; p++) { hash = hash * 33 + *p; } *len = p - str; } else { int i = *len; for (p = str;i; i--, p++) { hash = hash * 33 + *p; } } return hash; }
テスト結果
real 0m1.418s user 0m1.412s sys 0m0.004s
以上、私の改良バージョン
#define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0)/phpバージョンunsigned long time 33(char const*str,int len=-1){unsigned long hash=5381;/*variant with the hash unrolled eight times*/char const*p=str;if(unlikely(len<0)){for(;*p;p++){ hash = hash * 33 + *p; } return hash; }
#define TIME33_HASH_MIXED_CH() hash = ((hash<<5)+hash) + *p++ for (; len >= 8; len -= 8) { TIME33_HASH_MIXED_CH();//1 TIME33_HASH_MIXED_CH();//2 TIME33_HASH_MIXED_CH();//3 TIME33_HASH_MIXED_CH();//4 TIME33_HASH_MIXED_CH();//5 TIME33_HASH_MIXED_CH();//6 TIME33_HASH_MIXED_CH();//7 TIME33_HASH_MIXED_CH();//8 } switch (len) { case 7: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 6: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 5: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 4: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 3: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 2: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 1: TIME33_HASH_MIXED_CH(); break; case 0: break; } return hash; }
#undef TIME33_HASH_MIXED_CH
テスト結果
real 0m1.072s user 0m1.064s sys 0m0.000s
テストしましたが、繰返し率は1/2000です.
なぜ33の倍数なのか、PHPでは
DJBX33A (Daniel J. Bernstein, Times 33 with Addition) This is Daniel J. Bernstein's popular `times 33' hash function as posted by him years ago on comp.lang.c. It basically uses a function like ``hash(i) = hash(i-1) * 33 + str[i]''. This is one of the best known hash functions for strings. Because it is both computed very fast and distributes very well. The magic of number 33, i.e. why it works better than many other constants, prime or not, has never been adequately explained by anyone. So I try an explanation: if one experimentally tests all multipliers between 1 and 256 (as RSE did now) one detects that even numbers are not useable at all. The remaining 128 odd numbers (except for the number 1) work more or less all equally well. They all distribute in an acceptable way and this way fill a hash table with an average percent of approx. 86%. If one compares the Chi^2 values of the variants, the number 33 not even has the best value. But the number 33 and a few other equally good numbers like 17, 31, 63, 127 and 129 have nevertheless a great advantage to the remaining numbers in the large set of possible multipliers: their multiply operation can be replaced by a faster operation based on just one shift plus either a single addition or subtraction operation. And because a hash function has to both distribute good _and_ has to be very fast to compute, those few numbers should be preferred and seems to be the reason why Daniel J. Bernstein also preferred it. -- Ralf S. Engelschall [email protected]
その他の倍数
Ngixはtime 31を使用しています
Tokyo Cabinetはtime 37を使用しています
Bobは彼の文章で、小文字の英語の語彙は33に適しており、大文字と小文字が混ざって65を使っていると述べた.time 33が似合うのは英語の語彙のhashです.
最も単純なバージョン
uint32_t time33(char const *str, int len) { unsigned long hash = 0; for (int i = 0; i < len; i++) { hash = hash *33 + (unsigned long) str[i]; } return hash; }
このバージョンはtime 33のアルゴリズムの考え方を最も体現することができて、とても簡単です.
乗算操作をビット操作に置き換える
unsigned long time33(char const *str, int len)
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = ((hash <<5) + hash) + (unsigned long) str[i];
}
return hash;
}
59文字1000万回実行(gccは最適化をオンにしていません.最適化をオンにした後、2つの関数の実際のコードが同じになるためです)
1つ目:
real 0m4.389s user 0m4.388s sys 0m0.000s
2つ目:
real 0m4.137s user 0m4.120s sys 0m0.000s
gcc-O 2最適化後
real 0m1.367s user 0m1.360s sys 0m0.000s
phpバージョン
inline unsigned time33(char const*str, int len) { unsigned long hash = 5381; /* variant with the hash unrolled eight times */ for (; len >= 8; len -= 8) { hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; hash = ((hash << 5) + hash) + *str++; } switch (len) { case 7: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 6: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 5: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 4: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 3: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 2: hash = ((hash << 5) + hash) + *str++;/* fallthrough... */ case 1: hash = ((hash << 5) + hash) + *str++; break; case 0: break; } return hash; }
59文字、1000万回
real 0m1.088s user 0m1.068s sys 0m0.000s
速度アップは主にループ展開であり,短い文字ではこれは明らかではない.
phpバージョンのhash初期値は5381です.
Magic Constant 5381:
1. odd number
2. prime number
3. deficient number
4. 001/010/100/000/101 b
Apacheバージョン
unsigned long time33(char const *str, int *len) { unsigned long hash = 0;
const char *p=str; if (*len<=0) { for(p = str; *p; p++) { hash = hash * 33 + *p; } *len = p - str; } else { int i = *len; for (p = str;i; i--, p++) { hash = hash * 33 + *p; } } return hash; }
テスト結果
real 0m1.418s user 0m1.412s sys 0m0.004s
以上、私の改良バージョン
#define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0)/phpバージョンunsigned long time 33(char const*str,int len=-1){unsigned long hash=5381;/*variant with the hash unrolled eight times*/char const*p=str;if(unlikely(len<0)){for(;*p;p++){ hash = hash * 33 + *p; } return hash; }
#define TIME33_HASH_MIXED_CH() hash = ((hash<<5)+hash) + *p++ for (; len >= 8; len -= 8) { TIME33_HASH_MIXED_CH();//1 TIME33_HASH_MIXED_CH();//2 TIME33_HASH_MIXED_CH();//3 TIME33_HASH_MIXED_CH();//4 TIME33_HASH_MIXED_CH();//5 TIME33_HASH_MIXED_CH();//6 TIME33_HASH_MIXED_CH();//7 TIME33_HASH_MIXED_CH();//8 } switch (len) { case 7: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 6: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 5: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 4: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 3: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 2: TIME33_HASH_MIXED_CH();/* fallthrough... */ case 1: TIME33_HASH_MIXED_CH(); break; case 0: break; } return hash; }
#undef TIME33_HASH_MIXED_CH
テスト結果
real 0m1.072s user 0m1.064s sys 0m0.000s
テストしましたが、繰返し率は1/2000です.
なぜ33の倍数なのか、PHPでは
DJBX33A (Daniel J. Bernstein, Times 33 with Addition) This is Daniel J. Bernstein's popular `times 33' hash function as posted by him years ago on comp.lang.c. It basically uses a function like ``hash(i) = hash(i-1) * 33 + str[i]''. This is one of the best known hash functions for strings. Because it is both computed very fast and distributes very well. The magic of number 33, i.e. why it works better than many other constants, prime or not, has never been adequately explained by anyone. So I try an explanation: if one experimentally tests all multipliers between 1 and 256 (as RSE did now) one detects that even numbers are not useable at all. The remaining 128 odd numbers (except for the number 1) work more or less all equally well. They all distribute in an acceptable way and this way fill a hash table with an average percent of approx. 86%. If one compares the Chi^2 values of the variants, the number 33 not even has the best value. But the number 33 and a few other equally good numbers like 17, 31, 63, 127 and 129 have nevertheless a great advantage to the remaining numbers in the large set of possible multipliers: their multiply operation can be replaced by a faster operation based on just one shift plus either a single addition or subtraction operation. And because a hash function has to both distribute good _and_ has to be very fast to compute, those few numbers should be preferred and seems to be the reason why Daniel J. Bernstein also preferred it. -- Ralf S. Engelschall [email protected]
その他の倍数
Ngixはtime 31を使用しています
Tokyo Cabinetはtime 37を使用しています
Bobは彼の文章で、小文字の英語の語彙は33に適しており、大文字と小文字が混ざって65を使っていると述べた.time 33が似合うのは英語の語彙のhashです.