追想録(メモリバリアサンプルコード)
16475 ワード
【声明:著作権は著者に帰属します。転載は歓迎しますが、商業目的での利用はご遠慮ください。連絡先メールアドレス:[email protected]】
メモリバリアは、英語では memory barrier と呼ばれます。メモリバリアが必要になる原因はいくつかあります。コンパイラの最適化による命令の並べ替え、CPU のアウトオブオーダー実行、そして CPU キャッシュが順序どおりに同期されないことです。コンパイラの最適化による並べ替えは理解しやすいのですが、CPU のアウトオブオーダー実行がなぜメモリバリアを必要とするのかを具体的に示すサンプルコードは、ネット上でもあまり見かけません。今日、たまたまネット上で網易(NetEase)の何登成さんが書いた「CPU Cache and Memory Ordering」を見つけ、その中に以下のコードが載っていて、とても参考になりました。ぜひダウンロードして実行し、結果を見て考えてみてください。なお、このコードのコンパイル環境は Linux である点に注意してください。コンパイルコマンドはコード冒頭のコメントにも記載されています。
機会があれば、ぜひネットでこのドキュメントを探して読んでみてください。その中で私が特に同意する観点が1つあります。それは、LOCK プレフィックス付きの命令はすべて、それ自体が memory barrier として働く、という点です。
btw:
以前のバージョンは Linux でしか実行できなかったため、今日は Windows で動作するバージョンをわざわざ用意しました。
メモリバリアは、英語では memory barrier と呼ばれます。メモリバリアが必要になる原因はいくつかあります。コンパイラの最適化による命令の並べ替え、CPU のアウトオブオーダー実行、そして CPU キャッシュが順序どおりに同期されないことです。コンパイラの最適化による並べ替えは理解しやすいのですが、CPU のアウトオブオーダー実行がなぜメモリバリアを必要とするのかを具体的に示すサンプルコードは、ネット上でもあまり見かけません。今日、たまたまネット上で網易(NetEase)の何登成さんが書いた「CPU Cache and Memory Ordering」を見つけ、その中に以下のコードが載っていて、とても参考になりました。ぜひダウンロードして実行し、結果を見て考えてみてください。なお、このコードのコンパイル環境は Linux である点に注意してください。コンパイルコマンドはコード冒頭のコメントにも記載されています。
// compile with: g++ -o ordering -O2 ordering.cpp -lpthread
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
// Experiment switches. Set either of these to 1 to prevent CPU reordering:
//  - USE_CPU_FENCE: the worker threads issue an mfence between their two
//    memory operations instead of a compiler-only barrier.
//  - USE_SINGLE_HW_THREAD: main pins both worker threads to CPU 0.
#define USE_CPU_FENCE 0
#define USE_SINGLE_HW_THREAD 0 // Supported on Linux, but not Cygwin or PS3
#if USE_SINGLE_HW_THREAD
#include <sched.h> // only needed for the affinity code in main
#endif
//-------------------------------------
// MersenneTwister
// Small pseudo-random number generator used only to inject random
// timing delays. Each worker thread constructs its own instance, so
// no generator state is shared between threads.
//-------------------------------------
#define MT_IA 397  // offset of the partner word used by the twist step
#define MT_LEN 624 // number of words of generator state
class MersenneTwister
{
    unsigned int m_buffer[MT_LEN]; // generator state
    int m_index;                   // next word of m_buffer to regenerate

public:
    // Fills the whole state with `seed`, then discards MT_LEN * 100 outputs.
    MersenneTwister(unsigned int seed);

    // Returns the next pseudo-random value.
    // Kept out of line (noinline) so that the call acts as a compiler barrier.
    unsigned int integer() __attribute__((noinline));
};

MersenneTwister::MersenneTwister(unsigned int seed)
    : m_index(0)
{
    // Start from a state made only of the seed...
    for (unsigned int *word = m_buffer; word != m_buffer + MT_LEN; ++word)
        *word = seed;

    // ...then run the generator for a while to shuffle things up.
    for (int remaining = MT_LEN * 100; remaining > 0; --remaining)
        integer();
}

unsigned int MersenneTwister::integer()
{
    // Positions involved in this step, each wrapped around the buffer end.
    const int cur = m_index;
    const int next = (cur + 1 < MT_LEN) ? cur + 1 : 0;
    const int partner = (cur + MT_IA < MT_LEN) ? cur + MT_IA : cur + MT_IA - MT_LEN;

    // Twist: top bit of the current word joined with the low 31 bits of the next.
    const unsigned int topBit = m_buffer[cur] & 0x80000000;
    const unsigned int lowBits = m_buffer[next] & 0x7fffffff;
    const unsigned int mixed = topBit | lowBits;
    unsigned int value = m_buffer[partner] ^ (mixed >> 1);
    if (mixed & 1)
        value ^= 0x9908B0DF;
    m_buffer[cur] = value;
    m_index = next;

    // Swizzle the regenerated word before handing it out.
    value ^= value >> 11;
    value ^= (value << 7) & 0x9d2c5680UL;
    value ^= (value << 15) & 0xefc60000UL;
    value ^= value >> 18;
    return value;
}
//-------------------------------------
// Main program, as described in the post: state shared by main and the workers
//-------------------------------------
// Start signals: main posts each of them once per round to release a worker.
sem_t beginSema1;
sem_t beginSema2;
// Completion signal: every worker posts it once per round; main waits on it twice.
sem_t endSema;
// Memory locations the workers store to / load from; main resets both to 0 each round.
int X;
int Y;
// Values recorded by the workers during a round; main inspects them afterwards.
int r1;
int r2;
// Worker pairs and the reordering each pair probes:
//   thread1Func, thread2Func : StoreLoad
//   thread3Func, thread4Func : StoreStore / LoadLoad
//   thread5Func, thread6Func : LoadStore
// StoreLoad test, first worker (started by main when the argument is 1).
// Each round: waits on beginSema1, spins for a pseudo-random delay, stores
// X = 1, then loads Y into r1, and finally posts endSema.
// main counts a reorder when both r1 and r2 are 0 after a round.
// param is unused; the loop never exits.
void *thread1Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
sem_wait(&beginSema1); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
X = 1; // store
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only; emits no instruction
#endif
r1 = Y; // load
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreLoad test, second worker (started by main when the argument is 1).
// Each round: waits on beginSema2, spins for a pseudo-random delay, stores
// Y = 1, then loads X into r2, and finally posts endSema.
// main counts a reorder when both r1 and r2 are 0 after a round.
// param is unused; the loop never exits.
void *thread2Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
sem_wait(&beginSema2); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
Y = 1; // store
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only; emits no instruction
#endif
r2 = X; // load
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreStore / LoadLoad test, writer side (started by main when the argument is 2).
// Each round: waits on beginSema1, spins for a pseudo-random delay, stores
// X = 1 and then Y = 1, and finally posts endSema.
// main counts a reorder when the reader saw Y == 1 (r1) but X == 0 (r2).
// param is unused; the loop never exits.
void *thread3Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
sem_wait(&beginSema1); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
X = 1; // first store
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only
#endif
Y = 1; // second store
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreStore / LoadLoad test, reader side (started by main when the argument is 2).
// Each round: waits on beginSema2, spins for a pseudo-random delay, loads
// Y into r1 and then X into r2, and finally posts endSema.
// main counts a reorder when r1 == 1 and r2 == 0 after a round.
// param is unused; the loop never exits.
void *thread4Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
sem_wait(&beginSema2); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r1 = Y; // first load
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only
#endif
r2 = X; // second load
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
// LoadStore test, first worker (started by main when the argument is 3).
// Each round: waits on beginSema1, spins for a pseudo-random delay, loads
// X into r1, then stores Y = 1, and finally posts endSema.
// main counts a reorder when both r1 and r2 are 1 after a round.
// param is unused; the loop never exits.
void *thread5Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
sem_wait(&beginSema1); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r1 = X; // load
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only
#endif
Y = 1; // store
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
// LoadStore test, second worker (started by main when the argument is 3).
// Each round: waits on beginSema2, spins for a pseudo-random delay, loads
// Y into r2, then stores X = 1, and finally posts endSema.
// main counts a reorder when both r1 and r2 are 1 after a round.
// param is unused; the loop never exits.
void *thread6Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
sem_wait(&beginSema2); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r2 = Y; // load
#if USE_CPU_FENCE
asm volatile("mfence" ::: "memory"); // Prevent CPU reordering (also a compiler barrier via the "memory" clobber)
#else
asm volatile("" ::: "memory"); // Prevent compiler reordering only
#endif
X = 1; // store
sem_post(&endSema); // Notify transaction complete
}
return NULL; // Never returns
};
int main(int argc, char** argv)
{
// Check Argument
if (argc > 2)
{
printf("Too Many Arguments: Only Need One.
");
return 0;
}
if (argc == 1)
{
printf("You Should Give an Argument: 1 or 2 or 3.
");
return 0;
}
int i;
i = atoi(argv[1]);
if (i < 1 || i > 3)
{
printf("Wrong Argument: Only 1 or 2 or 3 Can be Input.
");
return 0;
}
// Initialize the semaphores
sem_init(&beginSema1, 0, 0);
sem_init(&beginSema2, 0, 0);
sem_init(&endSema, 0, 0);
// Spawn the threads
pthread_t thread1, thread2;
// 1. StoreLoad Reorder Tests
if (i == 1)
{
pthread_create(&thread1, NULL, thread1Func, NULL);
pthread_create(&thread2, NULL, thread2Func, NULL);
}
// 2. LoadLoad && StoreStore Reorder Tests
else if (i == 2)
{
pthread_create(&thread1, NULL, thread3Func, NULL);
pthread_create(&thread2, NULL, thread4Func, NULL);
}
// 3. LoadStore Reorder Tests
else
{
pthread_create(&thread1, NULL, thread5Func, NULL);
pthread_create(&thread2, NULL, thread6Func, NULL);
}
#if USE_SINGLE_HW_THREAD
// Force thread affinities to the same cpu core.
cpu_set_t cpus;
CPU_ZERO(&cpus);
CPU_SET(0, &cpus);
pthread_setaffinity_np(thread1, sizeof(cpu_set_t), &cpus);
pthread_setaffinity_np(thread2, sizeof(cpu_set_t), &cpus);
#endif
// Repeat the experiment ad infinitum
int detected = 0;
for (int iterations = 1; ; iterations++)
{
// Reset X and Y
X = 0;
Y = 0;
// Signal both threads
sem_post(&beginSema1);
sem_post(&beginSema2);
// Wait for both threads
sem_wait(&endSema);
sem_wait(&endSema);
// Check if there was a simultaneous reorder
// 1. StoreLoad Reorder
if (i == 1)
{
if (r1 == 0 && r2 == 0)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
// 2. LoadLoad && StoreStore Reorder
else if (i == 2)
{
if (r1 == 1 && r2 == 0)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
// 3. LoadStore Reorder
else
{
if (r1 == 1 && r2 == 1)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
}
return 0; // Never returns
}
機会があれば、ぜひネットでこのドキュメントを探して読んでみてください。その中で私が特に同意する観点が1つあります。それは、LOCK プレフィックス付きの命令はすべて、それ自体が memory barrier として働く、という点です。
btw:
以前のバージョンは Linux でしか実行できなかったため、今日は Windows で動作するバージョンをわざわざ用意しました。
// compile with vc 6.0
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
// Experiment switches. Set either of these to 1 to prevent CPU reordering:
//  - USE_CPU_FENCE: the worker threads execute cpuid between their two
//    memory operations instead of an empty __asm block.
//  - USE_SINGLE_HW_THREAD: main restricts both worker threads to affinity mask bit 0.
#define USE_CPU_FENCE 0
#define USE_SINGLE_HW_THREAD 0 // Supported on windows
//-------------------------------------
// MersenneTwister
// Small pseudo-random number generator used only to inject random
// timing delays. Each worker thread constructs its own instance, so
// no generator state is shared between threads.
//-------------------------------------
#define MT_IA 397  // offset of the partner word used by the twist step
#define MT_LEN 624 // number of words of generator state

// integer() is meant to stay out of line so that calling it acts as a
// compiler barrier. Request that wherever the compiler offers a way to do
// so; on compilers without one (e.g. VC 6.0) the macro expands to nothing.
#if defined(_MSC_VER) && _MSC_VER >= 1300
#define MT_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define MT_NOINLINE __attribute__((noinline))
#else
#define MT_NOINLINE
#endif

class MersenneTwister
{
    unsigned int m_buffer[MT_LEN]; // generator state
    int m_index;                   // next word of m_buffer to regenerate
public:
    // Fills the whole state with `seed`, then discards MT_LEN * 100 outputs.
    MersenneTwister(unsigned int seed);
    // Returns the next pseudo-random value.
    // Declared noinline (where supported) so that the call acts as a compiler barrier.
    MT_NOINLINE unsigned int integer();
};

MersenneTwister::MersenneTwister(unsigned int seed)
{
    int i; // declared once outside the loops, as in the original VC 6.0 port
    // Initialize by filling with the seed, then iterating
    // the algorithm a bunch of times to shuffle things up.
    for (i = 0; i < MT_LEN; i++)
        m_buffer[i] = seed;
    m_index = 0;
    for (i = 0; i < MT_LEN * 100; i++)
        integer();
}

unsigned int MersenneTwister::integer()
{
    // Indices, each wrapped around the end of the buffer
    int i = m_index;
    int i2 = m_index + 1;
    if (i2 >= MT_LEN)
        i2 = 0;
    int j = m_index + MT_IA;
    if (j >= MT_LEN)
        j -= MT_LEN;

    // Twist: top bit of the current word joined with the low 31 bits of the next
    unsigned int s = (m_buffer[i] & 0x80000000) | (m_buffer[i2] & 0x7fffffff);
    unsigned int r = m_buffer[j] ^ (s >> 1) ^ ((s & 1) * 0x9908B0DF);
    m_buffer[m_index] = r;
    m_index = i2;

    // Swizzle the regenerated word before handing it out
    r ^= (r >> 11);
    r ^= (r << 7) & 0x9d2c5680UL;
    r ^= (r << 15) & 0xefc60000UL;
    r ^= (r >> 18);
    return r;
}
//-------------------------------------
// Main program, as described in the post: state shared by main and the workers
//-------------------------------------
// Start signals: main releases each of them once per round to start a worker.
HANDLE beginSema1;
HANDLE beginSema2;
// Completion signal: every worker releases it once per round; main waits on it twice.
HANDLE endSema;
// Memory locations the workers store to / load from; main resets both to 0 each round.
int X;
int Y;
// Values recorded by the workers during a round; main inspects them afterwards.
int r1;
int r2;
// Worker pairs and the reordering each pair probes:
//   thread1Func, thread2Func : StoreLoad
//   thread3Func, thread4Func : StoreStore / LoadLoad
//   thread5Func, thread6Func : LoadStore
// StoreLoad test, first worker (started by main when the argument is 1).
// Each round: waits on beginSema1, spins for a pseudo-random delay, stores
// X = 1, then loads Y into r1, and finally releases endSema.
// main counts a reorder when both r1 and r2 are 0 after a round.
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread1Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
WaitForSingleObject(beginSema1, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
X = 1; // store
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
__asm {}; // Intended to prevent compiler reordering only — NOTE(review): confirm an empty __asm block has that effect on your compiler
#endif
r1 = Y; // load
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreLoad test, second worker (started by main when the argument is 1).
// Each round: waits on beginSema2, spins for a pseudo-random delay, stores
// Y = 1, then loads X into r2, and finally releases endSema.
// main counts a reorder when both r1 and r2 are 0 after a round.
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread2Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
WaitForSingleObject(beginSema2, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
Y = 1; // store
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
_asm {}; // Intended to prevent compiler reordering only (spelled _asm here, __asm in the sibling workers) — NOTE(review): confirm the effect on your compiler
#endif
r2 = X; // load
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreStore / LoadLoad test, writer side (started by main when the argument is 2).
// Each round: waits on beginSema1, spins for a pseudo-random delay, stores
// X = 1 and then Y = 1, and finally releases endSema.
// main counts a reorder when the reader saw Y == 1 (r1) but X == 0 (r2).
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread3Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
WaitForSingleObject(beginSema1, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
X = 1; // first store
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
__asm {}; // Intended to prevent compiler reordering only — NOTE(review): confirm on your compiler
#endif
Y = 1; // second store
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
// StoreStore / LoadLoad test, reader side (started by main when the argument is 2).
// Each round: waits on beginSema2, spins for a pseudo-random delay, loads
// Y into r1 and then X into r2, and finally releases endSema.
// main counts a reorder when r1 == 1 and r2 == 0 after a round.
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread4Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
WaitForSingleObject(beginSema2, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r1 = Y; // first load
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
__asm {}; // Intended to prevent compiler reordering only — NOTE(review): confirm on your compiler
#endif
r2 = X; // second load
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
// LoadStore test, first worker (started by main when the argument is 3).
// Each round: waits on beginSema1, spins for a pseudo-random delay, loads
// X into r1, then stores Y = 1, and finally releases endSema.
// main counts a reorder when both r1 and r2 are 1 after a round.
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread5Func(void *param)
{
MersenneTwister random(1);
for (;;)
{
WaitForSingleObject(beginSema1, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r1 = X; // load
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
__asm {}; // Intended to prevent compiler reordering only — NOTE(review): confirm on your compiler
#endif
Y = 1; // store
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
// LoadStore test, second worker (started by main when the argument is 3).
// Each round: waits on beginSema2, spins for a pseudo-random delay, loads
// Y into r2, then stores X = 1, and finally releases endSema.
// main counts a reorder when both r1 and r2 are 1 after a round.
// param is unused; the loop never exits.
// NOTE(review): main casts this function to LPTHREAD_START_ROUTINE although the
// signature differs — confirm this is acceptable for the target compiler.
void *thread6Func(void *param)
{
MersenneTwister random(2);
for (;;)
{
WaitForSingleObject(beginSema2, INFINITE); // Wait for main to start the round
while (random.integer() % 8 != 0) {} // Random delay: spin until a draw is a multiple of 8
// ----- THE TRANSACTION! -----
r2 = Y; // load
#if USE_CPU_FENCE
__asm {cpuid}; // Prevent CPU reordering
#else
__asm {}; // Intended to prevent compiler reordering only — NOTE(review): confirm on your compiler
#endif
X = 1; // store
ReleaseSemaphore(endSema, 1, 0); // Notify transaction complete
}
return NULL; // Never returns
};
int main(int argc, char** argv)
{
// Check Argument
if (argc > 2)
{
printf("Too Many Arguments: Only Need One.
");
return 0;
}
if (argc == 1)
{
printf("You Should Give an Argument: 1 or 2 or 3.
");
return 0;
}
int i;
i = atoi(argv[1]);
if (i < 1 || i > 3)
{
printf("Wrong Argument: Only 1 or 2 or 3 Can be Input.
");
return 0;
}
// Initialize the semaphores
beginSema1 = CreateSemaphore(NULL, 0, 1, 0);
beginSema2 = CreateSemaphore(NULL, 0, 1, 0);
endSema = CreateSemaphore(NULL, 0, 2, 0);
// Spawn the threads
HANDLE thread1, thread2;
// 1. StoreLoad Reorder Tests
if (i == 1)
{
thread1 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread1Func, 0, 0, NULL);
thread2 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread2Func, 0, 0, NULL);
}
// 2. LoadLoad && StoreStore Reorder Tests
else if (i == 2)
{
thread1 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread3Func, 0, 0, NULL);
thread2 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread4Func, 0, 0, NULL);
}
// 3. LoadStore Reorder Tests
else
{
thread1 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread5Func, 0, 0, NULL);
thread2 = CreateThread(NULL, 0, ( LPTHREAD_START_ROUTINE ) thread6Func, 0, 0, NULL);
}
#if USE_SINGLE_HW_THREAD
// Force thread affinities to the same cpu core.
SetThreadAffinityMask(thread1, 0x1 << 0);
SetThreadAffinityMask(thread2, 0x1 << 0);
#endif
// Repeat the experiment ad infinitum
int detected = 0;
for (int iterations = 1; ; iterations++)
{
// Reset X and Y
X = 0;
Y = 0;
// Signal both threads
ReleaseSemaphore(beginSema1, 1, 0);
ReleaseSemaphore(beginSema2, 1, 0);
// Wait for both threads
WaitForSingleObject(endSema, INFINITE);
WaitForSingleObject(endSema, INFINITE);
// Check if there was a simultaneous reorder
// 1. StoreLoad Reorder
if (i == 1)
{
if (r1 == 0 && r2 == 0)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
// 2. LoadLoad && StoreStore Reorder
else if (i == 2)
{
if (r1 == 1 && r2 == 0)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
// 3. LoadStore Reorder
else
{
if (r1 == 1 && r2 == 1)
{
detected++;
printf("%d reorders detected after %d iterations
", detected, iterations);
}
}
}
return 0; // Never returns
}