最大K個数を探す(C言語実装)

4072 ワード

100億個の整数、最大の1万個の数を求めて、そしてアルゴリズムの時間の複雑さを言い出します

アルゴリズム:100億個の数をすべてメモリに読み込むと、100万0000*4 Bの約40 Gのメモリが必要になります.これは明らかに現実的ではありません.
ファイルから1つの数を読むたびに、最小スタックのスタック要素と比較して、スタック要素より大きい場合は、メモリに10000サイズの最小スタックを維持できます.
スタックトップ要素を置き換え、スタックを調整します.最後に残ったスタック内の要素は最大の1万個数であり,アルゴリズムの複雑度はO(Nlogn)である.

実装:ファイルからデータを読むにはこだわりがあり、毎回1つの数だけ読み取り、効率が低すぎる場合は、入力バッファを維持し、大きなデータをメモリに一度に読み込むことができます.
使い終わったらまたファイルから読んで、このように効率が高くて、バッファの大きさもこだわりがあって、普通は4 KBの整数倍にして、ディスクのブロックの大きさが普通なためです
4 KBです

/*    gcc main.c
 *       : dd if=/dev/urandom of=random.dat bs=1M count=1024
 *   : ./a.out random.dat 100
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>

static unsigned int BUF_PAGES;		/*        page */
static unsigned int PAGE_SIZE;		/* page    */
static unsigned int BUF_SIZE;		/*       , BUF_SIZE = BUF_PAGES*PAGE_SIZE */

static int *buffer;					/*       */
static int *heap;					/*     */

long get_time_usecs();
void init_heap(int n);
void adjust(int n, int i);

int main(int argc, char **argv)
{
	unsigned int		K, idx, length;
	int					fd, i, bytes, element;

	long start_usecs = get_time_usecs();

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		printf("can't open file %s
",argv[1]);
		exit(0);
	}

	PAGE_SIZE = 4096;							/* page = 4KB */
	BUF_PAGES = 512;
	BUF_SIZE = PAGE_SIZE*BUF_PAGES;				/* 4KB*512 = 2M */
	buffer = (int *)malloc(BUF_SIZE);
	if (buffer == NULL) exit(0);

	K = atoi(argv[2]);
	heap = (int *)malloc(sizeof(int)*(K+1));
	if (heap == NULL) {
		free(buffer);
		exit(0);
	}

	bytes = read(fd, heap+1, K*sizeof(int));
	if (bytes < K*sizeof(int)) {
		printf("data size is too small
");
		exit(0);
	}
	init_heap(K);

	idx = length = 0;
	for (;;) {
		if (idx == length) {	/*         */
			bytes = read(fd, buffer, BUF_SIZE);
			if (bytes == 0) break;
			length = bytes/4;
			idx = 0;
		}
		// buffer     ，          ，    
		element = buffer[idx++];
		if (element > heap[1]) {
			heap[1] = element;
			adjust(K, 1);
		}
	}

	long end_usecs = get_time_usecs();

	printf("the top %d numbers are: 
", K);
	for (i = 1; i <= K; i++) {
		printf("%d ", heap[i]);
		if (i % 6 == 0) {
			printf("
");
		}
	}
	printf("
");

	free(buffer);
	free(heap);
	close(fd);

	double secs = (double)(end_usecs - start_usecs) / (double)1000000;
	printf("program tooks %.02f seconds.
", secs);

	return 0;
}

void init_heap(int n) 
{
	int		i;

	for (i = n/2; i > 0; i--) {
		adjust(n, i);
	}
}

/*   i             ,     
 *        i           */
void adjust(int n, int i)
{
	heap[0] = heap[i];
	i <<= 1;
	while (i <= n) {
		if (i < n && heap[i+1] < heap[i]) {
			i++;
		}
		if (heap[i] >= heap[0]) {
			break;
		}
		heap[i>>1] = heap[i];
		i <<= 1;
	}
	heap[i>>1] = heap[0];
}

long get_time_usecs()
{
	struct timeval time;
	struct timezone tz;
	memset(&tz, '\0', sizeof(struct timezone));
	gettimeofday(&time, &tz);
	long usecs = time.tv_sec*1000000 + time.tv_usec;

	return usecs;
}

テストと実行:コードのコメントを参照してください.主にテストデータを生成する方法です.
linuxはテストデータを生成するコマンドを提供し、直接呼び出せばいいです.
dd if=/dev/urandom of=random.dat bs=1M count=1024
このようにして生成されたテストデータはバイナリであり,美4バイトを1つの整数として用いた.

Markdown, wiki markup, html5そしてWINI: 簡易マークアップ言語書法の比較早見表（チートシート）

phpファイル名を一括変更する方法