大規模文字列取得-trieツリーの圧縮
74196 ワード
本稿では,圧縮trieツリーを用いて文字列検索機能を実現する.まず文字列を符号化によりバイナリ列に変換し、その後trieツリーにバイナリ列を挿入し、挿入中に圧縮機能を同時に実現する.
文字符号化はHuffmanを用いたが,最終試験ではHuffmanを用いない方法で符号化時間を節約するだけでなくtrieツリーの挿入時間も減少することが分かった.
文字符号化はHuffmanを用いたが,最終試験ではHuffmanを用いない方法で符号化時間を節約するだけでなくtrieツリーの挿入時間も減少することが分かった.
1 /**
2
3 */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include "huffman.h"
8 #include "compress_trie.h"
9 //#include <time.h>
10
11 #define NUM_OF_HUFFMAN 81
12 #define LENGTH_OF_LINE 10000
13 #define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"
14 //#define EMAIL "strpool.dat"
15 //#define CHECKED_EMAIL "checkedemail.dat"
16 #define RESULT "result.dat"
17
18 void str_to_bin(char buf[],char binary[],huffman_node hufm[]);
19
20
21 int main(int argc, char *argv[])
22 {
23 //time_t time_start,time_end;
24 //time_start = time(NULL);
25
26 char* EMAIL = argv[1];
27 char* CHECKED_EMAIL = argv[2];
28
29 huffman_node hufm[NUM_OF_HUFFMAN];
30 hufm_init(hufm,NUM_OF_HUFFMAN);
31 char buf[LENGTH_OF_LINE];
32 char binary[LENGTH_OF_LINE];
33
34 FILE* fin_of_huffman;
35 fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");
36 if(fin_of_huffman == NULL)
37 {
38 hufm_init(hufm,NUM_OF_HUFFMAN);
39 int i;
40 for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)
41 {
42 hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;
43 }
44 huffman_coding(hufm,NUM_OF_HUFFMAN);
45 }
46 else
47 {
48 char temp_char;
49 int i;
50 for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)
51 {
52 fgets(buf,sizeof(buf),fin_of_huffman);
53 sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);
54 }
55 }
56 fclose(fin_of_huffman);
57
58 printf("building trie...");
59 FILE* fin_of_email;
60 fin_of_email = fopen(EMAIL,"r");
61 trie_node *root;
62 root = (trie_node*)malloc(sizeof(trie_node));
63 trie_node_init(&root);
64
65 while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)
66 {
67 str_to_bin(buf,binary,hufm);
68 trie_insert(&root,binary);
69 }
70 fclose(fin_of_email);
71 printf("\r");
72 printf("build trie success.
");
73
74 FILE *fin_of_checked,*fout_of_result;
75 fin_of_checked = fopen(CHECKED_EMAIL,"r");
76 fout_of_result = fopen(RESULT,"w");
77 int num_yes = 0;
78 int num_no = 0;
79 while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)
80 {
81 str_to_bin(buf,binary,hufm);
82 if(trie_search(root,binary))
83 {
84 fprintf(fout_of_result,"YES
");
85 num_yes++;
86 }
87 else
88 {
89 fprintf(fout_of_result,"NO
");
90 num_no++;
91 }
92 }
93 fprintf(fout_of_result,"num of YES is:%d
",num_yes);
94 fprintf(fout_of_result,"num of NO is:%d
",num_no);
95 printf("search success!
");
96 fclose(fin_of_checked);
97 fclose(fout_of_result);
98 //time_end = time(NULL);
99 //printf(" :%.0lfs
", difftime(time_end, time_start));
100 return 0;
101 }
102
103
104 void str_to_bin(char buf[],char binary[],huffman_node hufm[])
105 {
106 int i;
107 binary[0] = '\0';
108 for(i=strlen(buf)-1;i>=0;i--)
109 {
110 if(buf[i]>='a' && buf[i]<='z')
111 {
112 strcat(binary,hufm[buf[i]-'a'].code);
113 }
114 else if(buf[i]>='A' && buf[i]<='Z')
115 {
116 strcat(binary,hufm[buf[i]-'A'].code);
117 }
118 else if(buf[i]>='0' && buf[i]<='9')
119 {
120 strcat(binary,hufm[26+buf[i]-'0'].code);
121 }
122 else if(buf[i]=='_')
123 {
124 strcat(binary,hufm[36].code);
125 }
126 else if(buf[i]=='-')
127 {
128 strcat(binary,hufm[37].code);
129 }
130 else if(buf[i]=='.')
131 {
132 strcat(binary,hufm[38].code);
133 }
134 else if(buf[i]=='@')
135 {
136 strcat(binary,hufm[39].code);
137 }
138 else
139 {
140 strcat(binary,hufm[40].code);
141 }
142 }
143 }
1 /**
2 trie , 。
3 */
4
5 typedef struct TRIE_NODE
6 {
7 char is_str;
8 unsigned short num_of_bit;
9 unsigned char* compress_of_bit;
10 struct TRIE_NODE *point_of_zero,*point_of_one;
11 }trie_node;
12
13 //long int temp_of_new = 0;
14
15
16 void trie_node_init(trie_node **root);
17 int trie_insert(trie_node **root,char* bit_of_insert);
18 int trie_search(trie_node *root,char* bit_of_insert);
19 void trie_delete(trie_node *root);
20 void compress(trie_node *root,char* bit_of_insert);
21 int compare_of_bit(trie_node *root,char* bit_of_insert);
22 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop);
23
24
25
26
27 void trie_node_init(trie_node **root)
28 {
29 (*root)->is_str = (char)0;
30 (*root)->num_of_bit = 0;
31 (*root)->compress_of_bit = NULL;
32 (*root)->point_of_zero = NULL;
33 (*root)->point_of_one = NULL;
34 }
35
36 void compress(trie_node *root,char* bit_of_insert)
37 {
38 int i,j,len_of_insert;
39 len_of_insert = strlen(bit_of_insert);
40 root->num_of_bit = len_of_insert;
41 if(root->num_of_bit<=32)
42 {
43 int temp;
44 for(i=len_of_insert-1,j=0;i>=0;i--,j++)
45 {
46 if(bit_of_insert[i] == '0')
47 {
48 clearbit(temp,j);
49 }
50 else
51 {
52 setbit(temp,j);
53 }
54 }
55 root->compress_of_bit = (unsigned char*)temp;
56 }
57 else
58 {
59 root->compress_of_bit = (unsigned char*)malloc((len_of_insert%8)?(len_of_insert/8+1):(len_of_insert/8));
60 for(i=len_of_insert-1,j=0;i>=0;i--,j++)
61 {
62 if(bit_of_insert[i] == '0')
63 {
64 clearbit(root->compress_of_bit[j/8],j%8);
65 }
66 else
67 {
68 setbit(root->compress_of_bit[j/8],j%8);
69 }
70 }
71 }
72 }
73
74
75 int trie_insert(trie_node **root,char* bit_of_insert)
76 {
77 int ret;
78 char bit_of_pop[10000];
79 if(root == NULL)
80 {
81 ret = 0;
82 }
83 else
84 {
85 if((*root)->num_of_bit == 0)
86 {
87 if(!(*bit_of_insert))
88 {
89 (*root)->is_str = (char)1;
90 ret = 1;
91 }
92 else
93 {
94 if((*root)->is_str == 0
95 && (*root)->point_of_zero == NULL
96 && (*root)->point_of_one == NULL)
97 {
98 compress((*root),bit_of_insert);
99 (*root)->is_str = (char)1;
100 ret = 1;
101 }
102 else
103 {
104 if(*bit_of_insert == '0')
105 {
106 if((*root)->point_of_zero == NULL)
107 {
108 (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
109 trie_node_init(&(*root)->point_of_zero);
110 //temp_of_new++;
111 }
112 ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);
113 }
114 else
115 {
116 if((*root)->point_of_one == NULL)
117 {
118 (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));
119 trie_node_init(&(*root)->point_of_one);
120 //temp_of_new++;
121 }
122 ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);
123 }
124 }
125 }
126 }
127 else
128 {
129 int ans_of_compare = compare_of_bit((*root),bit_of_insert);
130 if(ans_of_compare == 0)
131 {
132 trie_node *father = (trie_node*)malloc(sizeof(trie_node));
133 trie_node_init(&father);
134 //temp_of_new++;
135 pop_bit((*root),bit_of_pop,1);
136 if(bit_of_pop[0] == '0')
137 {
138 father->point_of_zero = (*root);
139 }
140 else
141 {
142 father->point_of_one = (*root);
143 }
144 if(!(*bit_of_insert))
145 {
146 father->is_str = (char)1;
147 ret = 1;
148 }
149 else
150 {
151 if(*bit_of_insert == '0')
152 {
153 father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
154 trie_node_init(&father->point_of_zero);
155 //temp_of_new++;
156 ret = trie_insert(&father->point_of_zero,bit_of_insert+1);
157 }
158 else
159 {
160 father->point_of_one = (trie_node*)malloc(sizeof(trie_node));
161 trie_node_init(&father->point_of_one);
162 //temp_of_new++;
163 ret = trie_insert(&father->point_of_one,bit_of_insert+1);
164 }
165 }
166 (*root) = father;
167 }
168 else
169 {
170 if(ans_of_compare == (int)(*root)->num_of_bit
171 && ans_of_compare == strlen(bit_of_insert))
172 {
173 (*root)->is_str = (char)1;
174 ret = 1;
175 }
176 else if(ans_of_compare == (int)(*root)->num_of_bit)
177 {
178 bit_of_insert += ans_of_compare;
179 if(*bit_of_insert == '0')
180 {
181 if((*root)->point_of_zero == NULL)
182 {
183 (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
184 trie_node_init(&(*root)->point_of_zero);
185 //temp_of_new++;
186 }
187 ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);
188 }
189 else
190 {
191 if((*root)->point_of_one == NULL)
192 {
193 (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));
194 trie_node_init(&(*root)->point_of_one);
195 //temp_of_new++;
196 }
197 ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);
198 }
199 }
200 else if(ans_of_compare == strlen(bit_of_insert))
201 {
202 trie_node *father = (trie_node*)malloc(sizeof(trie_node));
203 trie_node_init(&father);
204 //temp_of_new++;
205 pop_bit((*root),bit_of_pop,ans_of_compare);
206 compress(father,bit_of_pop);
207 father->is_str = (char)1;
208 pop_bit((*root),bit_of_pop,1);
209 if(bit_of_pop[0] == '0')
210 {
211 father->point_of_zero = (*root);
212 }
213 else
214 {
215 father->point_of_one = (*root);
216 }
217 (*root) = father;
218 }
219 else
220 {
221 trie_node *father = (trie_node*)malloc(sizeof(trie_node));
222 trie_node_init(&father);
223 //temp_of_new++;
224 pop_bit((*root),bit_of_pop,ans_of_compare);
225 compress(father,bit_of_pop);
226 pop_bit((*root),bit_of_pop,1);
227 bit_of_insert += ans_of_compare+1;
228
229 if(bit_of_pop[0] == '0')
230 {
231 father->point_of_zero = (*root);
232 father->point_of_one = (trie_node*)malloc(sizeof(trie_node));
233 trie_node_init(&father->point_of_one);
234 //temp_of_new++;
235 ret = trie_insert(&father->point_of_one,bit_of_insert);
236 }
237 else
238 {
239 father->point_of_one = (*root);
240 father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
241 trie_node_init(&father->point_of_zero);
242 //temp_of_new++;
243 ret = trie_insert(&father->point_of_zero,bit_of_insert);
244 }
245 (*root) = father;
246 }
247 }
248 }
249 }
250 return ret;
251 }
252
253
254 int trie_search(trie_node *root,char *bit_of_search)
255 {
256 trie_node *p = root;
257 while(p!=NULL && *bit_of_search)
258 {
259 if(p->num_of_bit!=0)
260 {
261 if((int)p->num_of_bit == compare_of_bit(p,bit_of_search))
262 {
263 bit_of_search += (int)p->num_of_bit;
264 }
265 else
266 {
267 p=NULL;
268 break;
269 }
270 }
271 if(!(*bit_of_search))
272 {
273 break;
274 }
275 if(bit_of_search[0]=='0')
276 {
277 p = p->point_of_zero;
278 bit_of_search++;
279 }
280 else if(bit_of_search[0]=='1')
281 {
282 p = p->point_of_one;
283 bit_of_search++;
284 }
285 if(!(*bit_of_search) && p!=NULL && p->num_of_bit!=0)
286 {
287 p=NULL;
288 break;
289 }
290 }
291 if(p!=NULL)
292 {
293 return p->is_str;
294 }
295 else
296 {
297 return 0;
298 }
299 }
300
301
302 void trie_delete(trie_node *root)
303 {
304 if(root == NULL)
305 return;
306 trie_delete(root->point_of_zero);
307 trie_delete(root->point_of_one);
308 free(root);
309 }
310
311
312 int compare_of_bit(trie_node *root,char* bit_of_insert)
313 {
314 int len_of_insert = strlen(bit_of_insert);
315 int i,j,tempbit;
316 if(root->num_of_bit<=32)
317 {
318 for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)
319 {
320 tempbit = getbit((int)root->compress_of_bit,j);
321 if(bit_of_insert[i]-'0' != tempbit)
322 {
323 break;
324 }
325 }
326 }
327 else
328 {
329 for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)
330 {
331 tempbit = getbit(root->compress_of_bit[j/8],j%8);
332 if(bit_of_insert[i]-'0' != tempbit)
333 {
334 break;
335 }
336 }
337 }
338 return i;
339 }
340
341 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop)
342 {
343 int i,j;
344 short num_of_bit = root->num_of_bit - (short)len_of_pop;
345 if(root->num_of_bit<=32)
346 {
347 for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)
348 {
349 bit_of_pop[i] = getbit((int)root->compress_of_bit,j) +'0';
350 }
351 bit_of_pop[i] = '\0';
352 }
353 else
354 {
355 for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)
356 {
357 bit_of_pop[i] = getbit(root->compress_of_bit[j/8],j%8) +'0';
358 }
359 bit_of_pop[i] = '\0';
360
361 if(num_of_bit == 0)
362 {
363 free(root->compress_of_bit);
364 }
365 else if(num_of_bit<=32)
366 {
367 int temp;
368 for(j=num_of_bit-1;j>=0;j--)
369 {
370 if(getbit(root->compress_of_bit[j/8],j%8) == 0)
371 {
372 clearbit(temp,j);
373 }
374 else
375 {
376 setbit(temp,j);
377 }
378 }
379 free(root->compress_of_bit);
380 root->compress_of_bit = (unsigned char*)temp;
381 }
382 else
383 {
384 unsigned char *p;
385 short num_of_byte = (num_of_bit%8)?(num_of_bit/8+1):(num_of_bit/8);
386 if(((root->num_of_bit%8)?(root->num_of_bit/8+1):(root->num_of_bit/8)) != num_of_byte)
387 {
388 p = (unsigned char*)malloc(num_of_byte);
389 short i;
390 for(i=0;i<num_of_byte;i++)
391 {
392 p[i] = root->compress_of_bit[i];
393 }
394 free(root->compress_of_bit);
395 root->compress_of_bit = p;
396 }
397 }
398 }
399 root->num_of_bit = num_of_bit;
400 }