CUDA Thrustで時系列データのタイムスタンプでヒストグラムを作成する


CUDA ThrustはC++ STLのGPU版のようなもので、とても便利だ。しかもメチャクチャ速い。(`ー´)b

↓ このようなデータのタイムスタンプごとのヒストグラムを作成したいとする。


"2019/07/02 04:00:00.000","2019/07/02 04:04:04","2019-07-02T04:04:04Z","841","23.152.114.42","25846","9I","158.179.169.214","51321","ee","2hD","JFG2o2oas","MzG","MmWuH","ScsPLgdi","8","8Sd
dEBaWjIbLvBqnlX9j3LQN5I","912","198","336","769","278","554","rand-pa1"
"2019/07/02 04:00:00.000","2019/07/02 04:04:40","2019-07-02T04:04:40Z","478","248.99.6.15","41214","91","30.219.176.148","23907","xY","yJb","dguBRDbrb","N8i","fnSSU","uge0R9Ud","5","GkR6VA
Zr3vbshujEEFYYkSrMn7","953","917","636","718","142","607","rand-pa1"

Thrustでは、このように書くと。。

+c++
101 // Sort keys and values together so every value stays paired with its timestamp key.
102 thrust::sort_by_key(key_in.begin(), key_in.end(), value_in.begin());
103 // Sum the values of each run of equal (now adjacent) keys.
104 auto new_end = thrust::reduce_by_key(key_in.begin(),
105 key_in.end(),
106 value_in.begin(),
107 key_out.begin(),
108 value_out.begin());
+

↓ こういう感じで出力される。


20190702000000000,24236,
20190702010000000,15036,
20190702020000000,20739,
20190702030000000,18314,

2019/07/02の1時間ごとに、4列目の値を合計したヒストグラムになる(左がタイムスタンプのキー、右がその時間帯の合計値)。

サンプルはこちら↓
https://github.com/RuoAndo/qiita/tree/master/gpu/thrust/histogram

コードを見てみる。。。

 1  #include <cublas_v2.h>

 2  #include <string>
 3  #include <cstring>
 4  #include <cctype>
 5  #include <cstdlib>
 6  #include <cstdio>
 7  #include <iostream>
 8  #include <fstream>
 9  #include <bitset>

10  #include <thrust/host_vector.h>
11  #include <thrust/device_vector.h>
12  #include <thrust/generate.h>
13  #include <thrust/reduce.h>
14  #include <thrust/functional.h>
15  #include <thrust/random.h>
16  #include <thrust/sequence.h>

17  #include <stdio.h>
18  #include <iostream>

19  /*
20  #include "Utilities.cuh"
21  #include "TimingGPU.cuh"
22  */

23  #include "csv.hpp"
24  using namespace std;

25  int main( int argc, char* argv[] )
26  {
27    // Usage: <program> <csv_file> <N>. Prints "key,sum," per distinct timestamp (column 1), summing column 4.
28    if (argc < 3) {
29       cerr << "usage: " << argv[0] << " <csv_file> <N>" << endl;
30       return 1;
31    }
32    const int N = atoi(argv[2]);
33    if (N <= 0) {
34       cerr << "N must be a positive integer" << endl;
35       return 1;
36    }

37    // Host buffers: h_vec_1 holds the numeric timestamp keys, h_vec_2 the values to be summed.
38    thrust::host_vector<unsigned long long> h_vec_1(N);
39    thrust::host_vector<long> h_vec_2(N);

40    const string csv_file = std::string(argv[1]);
41    vector<vector<string>> data;

42    Csv objCsv(csv_file);
43    if (!objCsv.getCsv(data)) {
44       cout << "read ERROR" << endl;
45       return 1;
46    }

47    // Characters removed from the timestamp so that "2019/07/02 04:00:00.000" becomes 20190702040000000.
48    const char* const strip = "\"/: .";

49    // Stop at N rows: the host buffers have exactly N slots, so extra rows in the file are ignored.
50    for (size_t row = 0; row < data.size() && row < static_cast<size_t>(N); row++) {
51        const vector<string>& rec = data[row];
52        if (rec.size() < 4) {
53            cerr << "row " << row << ": too few columns" << endl;
54            return 1;
55        }
56        std::string timestamp = rec[0];
57        std::string bytes = rec[3];

58        for(size_t c = timestamp.find_first_of(strip); c != string::npos; c = timestamp.find_first_of(strip, c)){
59            timestamp.erase(c,1);
60          }

61        for(size_t c = bytes.find_first_of("\""); c != string::npos; c = bytes.find_first_of("\"", c)){
62            bytes.erase(c,1);
63          }

64        // std::stoull / std::stol throw std::invalid_argument or std::out_of_range on a malformed field.
65        h_vec_1[row] = std::stoull(timestamp);
66        h_vec_2[row] = std::stol(bytes);
67    }

68    int in_size = N;

69    thrust::device_vector<unsigned long long> key_in(N);
70    thrust::device_vector<long> value_in(N);

71    /*
72    thrust::device_vector<unsigned long long> key_in(in_size) = h_vec_1;
73    thrust::device_vector<long> value_in(in_size) = h_vec_2;
74    */

75    thrust::copy(h_vec_1.begin(), h_vec_1.end(), key_in.begin());
76    thrust::copy(h_vec_2.begin(), h_vec_2.end(), value_in.begin());

77    thrust::device_vector<unsigned long long> key_out(in_size, 0);
78    thrust::device_vector<long> value_out(in_size, 0);

79    // Sort keys and values together: sorting the keys alone would detach each value from its timestamp.
80    thrust::sort_by_key(key_in.begin(), key_in.end(), value_in.begin());

81    // Sum the values of each run of equal keys; new_end.first is the end of the distinct keys written.
82    auto new_end = thrust::reduce_by_key(key_in.begin(),
83                                       key_in.end(),
84                                       value_in.begin(),
85                                       key_out.begin(),
86                                       value_out.begin());

87    long new_size = new_end.first - key_out.begin();
88    for(long i=0; i < new_size;i++) {
89     std::cout << key_out[i] << "," << value_out[i] << "," << std::endl;
90    }
91     std::cout << std::endl;

92    return 0;
93  }

68行目までで、

h_vec_1には20190702000000000のように、区切り文字を取り除いて数値化したタイムスタンプ、
h_vec_2にはCSVの4列目の値(例: 841)が入っていることになる。(サンプルコードを参照してください)

ここから、75-76行目でGPUに値を転送する。

68    int in_size = N;

69    thrust::device_vector<unsigned long long> key_in(N);
70    thrust::device_vector<long> value_in(N);

71    /*
72    thrust::device_vector<unsigned long long> key_in(in_size) = h_vec_1;
73    thrust::device_vector<long> value_in(in_size) = h_vec_2;
74    */

75    thrust::copy(h_vec_1.begin(), h_vec_1.end(), key_in.begin());
76    thrust::copy(h_vec_2.begin(), h_vec_2.end(), value_in.begin());

実行してみる。。。



# g++ random_data.cpp
# time ./a.out 1000

# ./build-gpu.sh 4

# ./4 random_data.txt 1000
20190702000000000,24236,
20190702010000000,15036,
20190702020000000,20739,
20190702030000000,18314,
20190702040000000,25253,
20190702050000000,26673,
20190702060000000,22629,
20190702070000000,21106,
20190702080000000,25918,
20190702090000000,20456,
20190702100000000,20267,
20190702110000000,21369,
20190702120000000,14971,
20190702130000000,24321,
20190702140000000,14919,
20190702150000000,15799,
20190702160000000,27511,
20190702170000000,14720,
20190702180000000,21536,
20190702190000000,17116,
20190702200000000,16300,
20190702210000000,20417,
20190702220000000,22010,
20190702230000000,17901,

# time ./a.out 1000

# ./4 random_data.txt 1000
20190702000000000,22897,
20190702010000000,14236,
20190702020000000,25371,
20190702030000000,16396,
20190702040000000,16508,
20190702050000000,19085,
20190702060000000,22892,
20190702070000000,17482,
20190702080000000,26165,
20190702090000000,22288,
20190702100000000,15953,
20190702110000000,22415,
20190702120000000,22263,
20190702130000000,15309,
20190702140000000,17042,
20190702150000000,25743,
20190702160000000,28322,
20190702170000000,23870,
20190702180000000,24243,
20190702190000000,20595,
20190702200000000,21645,
20190702210000000,12285,
20190702220000000,18816,
20190702230000000,17696,

(`ー´)b