DepthwiseConvolutional層をdarknetフレームワークに基づいて実現

39375 ワード

darknetフレームワークに基づくDepthwiseConvolutionalの実装


詳細は次のとおりです.https://github.com/ChenYingpeng/darknet-mobilenetMobilenet v 1のプロファイルとモデルが含まれており、Imagenetでテストしたtop 1:72.03%、top 5:90.514%である.

一、depthwiseを追加するconvolutional_layer


1は/src/parser.cに次のコードを追加します。


1)ヘッダファイルの追加
#include "utils.h"
++ #include "depthwise_convolutional_layer.h" //added by chen

2)string_to_layer_type関数で、ネットワーク層タイプ解析を追加します.
 
if (strcmp(type, "[upsample]")==0) return UPSAMPLE;
++ if (strcmp(type, "[depthwise_convolutional]") == 0) return DEPTHWISE_CONVOLUTIONAL; //added by chen
    return BLANK;

3)parse_network_cfg関数では、ネットワーク層解析を追加して構築します.
if(lt == CONVOLUTIONAL){
    l = parse_convolutional(options, params);
}
++ else if (lt == DEPTHWISE_CONVOLUTIONAL) {
++    l = parse_depthwise_convolutional(options, params); //added by chen
++ }
else if(lt == DECONVOLUTIONAL){
    l = parse_deconvolutional(options, params);
}

parse_depthwise_convolutionalは自分で書く必要があります.以下はそのコードです.
//added by chen
depthwise_convolutional_layer parse_depthwise_convolutional(list *options, size_params params)
{
	int size = option_find_int(options, "size", 1);
	int stride = option_find_int(options, "stride", 1);
	int pad = option_find_int_quiet(options, "pad", 0);
	int padding = option_find_int_quiet(options, "padding", 0);
	if (pad) padding = size / 2;

	char *activation_s = option_find_str(options, "activation", "logistic");
	ACTIVATION activation = get_activation(activation_s);

	int batch, h, w, c;
	h = params.h;
	w = params.w;
	c = params.c;
	batch = params.batch;
	if (!(h && w && c)) error("Layer before convolutional layer must output image.");
	int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);


	depthwise_convolutional_layer layer = make_depthwise_convolutional_layer(batch, h, w, c, size, stride, padding, activation, batch_normalize);
	layer.flipped = option_find_int_quiet(options, "flipped", 0);
	layer.dot = option_find_float_quiet(options, "dot", 0);

	return layer;
}

4)load_weights_upto関数では、ネットワーク層読み出しパラメータ関数を追加します.
 if (l.dontload) continue;
 ++ if (l.type == DEPTHWISE_CONVOLUTIONAL) {
 ++   	load_depthwise_convolutional_weights(l, fp);//added by chen
 ++   }

ここでload_depthwise_convolutional_Weights関数は、以下のように独自に作成する必要があります.
 
//added by chen
void load_depthwise_convolutional_weights(layer l, FILE *fp)
{

	int num = l.n*l.size*l.size;
	fread(l.biases, sizeof(float), l.n, fp);
	if (l.batch_normalize && (!l.dontloadscales)) {
		fread(l.scales, sizeof(float), l.n, fp);
		fread(l.rolling_mean, sizeof(float), l.n, fp);
		fread(l.rolling_variance, sizeof(float), l.n, fp);
		if (0) {
			int i;
			for (i = 0; i < l.n; ++i) {
				printf("%g, ", l.rolling_mean[i]);
			}
			printf("
"); for (i = 0; i < l.n; ++i) { printf("%g, ", l.rolling_variance[i]); } printf("
"); } if (0) { fill_cpu(l.n, 0, l.rolling_mean, 1); fill_cpu(l.n, 0, l.rolling_variance, 1); } } fread(l.weights, sizeof(float), num, fp); if (l.flipped) { //transpose_matrix(l.weights, l.c*l.size*l.size, l.n);//hjimce } #ifdef GPU if (gpu_index >= 0) { push_depthwise_convolutional_layer(l); } #endif }

5)save_weights_upto関数では、ネットワーク層保存パラメータ関数を追加します.
if (l.dontsave) continue;
++ if (l.type == DEPTHWISE_CONVOLUTIONAL) {
++    save_depthwise_convolutional_weights(l, fp); //added by chen
++ }

そのうちsave_depthwise_convolutional_Weights関数は、以下のように独自に作成する必要があります.
//added by chen
void save_depthwise_convolutional_weights(layer l, FILE *fp)
{
#ifdef GPU
	if (gpu_index >= 0) {
		pull_depthwise_convolutional_layer(l);
	}
#endif
	int num = l.n*l.size*l.size;
	fwrite(l.biases, sizeof(float), l.n, fp);
	if (l.batch_normalize) {
		fwrite(l.scales, sizeof(float), l.n, fp);
		fwrite(l.rolling_mean, sizeof(float), l.n, fp);
		fwrite(l.rolling_variance, sizeof(float), l.n, fp);
	}
	fwrite(l.weights, sizeof(float), num, fp);
}

2は/src/network.cに、以下のコードを追加する。


1)ヘッダファイルを追加する;
#include "data.h"
++ #include "depthwise_convolutional_layer.h" // added by chen

2)ネットワーク層を追加する;
if(l.type == CONVOLUTIONAL){
    resize_convolutional_layer(&l, w, h);
}
++ else if(l.type == DEPTHWISE_CONVOLUTIONAL){
++    resize_depthwise_convolutional_layer(&l, w, h); //added by chen
++ }

3は/include/darknet.hでは、ネットワーク層列挙タイプを追加する。


darknet.hファイルの列挙タイプLAYER_TYPEは、ネットワーク層の列挙タイプを追加し、コードは以下の通りである.
    BLANK,
   ++ DEPTHWISE_CONVOLUTIONAL //added by chen

4 DepthwiseConvolutionalインプリメンテーションコードdepthwise_を/src/ディレクトリに追加convolutional_layer.h depthwise_convolutional_layer.c depthwise_convolutional_kernels.cu;


   /src/depthwise_convolutional_layer.h
#ifndef DEPTHWISE_CONVOLUTIONAL_LAYER_H
#define DEPTHWISE_CONVOLUTIONAL_LAYER_H

#include "cuda.h"
#include "image.h"
#include "activations.h"
#include "layer.h"
#include "network.h"

typedef layer depthwise_convolutional_layer;

#ifdef GPU
void forward_depthwise_convolutional_layer_gpu(depthwise_convolutional_layer layer, network net);
void backward_depthwise_convolutional_layer_gpu(depthwise_convolutional_layer layer, network net);
void update_depthwise_convolutional_layer_gpu(depthwise_convolutional_layer layer, update_args a);

void push_depthwise_convolutional_layer(depthwise_convolutional_layer layer);
void pull_depthwise_convolutional_layer(depthwise_convolutional_layer layer);

void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
#ifdef CUDNN
void cudnn_depthwise_convolutional_setup(layer *l);
#endif
#endif

depthwise_convolutional_layer make_depthwise_convolutional_layer(int batch, int h, int w, int c, int size, int stride, int padding, ACTIVATION activation, int batch_normalize);
void resize_depthwise_convolutional_layer(depthwise_convolutional_layer *layer, int w, int h);
void forward_depthwise_convolutional_layer(const depthwise_convolutional_layer layer, network net);
void update_depthwise_convolutional_layer(depthwise_convolutional_layer layer, update_args a);


void denormalize_depthwise_convolutional_layer(depthwise_convolutional_layer l);
void backward_depthwise_convolutional_layer(depthwise_convolutional_layer layer, network net);

void add_bias(float *output, float *biases, int batch, int n, int size);
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);



int depthwise_convolutional_out_height(depthwise_convolutional_layer layer);
int depthwise_convolutional_out_width(depthwise_convolutional_layer layer);

#endif

/src/depthwise_convolutional_layer.c
#include "depthwise_convolutional_layer.h"
#include "utils.h"
#include "batchnorm_layer.h"
#include "im2col.h"
#include "col2im.h"
#include "blas.h"
#include "gemm.h"
#include 
#include 





int depthwise_convolutional_out_height(depthwise_convolutional_layer l)
{
    return (l.h + 2*l.pad - l.size) / l.stride + 1;
}

int depthwise_convolutional_out_width(depthwise_convolutional_layer l)
{
    return (l.w + 2*l.pad - l.size) / l.stride + 1;
}


//ᅵᅵʱᅵᅵᅵݿՌᅵᅵÐ?
static size_t get_workspace_size(layer l){
#ifdef CUDNN
    if(gpu_index >= 0){
        size_t most = 0;
        size_t s = 0;
        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc,
                l.weightDesc,
                l.convDesc,
                l.dstTensorDesc,
                l.fw_algo,
                &s);
        if (s > most) most = s;
        cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc,
                l.ddstTensorDesc,
                l.convDesc,
                l.dweightDesc,
                l.bf_algo,
                &s);
        if (s > most) most = s;
        cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
                l.weightDesc,
                l.ddstTensorDesc,
                l.convDesc,
                l.dsrcTensorDesc,
                l.bd_algo,
                &s);
        if (s > most) most = s;
        return most;
    }
#endif
    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
}


#ifdef GPU
#ifdef CUDNN
void cudnn_depthwise_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, l->c, l->size, l->size);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, l->c, l->size, l->size);
    #if CUDNN_MAJOR >= 6
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION,CUDNN_DATA_FLOAT);
    #else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    #endif
    /*cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);*/
}
#endif
#endif

depthwise_convolutional_layer make_depthwise_convolutional_layer(int batch, int h, int w, int c,int size, int stride, int padding, ACTIVATION activation, int batch_normalize)
{
    int i;
	depthwise_convolutional_layer l = {0};
    l.type = DEPTHWISE_CONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.n = c;
	l.c = c;

    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = padding;
    l.batch_normalize = batch_normalize;

    l.weights = calloc(l.n*size*size, sizeof(float));
    l.weight_updates = calloc(l.n*size*size, sizeof(float));

    l.biases = calloc(l.n, sizeof(float));
    l.bias_updates = calloc(l.n, sizeof(float));

    l.nweights = l.n*size*size;
    l.nbiases = l.n;

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    //scale = .02;
   //for(i = 0; i < c*size*size; ++i) l.weights[i] = 0.01*i;
    for(i = 0; i < l.n*l.size*l.size; ++i) l.weights[i] = scale*rand_normal();
    int out_w = depthwise_convolutional_out_width(l);
    int out_h = depthwise_convolutional_out_height(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = l.n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*l.outputs, sizeof(float));
    l.delta  = calloc(l.batch*l.outputs, sizeof(float));

    l.forward = forward_depthwise_convolutional_layer;
    l.backward = backward_depthwise_convolutional_layer;
    l.update = update_depthwise_convolutional_layer;


    if(batch_normalize){
        l.scales = calloc(c, sizeof(float));
        l.scale_updates = calloc(c, sizeof(float));
        for(i = 0; i < c; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(c, sizeof(float));
        l.variance = calloc(c, sizeof(float));

        l.mean_delta = calloc(c, sizeof(float));
        l.variance_delta = calloc(c, sizeof(float));

        l.rolling_mean = calloc(c, sizeof(float));
        l.rolling_variance = calloc(c, sizeof(float));
        l.x = calloc(l.batch*l.outputs, sizeof(float));
        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
    }


#ifdef GPU
    l.forward_gpu = forward_depthwise_convolutional_layer_gpu;
    l.backward_gpu = backward_depthwise_convolutional_layer_gpu;
    l.update_gpu = update_depthwise_convolutional_layer_gpu;

    if(gpu_index >= 0){


        l.weights_gpu = cuda_make_array(l.weights, c*size*size);
        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*size*size);

        l.biases_gpu = cuda_make_array(l.biases, c);
        l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);

        l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*c);
        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*c);



        if(batch_normalize){
            l.mean_gpu = cuda_make_array(l.mean, c);
            l.variance_gpu = cuda_make_array(l.variance,c);

            l.rolling_mean_gpu = cuda_make_array(l.mean, c);
            l.rolling_variance_gpu = cuda_make_array(l.variance, c);

            l.mean_delta_gpu = cuda_make_array(l.mean, c);
            l.variance_delta_gpu = cuda_make_array(l.variance, c);

            l.scales_gpu = cuda_make_array(l.scales, c);
            l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);

            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*c);
            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*c);
        }
#ifdef CUDNN
        cudnnCreateTensorDescriptor(&l.normTensorDesc);
        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
        cudnnCreateFilterDescriptor(&l.weightDesc);
        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
        cudnnCreateFilterDescriptor(&l.dweightDesc);
        cudnnCreateConvolutionDescriptor(&l.convDesc);
        cudnn_depthwise_convolutional_setup(&l);
#endif
    }
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    fprintf(stderr, "dw conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d
", c, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); return l; } void resize_depthwise_convolutional_layer(depthwise_convolutional_layer *l, int w, int h) { l->w = w; l->h = h; int out_w = depthwise_convolutional_out_width(*l); int out_h = depthwise_convolutional_out_height(*l); l->out_w = out_w; l->out_h = out_h; l->outputs = l->out_h * l->out_w * l->out_c; l->inputs = l->w * l->h * l->c; l->output = realloc(l->output, l->batch*l->outputs * sizeof(float)); l->delta = realloc(l->delta, l->batch*l->outputs * sizeof(float)); if (l->batch_normalize) { l->x = realloc(l->x, l->batch*l->outputs * sizeof(float)); l->x_norm = realloc(l->x_norm, l->batch*l->outputs * sizeof(float)); } #ifdef GPU cuda_free(l->delta_gpu); cuda_free(l->output_gpu); l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); if (l->batch_normalize) { cuda_free(l->x_gpu); cuda_free(l->x_norm_gpu); l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs); l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs); } #ifdef CUDNN cudnn_depthwise_convolutional_setup(l); #endif #endif l->workspace_size = get_workspace_size(*l); } /*void test_depthwise_convolutional_layer() { #include "softmax_layer.h" #include "avgpool_layer.h" #include "cost_layer.h" float data[] = {1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 2,2,2,2,2, 2,2,2,2,2, 2,2,2,2,2, 2,2,2,2,2, 2,2,2,2,2, 3,3,3,3,3, 3,3,3,3,3, 3,3,3,3,3, 3,3,3,3,3, 3,3,3,3,3}; float truth[] = { 0,0,1 }; float delta[75] = {0 }; int num_layer = 4; network net = make_network(num_layer); net.h=5; net.w=5; net.c=3; net.batch = 1; net.input = data; net.truth = truth; net.train = 1; depthwise_convolutional_layer depthwise_conv1 = make_depthwise_convolutional_layer(net.batch, net.h, net.w, net.c, 3, 1, 0, RELU, 0); avgpool_layer global_avgpool1 = make_avgpool_layer(net.batch, depthwise_conv1.out_w, depthwise_conv1.out_h, depthwise_conv1.n); softmax_layer softmax_1 = make_softmax_layer(net.batch, depthwise_conv1.n, 1); softmax_1.temperature = 1;//ᅵᅵᅵᅵȱᅵᅵ cost_layer cost_1 = make_cost_layer(net.batch, depthwise_conv1.n, SSE, 1); net.layers[0] = depthwise_conv1; net.layers[1] = global_avgpool1; net.layers[2] = softmax_1; net.layers[3] = cost_1; net.workspace = calloc(1, 75); for (int i = 0; i < net.n; ++i) { net.index = i; layer l = net.layers[i]; if (l.delta) { fill_cpu(l.outputs * l.batch, 0, l.delta, 1); } l.forward(l, net); net.input = l.output; if (l.truth) { net.truth = l.output; } } calc_network_cost(net); fprintf(stderr, "**********************cost:%f ***************", *net.cost); fprintf(stderr, "**********************backward ***************
"); network orig = net; for (int i = net.n - 1; i >= 0; --i) { layer l = net.layers[i]; if (i == 0) { //net = orig; net.input = data; net.delta = delta; } else { layer prev = net.layers[i - 1]; net.input = prev.output; net.delta = prev.delta;//ᅵЌᅵᅵᅵᅵᅵᅵ֞ᅵ랳ֵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵbackwardᅵᅵʱᅵᅵᅵᅵʵᅵǞᅵᅵᅵᅵ˵ᅵǰᅵᅵᅵᅵᅵǰᅵᅵһᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵÖ? } net.index = i; l.backward(l, net); } //forward_depthwise_convolutional_layer(l,net); }*/ void add_bias_depthwise(float *output, float *biases, int batch, int n, int size) { int i,j,b; for(b = 0; b < batch; ++b){ for(i = 0; i < n; ++i){ for(j = 0; j < size; ++j){ output[(b*n + i)*size + j] += biases[i]; } } } } void scale_bias_depthwise(float *output, float *scales, int batch, int n, int size) { int i,j,b; for(b = 0; b < batch; ++b){ for(i = 0; i < n; ++i){ for(j = 0; j < size; ++j){ output[(b*n + i)*size + j] *= scales[i]; } } } } void backward_bias_depthwise(float *bias_updates, float *delta, int batch, int n, int size) { int i,b; for(b = 0; b < batch; ++b){ for(i = 0; i < n; ++i){ bias_updates[i] += sum_array(delta+size*(i+b*n), size); } } } void forward_depthwise_convolutional_layer(depthwise_convolutional_layer l, network net) { int out_h = l.out_h; int out_w = l.out_w; int i; fill_cpu(l.outputs*l.batch, 0, l.output, 1); int k = l.size*l.size; int n = out_h*out_w; for(int b = 0; b < l.batch; ++b){ for (int c=0;c

/src/depthwise_convolutional_kernels.cu
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "depthwise_convolutional_layer.h"
#include "batchnorm_layer.h"
#include "gemm.h"
#include "blas.h"
#include "im2col.h"
#include "col2im.h"
#include "utils.h"
#include "cuda.h"
}

	__global__ void DepthwiseConv2dGPUKernelNCHW(
		const float* input,const int in_rows, const int in_cols, const int in_depth,
		const float* filter, const int filter_rows, const int filter_cols,
		const int stride,const int pad_rows,const int pad_cols,
		const int out_rows,const int out_cols,const int out_depth,
		float* output, int num_outputs) {


	int thread_id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
	if (thread_id >= num_outputs) return;


	// output 
	const int OC = thread_id % out_cols;//width
	const int OR = (thread_id / out_cols) % out_rows;//height
	const int OD = (thread_id / out_cols / out_rows) % out_depth;//channel
	const int OB = thread_id / out_cols / out_rows / out_depth;//batch size

	const int in_d = OD ;



	const int input_offset_temp = (OB * in_depth + in_d) * (in_rows * in_cols);// output channel input channel  


	const int input_row_start = OR * stride - pad_rows;
	const int input_col_start = OC * stride - pad_cols;
	const int input_row_end = input_row_start + filter_rows;
	const int input_col_end = input_col_start + filter_cols;

	float sum = 0;
	if (input_row_start >= 0 && input_col_start >= 0 &&
		input_row_end < in_rows && input_col_end < in_cols)
	{
		#pragma unroll
			for (int f_r = 0; f_r < filter_rows; ++f_r) {
				const int in_r = input_row_start + f_r;
				#pragma unroll
				for (int f_c = 0; f_c < filter_cols; ++f_c) {
					const int in_c = input_col_start + f_c;

					const int input_offset =
						(input_offset_temp)+(in_r * in_cols) + in_c;
					const int filter_offset =f_c + filter_cols * f_r +OD*filter_cols*filter_rows;
					sum += (*(input + input_offset)) * (*(filter + filter_offset));
				}
			}
		}
	else {
		#pragma unroll
		for (int f_r = 0; f_r < filter_rows; ++f_r) {
				const int in_r = input_row_start + f_r;
				#pragma unroll
				for (int f_c = 0; f_c < filter_cols; ++f_c) {
					const int in_c = input_col_start + f_c;

					if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
						const int in_c = input_col_start + f_c;

						const int input_offset =
							(input_offset_temp)+(in_r * in_cols) + in_c;

						const int filter_offset = f_c + filter_cols * f_r + OD*filter_cols*filter_rows;
						sum += (*(input + input_offset)) * (*(filter + filter_offset));
					}
				}
			}
		}

	output[thread_id] = sum;

}

__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(const float* out_backprop,
			const int stride, const int pad_rows, const int pad_cols, const int out_rows, const int out_cols, const int out_depth,
			const float* input, const int in_rows, const int in_cols, const int in_depth,
			float* filter_backprop, const int filter_rows, const int filter_cols,
			int num_out_backprop) {

	int thread_id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
	if (thread_id >= num_out_backprop) return;


	const int out_c = thread_id % out_cols;
	const int out_r = (thread_id / out_cols) % out_rows;
	const int out_d = (thread_id / out_cols / out_rows) % out_depth;

	const int b = thread_id / out_depth / out_cols / out_rows;
	const int in_d = out_d;


	const int in_r_start = out_r * stride - pad_rows;
	const int in_c_start = out_c * stride - pad_cols;
	const int in_r_end = in_r_start + filter_rows;
	const int in_c_end = in_c_start + filter_cols;

	const int out_backprop_offset = (b * out_depth * out_rows * out_cols) +
				(out_d * out_rows * out_cols) +(out_r * out_cols) + (out_c);

	const float out_bp = *(out_backprop + out_backprop_offset);
	if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&in_c_end < in_cols) {
		#pragma unroll 
		for (int f_r = 0; f_r < filter_rows; ++f_r) {
			const int in_r = in_r_start + f_r;
			const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
						(in_d * in_rows * in_cols) +(in_r * in_cols);

			#pragma unroll 
			for (int f_c = 0; f_c < filter_cols; ++f_c) {
						const int in_c = in_c_start + f_c;
						const int input_offset = input_offset_temp + in_c;
						float partial_sum = (*(input + input_offset)) * out_bp;
						float* addr = filter_backprop + f_c + filter_cols * f_r + out_d*filter_cols*filter_rows;
						atomicAdd(addr, partial_sum);
					}
				}
			}
			else {
				#pragma unroll 
				for (int f_r = 0; f_r < filter_rows; ++f_r) {
					const int in_r = in_r_start + f_r;
					const int input_offset_temp = (b * in_depth * in_rows * in_cols) +(in_d * in_rows * in_cols) +(in_r * in_cols);
					#pragma unroll 
					for (int f_c = 0; f_c < filter_cols; ++f_c) {
						const int in_c = in_c_start + f_c;
						if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
							const int input_offset = input_offset_temp + in_c;
							float partial_sum = (*(input + input_offset)) * out_bp;
							float* addr =filter_backprop + f_c + filter_cols * f_r + out_d*filter_cols*filter_rows;
							atomicAdd(addr, partial_sum);
						}
					}
				}

		}
	}



__global__ void DepthwiseConv2dBackpropInputGPUKernelNCHW(
		const float* out_backprop, const int out_rows, const int out_cols, const int out_depth,
		const float* filter, const int filter_rows, const int filter_cols,
		float* in_backprop, const int in_rows, const int in_cols, const int in_depth,
		const int stride, const int pad_rows, const int pad_cols,int num_in_backprop)
{
		int thread_id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
		if (thread_id >= num_in_backprop) return;

		const int in_c = thread_id % in_cols;
		const int in_r = (thread_id / in_cols) % in_rows;
		const int in_d = (thread_id / in_cols / in_rows) % in_depth;
		const int b = thread_id / in_depth / in_cols / in_rows;

		float sum = 0;


		const int out_r_start =max(0, (in_r - filter_rows + pad_rows + stride) / stride);
		const int out_r_end = min(out_rows - 1, (in_r + pad_rows) / stride);
		const int out_c_start =
			max(0, (in_c - filter_cols + pad_cols + stride) / stride);
		const int out_c_end = min(out_cols - 1, (in_c + pad_cols) / stride);


		#pragma unroll 
			for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
				const int f_r = in_r + pad_rows - out_r * stride;


				for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
					const int f_c = in_c + pad_cols - out_c * stride;
					const int filter_offset = f_c + filter_cols * f_r + in_d *filter_cols*filter_rows;

					const int out_backprop_offset =
						(b * out_depth * out_rows * out_cols) +
						(in_d * out_rows * out_cols) + (out_r * out_cols) + (out_c);

					sum += (*(out_backprop + out_backprop_offset)) *
						(*(filter + filter_offset));
				}
			}
		in_backprop[thread_id] = sum;

}


void forward_depthwise_convolutional_layer_gpu(depthwise_convolutional_layer l, network net)
{
	//cuda_pull_array(l.output_gpu, l.output, l.c*l.out_h*l.out_w);//add by hjimce for debug
    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);


	int size = l.out_h*l.out_w*l.batch*l.n;
	DepthwiseConv2dGPUKernelNCHW << > >(
		net.input_gpu,l.h,l.w,l.c,
		l.weights_gpu, l.size, l.size,
		l.stride, l.pad, l.pad,
		l.out_h, l.out_w, l.n,
		l.output_gpu, size
		);
	check_error(cudaPeekAtLastError());
   /* int i;
    int k = l.size*l.size;
    int n = l.out_w*l.out_h;

	for (int b = 0; b < l.batch; ++b) {
		for (int c = 0; c> > (
		l.delta_gpu, l.stride, l.pad, l.pad, l.out_h, l.out_w, l.c,
		net.input_gpu, l.h, l.w, l.n,
		l.weight_updates_gpu, l.size, l.size,
		out_size);
	if (net.delta_gpu)// 
	{
		int in_size = l.h*l.w*l.batch*l.n;
		DepthwiseConv2dBackpropInputGPUKernelNCHW << > > (
			l.delta_gpu, l.out_h, l.out_w, l.c,
			l.weights_gpu, l.size, l.size,
			net.delta_gpu, l.h, l.w, l.c,
			l.stride, l.pad, l.pad, in_size);

	}
	//cuda_pull_array(net.delta_gpu, net.delta, l.batch*l.c*l.h*l.w);
	//pull_depthwise_convolutional_layer(l);//add by hjimce for debug

//#endif
}
//������������
void pull_depthwise_convolutional_layer(depthwise_convolutional_layer layer)
{
    cuda_pull_array(layer.weights_gpu, layer.weights, layer.n*layer.size*layer.size);
    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.n*layer.size*layer.size);
    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
    if (layer.batch_normalize){
        cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
        cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
        cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
    }
}
//������������
void push_depthwise_convolutional_layer(depthwise_convolutional_layer layer)
{
    cuda_push_array(layer.weights_gpu, layer.weights, layer.n*layer.size*layer.size);
    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.n*layer.size*layer.size);
    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
    if (layer.batch_normalize){
        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
    }
}

void update_depthwise_convolutional_layer_gpu(layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    int size = l.size*l.size*l.c;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        }

    }else{
        axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(size, momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
        }


    }

}


5 Makefileで、次のコードを追加します。


主に元のMakefileにDW CONV関連のコンパイルを追加する.
++ OBJ= depthwise_convolutional_layer.o
ifeq ($(GPU), 1) 
LDFLAGS+= -lstdc++ 
++ OBJ+=depthwise_convolutional_kernels.o 
endif

6再コンパイル;


& make  -j8

7プロファイル;


例:mobilenetV 2を実装するプロファイルmobilenet_v2_imagenet.cfg
[net]
# Testing
batch=1
subdivisions=1
# Training
#batch=128
#subdivisions=2
height=224
width=224
max_crop=320
channels=3
momentum=0.9
decay=0.0005

hue=.1
saturation=.75
exposure=.75

learning_rate=0.1
policy=poly
power=4
max_batches=1600000

#learning_rate=0.045
#max_batches = 800000
#policy=steps
#steps=80000,180000,330000,480000
#scales=.1,.1,.1,.1

[convolutional]
filters=32
size=3
stride=2
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=32
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=32
size=3
stride=1
#groups=32
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=16
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=96
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=96
size=3
stride=2
#groups=96
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=24
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=144
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=144
size=3
stride=1
#groups=144
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=24
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=144
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=144
size=3
stride=2
#groups=144
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=32
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=192
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=192
size=3
stride=1
#groups=192
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=32
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=192
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=192
size=3
stride=1
#groups=192
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=32
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=192
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=192
size=3
stride=1
#groups=192
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=64
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=384
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=384
size=3
stride=1
#groups=384
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=64
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=384
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=384
size=3
stride=1
#groups=384
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=64
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=384
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=384
size=3
stride=1
#groups=384
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=64
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=384
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=384
size=3
stride=2
#groups=384
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=96
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=576
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=576
size=3
stride=1
#groups=576
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=96
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=576
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=576
size=3
stride=1
#groups=576
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=96
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=576
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=576
size=3
stride=2
#groups=576
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=160
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=960
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=960
size=3
stride=1
#groups=960
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=160
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=960
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=960
size=3
stride=1
#groups=960
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=160
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[shortcut]
from=-4
activation=linear

[convolutional]
filters=960
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[depthwise_convolutional]
#filters=960
size=3
stride=1
#groups=960
pad=1
batch_normalize=1
activation=leaky

[convolutional]
filters=320
size=1
stride=1
pad=1
batch_normalize=1
activation=linear

[convolutional]
filters=1280
size=1
stride=1
pad=1
batch_normalize=1
activation=leaky

[avgpool]

[convolutional]
filters=1000
size=1
stride=1
pad=1
activation=linear

[softmax]
groups=1