Reading and Writing Video Files with the FFmpeg API


Introduction

Until now I had been handling video data with OpenCV's VideoCapture and VideoWriter, but I figured the FFmpeg API would give me more flexibility, so this time I tried writing the code with it.
(A Google search turns up plenty of code built around avcodec_decode_video2() and avpicture_fill(), but those functions are now deprecated, so I wrote something that follows the current API.)
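For reference, the deprecated one-shot decode call was replaced by an asymmetric send/receive pair, and that is the pattern the code below follows. A minimal skeleton of the idea (dec_ctx, pkt and frame are placeholder names, not part of the listing):

// Feed one packet to the decoder, then drain every frame it can produce.
int res = avcodec_send_packet(dec_ctx, pkt);
while (res >= 0) {
    res = avcodec_receive_frame(dec_ctx, frame);
    if (res == AVERROR(EAGAIN) || res == AVERROR_EOF)
        break;  // decoder needs more input, or has been fully drained
    // ... use frame ...
    av_frame_unref(frame);
}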

The code

So, here is some experimental code that reads a video file (an mp4 here) and writes out a negative-inverted version of it. The processing of the raw decoded data itself may well be better done in OpenCV; see the sketch after the listing.
(Update 2021/09/14: changed the timestamp-related parts)
(Update 2021/09/17: fixed the memory-release-related parts)
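As a side note, the program should build with something along the lines of g++ main.cpp $(pkg-config --cflags --libs libavformat libavcodec libavutil libswscale libswresample), though the exact package names depend on how FFmpeg was installed.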

#include <iostream>
extern "C"{
    #include <libavformat/avformat.h>
    #include <libavcodec/avcodec.h>
    #include <libavutil/imgutils.h>
    #include <libavutil/opt.h>
    #include <libswscale/swscale.h>
    #include <libswresample/swresample.h>
}

void decode_and_encode_video(const char *output, const char *input);

int main(int argc, char *argv[]){
    if (argc < 3){
        std::cout << "Usage: " << argv[0] << " <input> <output>" << std::endl;
        return 1;
    }
    const char *input = argv[1];
    const char *output = argv[2];
    decode_and_encode_video(output, input);
    return 0;
}

void decode_and_encode_video(const char *output, const char *input){
    AVFormatContext *inputFmtContxt = NULL;
    AVFormatContext *outputFmtContxt = NULL;
    const AVCodec *encoder = NULL;
    const AVCodec *decoder = NULL;
    AVCodecContext *encoderContxt = NULL;
    AVCodecContext *decoderContxt = NULL;
    int ret = 0, video_stream_index = 0;
    ret = avformat_open_input(&inputFmtContxt, input, NULL, NULL);
    if (ret < 0){
        std::cout << "Could not open input video" << std::endl;
        return;
    }
    ret = avformat_find_stream_info(inputFmtContxt, NULL);
    if (ret < 0){
        std::cout << "Could not find the stream info" << std::endl;
        return;
    }
    const AVOutputFormat *outFmt = av_guess_format("mp4", NULL, NULL);
    avformat_alloc_output_context2(&outputFmtContxt, outFmt, NULL, NULL);
    //Set up the decoder and the encoder
    for (int i=0; i<(int)inputFmtContxt->nb_streams; ++i){
        AVStream *in_stream = inputFmtContxt->streams[i];
        AVCodecParameters *in_par = in_stream->codecpar;
        AVStream *out_stream = avformat_new_stream(outputFmtContxt, NULL);
        if (in_par->codec_type == AVMEDIA_TYPE_VIDEO){
            video_stream_index = i;
            decoder = avcodec_find_decoder(in_par->codec_id);
            decoderContxt = avcodec_alloc_context3(decoder);
            avcodec_parameters_to_context(decoderContxt, in_par);
            decoderContxt->framerate = in_stream->r_frame_rate;
            decoderContxt->time_base = in_stream->time_base;
            avcodec_open2(decoderContxt, decoder, NULL);
            encoder = avcodec_find_encoder(in_par->codec_id);
            encoderContxt = avcodec_alloc_context3(encoder);
            encoderContxt->height = decoderContxt->height;
            encoderContxt->width = decoderContxt->width;
            encoderContxt->pix_fmt = decoderContxt->pix_fmt;
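            //Rate control and GOP structure: quantizer range, keyframe interval, B-frame count and reference frames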
            encoderContxt->qmax = 31;
            encoderContxt->qmin = 2;
            encoderContxt->qcompress = 0.6;
            encoderContxt->max_qdiff = 4;
            encoderContxt->gop_size = 250;
            encoderContxt->keyint_min = 25;
            encoderContxt->max_b_frames = 16;
            encoderContxt->refs = 6;
            encoderContxt->framerate = in_stream->r_frame_rate;
            encoderContxt->time_base = in_stream->time_base;
            encoderContxt->bit_rate = decoderContxt->bit_rate;
            if (outputFmtContxt->oformat->flags & AVFMT_GLOBALHEADER){
                encoderContxt->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
            }
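            //"preset" and "tune" are libx264 private options; av_opt_set() simply returns an error if the chosen encoder does not know them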
            av_opt_set(encoderContxt->priv_data, "preset", "fast", 0);
            av_opt_set(encoderContxt->priv_data, "tune", "zerolatency", 0);
            avcodec_open2(encoderContxt, encoder, NULL);
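            //The muxer may adjust this time_base in avformat_write_header(), so packets are rescaled to output_stream->time_base again at write time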
            out_stream->time_base = encoderContxt->time_base;
            avcodec_parameters_from_context(out_stream->codecpar, encoderContxt);
        }
        else{
            ret = avcodec_parameters_copy(out_stream->codecpar, in_par);
        }
    }
    //Prepare the output file
    av_dump_format(outputFmtContxt, 0, output, 1);
    ret = avio_open(&outputFmtContxt->pb, output, AVIO_FLAG_WRITE);
    if (ret < 0){
        std::cout << "Could not open output file" << std::endl;
        return;
    }
    ret = avformat_write_header(outputFmtContxt, NULL);
    //For converting between YUV and RGB
    enum AVPixelFormat bgr_pix_fmt = AV_PIX_FMT_BGR24;
    int HEIGHT = decoderContxt->height;
    int WIDTH = decoderContxt->width;
    SwsContext *yuv2bgr = sws_getContext(WIDTH, HEIGHT, decoderContxt->pix_fmt, 
                                          WIDTH, HEIGHT, bgr_pix_fmt, SWS_BICUBIC, NULL, NULL, NULL);
    SwsContext *bgr2yuv = sws_getContext(WIDTH, HEIGHT, bgr_pix_fmt,
                                          WIDTH, HEIGHT, encoderContxt->pix_fmt, SWS_BICUBIC, NULL, NULL, NULL);
    //Prepare the packets and frames
    int res = 0;
    AVPacket *packet = av_packet_alloc();
    AVPacket *out_packet = av_packet_alloc();
    out_packet->data = NULL;
    out_packet->size = 0;
    // Frame received from the decoder
    AVFrame *frame = av_frame_alloc();
    // Destination frame for the BGR conversion
    AVFrame *bgrframe = av_frame_alloc();
    bgrframe->width = decoderContxt->width;
    bgrframe->height = decoderContxt->height;
    bgrframe->format = bgr_pix_fmt;
    ret = av_frame_get_buffer(bgrframe, 0);  //allocates the data planes for us
    // Frame handed to the encoder
    AVFrame *outframe = av_frame_alloc();
    outframe->width = decoderContxt->width;
    outframe->height = decoderContxt->height;
    outframe->format = decoderContxt->pix_fmt;
    ret = av_frame_get_buffer(outframe, 0);
    //Start decoding and encoding
    while (true){
        ret = av_read_frame(inputFmtContxt, packet);
        if (ret < 0){
            break;
        }
        AVStream *input_stream = inputFmtContxt->streams[packet->stream_index];
        AVStream *output_stream = outputFmtContxt->streams[packet->stream_index];
        if (packet->stream_index == video_stream_index){
            res = avcodec_send_packet(decoderContxt, packet);
            while (res >= 0){
                res = avcodec_receive_frame(decoderContxt, frame);
                if (res == AVERROR(EAGAIN) || res == AVERROR_EOF){
                    break;
                }
                if (res >= 0){
                    outframe->pict_type = frame->pict_type;
                    outframe->pts = frame->pts;
                    outframe->pkt_dts = frame->pkt_dts;
                    outframe->pkt_duration = frame->pkt_duration;
                    ret = av_frame_make_writable(outframe);
                    //Convert the YUV frame to a BGR frame
                    sws_scale(yuv2bgr, frame->data, frame->linesize, 0, frame->height, bgrframe->data, bgrframe->linesize);
                    //Negative inversion (linesize[0] includes padding, but inverting the padding bytes is harmless)
                    int h = bgrframe->height;
                    int l = bgrframe->linesize[0];
                    for (int i=0; i<h; ++i){
                        for (int j=0; j<l; ++j){
                            bgrframe->data[0][i * l + j] = 255 - bgrframe->data[0][i * l + j];
                        }
                    }
                    //Convert back from BGR to YUV
                    sws_scale(bgr2yuv, bgrframe->data, bgrframe->linesize, 0, h, outframe->data, outframe->linesize);
                    res = avcodec_send_frame(encoderContxt, outframe);
                    //Write out the encoded packets
                    while (res >= 0){
                        res = avcodec_receive_packet(encoderContxt, out_packet);
                        if (res == AVERROR(EAGAIN) || res == AVERROR_EOF){
                            break;
                        }
                        //The encoder stamps out_packet in encoderContxt->time_base (the input
                        //stream's time_base here), so rescale pts/dts/duration to the output
                        //stream's time_base; this also keeps dts valid with B-frame reordering
                        av_packet_rescale_ts(out_packet, encoderContxt->time_base, output_stream->time_base);
                        out_packet->stream_index = packet->stream_index;
                        res = av_interleaved_write_frame(outputFmtContxt, out_packet);
                    }
                    av_packet_unref(out_packet);
                }
                av_frame_unref(frame);
            }
            av_packet_unref(packet);
        }
        else{
            //音声データはそのまま書き込み
            packet->pts = av_rescale_q_rnd(packet->pts, input_stream->time_base, output_stream->time_base, AV_ROUND_NEAR_INF);
            packet->dts = av_rescale_q_rnd(packet->dts, input_stream->time_base, output_stream->time_base, AV_ROUND_NEAR_INF);
            packet->duration = av_rescale_q(packet->duration, input_stream->time_base, output_stream->time_base);
            res = av_interleaved_write_frame(outputFmtContxt, packet);
            av_packet_unref(packet);
        }
    }
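    //Flush the encoder by sending NULL and draining what is still buffered;
    //with max_b_frames = 16 it holds on to quite a few frames, so skipping
    //this truncates the end of the output. (The decoder could be drained the
    //same way via avcodec_send_packet(decoderContxt, NULL) and encoding the
    //remaining frames; that path is omitted here to keep the listing short.)
    res = avcodec_send_frame(encoderContxt, NULL);
    while (res >= 0){
        res = avcodec_receive_packet(encoderContxt, out_packet);
        if (res == AVERROR(EAGAIN) || res == AVERROR_EOF){
            break;
        }
        av_packet_rescale_ts(out_packet, encoderContxt->time_base, outputFmtContxt->streams[video_stream_index]->time_base);
        out_packet->stream_index = video_stream_index;
        res = av_interleaved_write_frame(outputFmtContxt, out_packet);
    }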
    //Release everything
    av_write_trailer(outputFmtContxt);
    av_packet_free(&packet);
    av_packet_free(&out_packet);
    av_frame_free(&frame);
    av_frame_free(&bgrframe);
    av_frame_free(&outframe);
    avcodec_free_context(&decoderContxt);
    avcodec_free_context(&encoderContxt);
    avformat_close_input(&inputFmtContxt);
    avio_closep(&outputFmtContxt->pb);
    avformat_free_context(outputFmtContxt);
    sws_freeContext(yuv2bgr);
    sws_freeContext(bgr2yuv);
}
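
As mentioned above, once a frame has been converted to BGR24 the per-pixel work can just as well be handed to OpenCV. A minimal sketch of that idea, assuming OpenCV is installed (invert_with_opencv is a hypothetical helper, not part of the listing above):

#include <opencv2/core.hpp>

// Wrap the BGR frame's data plane in a cv::Mat header (no copy is made,
// the Mat shares memory with the AVFrame) and invert it in place.
void invert_with_opencv(AVFrame *bgrframe){
    cv::Mat img(bgrframe->height, bgrframe->width, CV_8UC3,
                bgrframe->data[0], bgrframe->linesize[0]);
    cv::bitwise_not(img, img);  // same effect as the manual byte loop
}

Calling this in place of the inversion loop should produce the same output, and from there any other OpenCV processing of the decoded frames is a small step.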