urbansound8K的音频数据转换为npy文件格式python

转载

锦绣前程未央 2024-10-02 08:28:57

文章标签 音视频 ffmpeg #include 数据编码器 文章分类 Python 后端开发

本章节介绍如何使用FFmpeg将yuv文件或pcm文件分别编码成h264和aac文件。以下代码默认将输入的pcm文件（f32le.pcm）编码成aac文件（out.aac）；如果需要编码yuv文件，需要修改输入文件和输出文件，同时将enum AVCodecID codec_id设为AV_CODEC_ID_H264。

完整代码

在运行目录准备好yuv文件和pcm文件，编译运行后会在输出目录生成编码后的h264或aac文件。

#ifdef __cplusplus
extern "C"
{
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libavcodec/avcodec.h>
#include <libavutil/channel_layout.h>
#include <libavutil/common.h>
#include <libavutil/frame.h>
#include <libavutil/samplefmt.h>
#include <libavutil/time.h>
#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#ifdef __cplusplus
}
#endif

/*
 * Build the fixed 7-byte ADTS header that precedes one raw AAC frame.
 *
 * ctx         - encoder context; sample_rate, profile and channels are read.
 * adts_header - output buffer, must have room for at least 7 bytes.
 * aac_length  - size in bytes of the raw AAC payload that follows the header.
 *
 * The sample rate is mapped to the ADTS sampling-frequency index; a rate that
 * is not in the table falls back to index 4 (44100 Hz), as before.
 */
static void get_adts_header(AVCodecContext *ctx, uint8_t *adts_header, int aac_length)
{
    /* Position in this table == ADTS sampling_frequency_index. */
    static const int adts_sample_rates[] = {
        96000, 88200, 64000, 48000, 44100, 32000, 24000,
        22050, 16000, 12000, 11025, 8000,  7350
    };
    const int rate_count = (int)(sizeof adts_sample_rates / sizeof adts_sample_rates[0]);
    uint8_t freq_idx = 4; /* fallback: 44100 Hz */
    /* The length field counts the header itself plus the payload. */
    uint32_t frame_length = (uint32_t)aac_length + 7;
    /* Mask before shifting so out-of-range or negative values cannot
     * spill into neighbouring header fields (or shift a negative int). */
    uint32_t profile  = (uint32_t)ctx->profile & 0x3;
    uint32_t channels = (uint32_t)ctx->channels & 0x7;

    for (int i = 0; i < rate_count; i++) {
        if (adts_sample_rates[i] == ctx->sample_rate) {
            freq_idx = (uint8_t)i;
            break;
        }
    }

    adts_header[0] = 0xFF;                       /* syncword, high 8 bits */
    adts_header[1] = 0xF1;                       /* syncword low 4 bits, MPEG-4, layer 0, no CRC */
    adts_header[2] = (uint8_t)((profile << 6) + ((uint32_t)freq_idx << 2) + (channels >> 2));
    adts_header[3] = (uint8_t)(((channels & 3) << 6) + ((frame_length >> 11) & 0x3));
    adts_header[4] = (uint8_t)((frame_length & 0x7FF) >> 3);
    adts_header[5] = (uint8_t)(((frame_length & 7) << 5) + 0x1F); /* + buffer fullness high bits */
    adts_header[6] = 0xFC;                       /* buffer fullness low bits, one raw data block */
}

/*
 * Submit one frame (audio or video) to the encoder and write every packet it
 * produces to out_file. Passing frame == NULL is how the caller flushes.
 *
 * Returns 0 once the encoder reports AVERROR(EAGAIN) or AVERROR_EOF (nothing
 * more to read for now), and -1 on a send, receive, or file-write failure.
 */
static int encode(AVCodecContext *ctx, AVFrame *frame, AVPacket *pkt, FILE *out_file)
{
    /* One send may yield several packets, so receive in a loop afterwards. */
    if (avcodec_send_frame(ctx, frame) < 0) {
        return -1;
    }

    for (;;) {
        const int status = avcodec_receive_packet(ctx, pkt);

        if (status == AVERROR(EAGAIN) || status == AVERROR_EOF) {
            return 0;   /* encoder drained for this input */
        }
        if (status < 0) {
            return -1;  /* encoding error */
        }

        /* This program uses the GLOBAL_HEADER flag as its marker for the
         * audio path: each such packet is prefixed with an ADTS header. */
        if ((ctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
            uint8_t adts[7];

            get_adts_header(ctx, adts, pkt->size);
            if (fwrite(adts, 1, sizeof adts, out_file) != sizeof adts) {
                return -1;  /* header write failed */
            }
        }

        const size_t payload_size = (size_t)pkt->size;
        if (fwrite(pkt->data, 1, payload_size, out_file) != payload_size) {
            return -1;      /* payload write failed */
        }
    }
}

/*
 * De-interleave packed stereo float samples (L R L R ...) into planar layout.
 *
 * f32le      - input, 2 * nb_samples interleaved floats.
 * fltp       - output, 2 * nb_samples floats: the first nb_samples hold the
 *              left channel, the next nb_samples hold the right channel.
 * nb_samples - number of samples per channel.
 *
 * Only two-channel input is supported.
 */
static void f32le_convert_to_fltp(float *f32le, float *fltp, int nb_samples)
{
    const float *src   = f32le;
    float       *left  = fltp;
    float       *right = fltp + nb_samples;

    for (int remaining = nb_samples; remaining > 0; remaining--) {
        *left++  = *src++;
        *right++ = *src++;
    }
}

/*
 * Encode a raw input file with libavcodec and write the elementary stream.
 *
 * As configured, reads packed stereo float PCM from "f32le.pcm" and writes
 * ADTS-framed AAC to "out.aac". Setting codec_id to AV_CODEC_ID_H264 selects
 * the video branch instead (the input/output file names must then be changed
 * by hand to a yuv420p source and an h264 destination).
 *
 * Returns 0 on success and 1 on any setup, read, encode, or write failure.
 * All resources are released on every path through the single cleanup label.
 */
int main()
{
    const char     *in_file           = "f32le.pcm";     // input file
    const char     *out_file          = "out.aac";       // output file
    FILE           *infile            = NULL;
    FILE           *outfile           = NULL;
    AVFrame        *frame             = NULL;            // holds one raw frame before encoding
    AVPacket       *pkt               = NULL;            // holds one encoded packet
    enum AVCodecID  codec_id          = AV_CODEC_ID_AAC; // encoder to use
    const AVCodec  *codec             = NULL;
    AVCodecContext *codec_ctx         = NULL;
    uint8_t        *frame_buf         = NULL;            // raw bytes read from the input file
    uint8_t        *pcm_temp_buf      = NULL;            // planar copy of one audio frame
    int             frame_bytes       = 0;               // bytes of input consumed per frame
    int64_t         pts               = 0;               // timestamp of the next frame, in time_base units
    int64_t         pts_step          = 0;               // pts increment per frame
    int64_t         all_begin_time    = 0;
    int             failed            = 0;
    int             pause_before_exit = 0;
    int             exit_code         = 1;

    // Open the input first so a missing input does not create an empty output.
    infile = fopen(in_file, "rb");
    if (!infile) {
        fprintf(stderr, "cannot open input file %s\n", in_file);
        goto cleanup;
    }
    outfile = fopen(out_file, "wb");
    if (!outfile) {
        fprintf(stderr, "cannot open output file %s\n", out_file);
        goto cleanup;
    }

    frame = av_frame_alloc();
    pkt   = av_packet_alloc();
    if (!frame || !pkt) {
        fprintf(stderr, "cannot allocate frame/packet\n");
        goto cleanup;
    }

    codec = avcodec_find_encoder(codec_id);
    if (!codec) {
        fprintf(stderr, "encoder not found\n");
        goto cleanup;
    }
    codec_ctx = avcodec_alloc_context3(codec);
    if (!codec_ctx) {
        fprintf(stderr, "cannot allocate encoder context\n");
        goto cleanup;
    }

    if (codec->id == AV_CODEC_ID_H264) { // video settings
        codec_ctx->width        = 1280;  // resolution
        codec_ctx->height       = 720;
        codec_ctx->time_base    = (AVRational){1, 25}; // one tick per frame at 25 fps
        codec_ctx->framerate    = (AVRational){25, 1};
        codec_ctx->gop_size     = 25;   // I-frame interval
        codec_ctx->max_b_frames = 2;    // set to 0 to disable B-frames
        codec_ctx->pix_fmt      = AV_PIX_FMT_YUV420P;
        codec_ctx->bit_rate     = 3000000;
        av_opt_set(codec_ctx->priv_data, "preset", "medium", 0);
        av_opt_set(codec_ctx->priv_data, "profile", "main", 0);
        av_opt_set(codec_ctx->priv_data, "tune","zerolatency",0); // intended for live streaming
    } else { // audio settings
        // NOTE(review): channel_layout/channels and
        // av_get_channel_layout_nb_channels() are the legacy channel API;
        // confirm the targeted FFmpeg version still provides them.
        codec_ctx->codec_id       = codec_id;
        codec_ctx->bit_rate       = 128*1024;
        codec_ctx->channel_layout = AV_CH_LAYOUT_STEREO;
        codec_ctx->sample_rate    = 48000;
        codec_ctx->channels       = av_get_channel_layout_nb_channels(codec_ctx->channel_layout);
        codec_ctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
        codec_ctx->codec_type     = AVMEDIA_TYPE_AUDIO;
        codec_ctx->profile        = FF_PROFILE_AAC_LOW;
        // OR the flag in rather than overwriting any flags already set.
        // encode() keys its ADTS-header writing off this flag.
        codec_ctx->flags         |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    if (avcodec_open2(codec_ctx, codec, NULL) < 0) {
        fprintf(stderr, "cannot open encoder\n");
        goto cleanup;
    }
    printf("thread_count: %d, thread_type:%d\n", codec_ctx->thread_count, codec_ctx->thread_type);
    printf("frame_size:%d\n\n", codec_ctx->frame_size); // only meaningful after avcodec_open2

    if (codec->id == AV_CODEC_ID_H264) { // video frame layout
        frame->format = codec_ctx->pix_fmt;
        frame->width  = codec_ctx->width;
        frame->height = codec_ctx->height;
        frame_bytes   = av_image_get_buffer_size(frame->format, frame->width, frame->height, 1);
        pts_step      = 1; // time_base is 1/25, so each frame advances pts by one tick
    } else { // audio frame layout
        frame->nb_samples     = codec_ctx->frame_size;     // samples per channel per frame
        frame->format         = codec_ctx->sample_fmt;
        frame->channel_layout = codec_ctx->channel_layout;
        frame->channels       = av_get_channel_layout_nb_channels(frame->channel_layout);
        // bytes per sample * channels * samples per channel
        frame_bytes = av_get_bytes_per_sample(frame->format) * frame->channels * frame->nb_samples;
        pts_step    = frame->nb_samples; // pts counts samples
    }

    if (av_frame_get_buffer(frame, 0) < 0) {
        fprintf(stderr, "cannot allocate frame buffers\n");
        goto cleanup;
    }

    printf("frame_bytes %d\n", frame_bytes);
    if (frame_bytes <= 0) { // error code or empty frame: nothing sensible to allocate
        fprintf(stderr, "invalid frame size\n");
        goto cleanup;
    }
    frame_buf    = malloc((size_t)frame_bytes);
    pcm_temp_buf = malloc((size_t)frame_bytes);
    if (!frame_buf || !pcm_temp_buf) {
        fprintf(stderr, "cannot allocate input buffers\n");
        goto cleanup;
    }

    all_begin_time = av_gettime_relative() / 1000; // start time in ms
    printf("============================== Start Encode ==============================\n");
    for (;;) {
        // Zero first so a short final read is padded with silence / black.
        memset(frame_buf, 0, (size_t)frame_bytes);
        size_t read_bytes = fread(frame_buf, 1, (size_t)frame_bytes, infile);
        if (read_bytes == 0) {
            if (ferror(infile)) {
                failed = 1; // read error, not end of file
            }
            break;
        }

        // Make sure the encoder is not still holding a reference to the frame data.
        if (av_frame_make_writable(frame) != 0) {
            failed = 1;
            break;
        }

        // NOTE(review): the fill_arrays calls below repoint frame->data at
        // frame_buf / pcm_temp_buf instead of copying into the buffers from
        // av_frame_get_buffer(); this relies on the encoder consuming the data
        // before the next iteration overwrites it — confirm for other encoders.
        if (codec->id == AV_CODEC_ID_H264) { // video
            av_image_fill_arrays(frame->data, frame->linesize, frame_buf,
                                 frame->format, frame->width, frame->height, 1);
        } else { // audio
            memset(pcm_temp_buf, 0, (size_t)frame_bytes);
            // packed f32le -> planar float, as required by AV_SAMPLE_FMT_FLTP
            f32le_convert_to_fltp((float *)frame_buf, (float *)pcm_temp_buf, frame->nb_samples);
            av_samples_fill_arrays(frame->data, frame->linesize,
                    pcm_temp_buf, frame->channels, frame->nb_samples, frame->format, 0);
        }

        frame->pts = pts;  // first frame starts at 0
        pts += pts_step;
        if (encode(codec_ctx, frame, pkt, outfile) < 0) {
            failed = 1;
            break;
        }
    }

    // Flush: the encoder may still hold buffered packets.
    if (encode(codec_ctx, NULL, pkt, outfile) < 0) {
        failed = 1;
    }
    printf("============================== End of Encode ==============================\n");
    printf("Encode time: %4.4fs\n", ((av_gettime_relative() / 1000) - all_begin_time) / 1000.0);

    exit_code         = failed ? 1 : 0;
    pause_before_exit = 1; // keep the original "press a key" pause on the normal path

cleanup:
    free(frame_buf);
    free(pcm_temp_buf);
    if (infile) {
        fclose(infile);
    }
    if (outfile && fclose(outfile) != 0) {
        exit_code = 1; // buffered output could not be flushed to disk
    }
    av_frame_free(&frame);
    av_packet_free(&pkt);
    avcodec_free_context(&codec_ctx);
    if (pause_before_exit) {
        getchar();
    }
    return exit_code;
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。