IOS微软语音转文本，lame压缩音频_业界新闻

发布时间:2024-07-29 14:01

阅读量:0

在IOS开发中，用微软进行语音转文本操作，并将录音文件压缩后返回

项目中遇到了利用微软SDK进行实时录音转文本操作，如果操作失败，那么就利用原始音频文件通过网络请求操作，最终这份文件上传到阿里云保存，考虑到传输速率，对文件压缩成mp3再上传

遇到的难点

微软的示例中只能转文本，微软并不保存这份音频文件，需要自己实现从录音到推流，到获取结果
项目是uniapp项目，非原生工程项目，录音管理器需要激活后才能使用
关于压缩代码，采用Lame库压缩，网上大部分都是通过文件提取压缩再保存，直接录制音频压缩较少，记录下来以便后续使用

流程图

实现步骤

录音的实现

// 每个缓冲区的大小 #define kBufferSize 2048 // 缓冲区数量 #define kNumberBuffers 3  // 定义结构体，里面保存录音队列ID，录音格式 typedef struct {     AudioStreamBasicDescription dataFormat;     AudioQueueRef               queue;     AudioQueueBufferRef         buffers[kNumberBuffers];     UInt32                      bufferByteSize;     __unsafe_unretained id      selfRef; } AQRecorderState;  AQRecorderState recorderState = {0};  - (instancetype)init {     self = [super init];     if (self) {         // 设置音频格式         recorderState.dataFormat.mFormatID = kAudioFormatLinearPCM;         recorderState.dataFormat.mSampleRate = 16000.0;         recorderState.dataFormat.mChannelsPerFrame = 1;         recorderState.dataFormat.mBitsPerChannel = 16;         recorderState.dataFormat.mBytesPerPacket = recorderState.dataFormat.mBytesPerFrame = recorderState.dataFormat.mChannelsPerFrame * sizeof(SInt16);         recorderState.dataFormat.mFramesPerPacket = 1;         recorderState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;     }     return self; }  - (void)configureAudioSession {     AVAudioSession *session = [AVAudioSession sharedInstance];     NSError *error = nil;          // 设置音频会话类别和模式     [session setCategory:AVAudioSessionCategoryPlayAndRecord  error:&error];          if (error) {         NSLog(@"Error setting category: %@", error.localizedDescription);     }          // 激活音频会话     [session setActive:YES error:&error];     if (error) {         NSLog(@"Error activating session: %@", error.localizedDescription);     } } // 开始录音 - (void)startRecording{ 	// 激活录音文件 	 [self configureAudioSession];     // 创建录音队列     AudioQueueNewInput(&recorderState.dataFormat, HandleInputBuffer, &recorderState, NULL, kCFRunLoopCommonModes, 0, &recorderState.queue);     // 设置录音增益     AudioQueueSetParameter(recorderState.queue, kAudioQueueParam_Volume, 1.0);     // 计算缓冲区大小     DeriveBufferSize(recorderState.queue, &recorderState.dataFormat, 0.5, &recorderState.bufferByteSize);          // 分配和分配缓冲区     for (int i = 0; i < kNumberBuffers; i++) {         AudioQueueAllocateBuffer(recorderState.queue, recorderState.bufferByteSize, &recorderState.buffers[i]);         AudioQueueEnqueueBuffer(recorderState.queue, recorderState.buffers[i], 0, NULL);     } 	OSStatus status = AudioQueueStart(recorderState.queue, NULL);     if (status != noErr) {         NSLog(@"AudioQueueNewInput failed with error: %d", (int)status);     } } // 结束录音 - (void)stopRecording{     // 停止录音     AudioQueueStop(recorderState.queue, true);     AudioQueueDispose(recorderState.queue, true); }; // 计算缓冲区大小 void DeriveBufferSize(AudioQueueRef audioQueue, AudioStreamBasicDescription *ASBDesc, Float64 seconds, UInt32 *outBufferSize) {          static const int maxBufferSize = 0x50000; // 限制缓冲区的最大值     int maxPacketSize = ASBDesc->mBytesPerPacket;     if (maxPacketSize == 0) {         UInt32 maxVBRPacketSize = sizeof(maxPacketSize);         AudioQueueGetProperty(audioQueue, kAudioQueueProperty_MaximumOutputPacketSize, &maxPacketSize, &maxVBRPacketSize);     }     Float64 numBytesForTime = ASBDesc->mSampleRate * maxPacketSize * seconds;     *outBufferSize = (UInt32)(numBytesForTime < maxBufferSize ? numBytesForTime : maxBufferSize); } // 数据处理回调函数 这个里面有个录音回掉的PCM数据 void HandleInputBuffer(void *aqData, AudioQueueRef inAQ, AudioQueueBufferRef inBuffer, const AudioTimeStamp *inStartTime, UInt32 inNumPackets, const AudioStreamPacketDescription *inPacketDesc) {     AQRecorderState *pAqData = (AQRecorderState *)aqData;          // 如果有数据，就处理     if (inNumPackets > 0) {         // 创建NSData对象         NSData *audioData = [NSData dataWithBytes:inBuffer->mAudioData length:inBuffer->mAudioDataByteSize];         // 打印NSData对象内容         NSLog(@"Audio Data: %@", audioData);         // 这儿将进行保存文件         // 编码文件         // 推送数据到微软SDK     }          // 将缓冲区重新加入到队列中     AudioQueueEnqueueBuffer(pAqData->queue, inBuffer, 0, NULL); }

微软SDK初始化和推流

这个大部分和微软示例差不多，需要注意的是获取微软示例时候传入的是自定义的录音设置，并将自定义录音设置保存起来，在录音回掉中将数据推入流中

- (void)setUpKey:(NSString *)token service:(NSString *)service lang:(NSString *) lang {  	// 这里将通过token和区域初始化配置类，微软还有其他获取配置类的方法，其他方法示例化也可以     SPXSpeechConfiguration *speechConfig = nil;             speechConfig = [[SPXSpeechConfiguration alloc] initWithAuthorizationToken:token region:service];             // 这个是通过token实例化配置类的方式 //    speechConfig = [[SPXSpeechConfiguration alloc] initWithSubscription:token region:service]; 	// 设置语言 en-US格式     [speechConfig setSpeechRecognitionLanguage:lang];     // 设置微软接收到的数据的格式 16000HZ 16位深 单通道     SPXAudioStreamFormat *audioFormat = [[SPXAudioStreamFormat alloc] initUsingPCMWithSampleRate: 16000 bitsPerSample:16 channels:1];     // 获取推流的类，并保存起来，后面就通过它推送数据到SDK     self.audioInputStream = [[SPXPushAudioInputStream alloc] initWithAudioFormat:audioFormat];     // 获取录音配置     SPXAudioConfiguration* audioConfig = [[SPXAudioConfiguration alloc] initWithStreamInput:self.audioInputStream];     // 通过配置类和录音类信息获取微软识别器     self.recognizer = [[SPXSpeechRecognizer alloc] initWithSpeechConfiguration:speechConfig audioConfiguration:audioConfig];     // 定义已识别事件的处理函数        [self.recognizer addRecognizedEventHandler:^(SPXSpeechRecognizer *recognizer, SPXSpeechRecognitionEventArgs *eventArgs) {         NSString *recognizedText = eventArgs.result.text;         NSLog(@"Final recognized text: %@", recognizedText);         // 在这里处理最终识别结果         [self.speechToTextResult appendFormat:recognizedText];     }];          // 定义识别中事件的处理函数     [self.recognizer addRecognizingEventHandler:^(SPXSpeechRecognizer *recognizer, SPXSpeechRecognitionEventArgs *eventArgs) {         NSString *intermediateText = eventArgs.result.text;         NSLog(@"Intermediate recognized text: %@", intermediateText);         // 在这里处理中间识别结果     }];          // 定义取消事件的处理函数     [self.recognizer addCanceledEventHandler:^(SPXSpeechRecognizer *recognizer, SPXSpeechRecognitionCanceledEventArgs *eventArgs) {         NSLog(@"Recognition canceled. Reason: %ld", (long)eventArgs.reason);         if (eventArgs.errorDetails != nil) {             NSLog(@"Error details: %@", eventArgs.errorDetails);         }     }];  } - (void)startRecording{  	    [self.recognizer startContinuousRecognition]; } - (void)stopRecording{  	[self.recognizer stopContinuousRecognition]; } //  // 数据处理回调函数 void HandleInputBuffer(void *aqData, AudioQueueRef inAQ, AudioQueueBufferRef inBuffer, const AudioTimeStamp *inStartTime, UInt32 inNumPackets, const AudioStreamPacketDescription *inPacketDesc) { 		// 在回掉函数中，将数据传递给微软，上面的回掉函数中就能拿到数据了 	   [self.audioInputStream write:audioData]; }

保存文件

保存文件相对简单，录音开始清除上一次的音频文件，创建新的音频文件
WAV文件需要添加头文件，才能正常播放

 #import "SaveAudioFile.h"   #define isValidString(string)               (string && [string isEqualToString:@""] == NO) // WAV 文件头结构 typedef struct {     char riff[4];     UInt32 fileSize;     char wave[4];     char fmt[4];     UInt32 fmtSize;     UInt16 formatTag;     UInt16 channels;     UInt32 samplesPerSec;     UInt32 avgBytesPerSec;     UInt16 blockAlign;     UInt16 bitsPerSample;     char data[4];     UInt32 dataSize; } WAVHeader;  @implementation SaveAudioFile /**  * 清理文件  */ - (void)cleanFile {          if (isValidString(self.mp3Path)) {         NSFileManager *fileManager = [NSFileManager defaultManager];         BOOL isDir = FALSE;         BOOL isDirExist = [fileManager fileExistsAtPath:self.mp3Path isDirectory:&isDir];         if (isDirExist) {             [fileManager removeItemAtPath:self.mp3Path error:nil];             NSLog(@"  xxx.mp3  file   already delete");         }     }     if (isValidString(self.wavPath)) {         NSFileManager *fileManager = [NSFileManager defaultManager];         BOOL isDir = FALSE;         BOOL isDirExist = [fileManager fileExistsAtPath:self.wavPath isDirectory:&isDir];         if (isDirExist) {             [fileManager removeItemAtPath:self.wavPath error:nil];             NSLog(@"  xxx.caf  file   already delete");         }     } } /**  *  取得录音文件保存路径  *  *  @return 录音文件路径  */ -(NSURL *)getSavePath{     //  在Documents目录下创建一个名为FileData的文件夹     NSString *path = [[NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES)lastObject] stringByAppendingPathComponent:@"AudioData"];     NSLog(@"%@",path);          NSFileManager *fileManager = [NSFileManager defaultManager];     BOOL isDir = FALSE;     BOOL isDirExist = [fileManager fileExistsAtPath:path isDirectory:&isDir];     if(!(isDirExist && isDir))              {         BOOL bCreateDir = [fileManager createDirectoryAtPath:path withIntermediateDirectories:YES attributes:nil error:nil];         if(!bCreateDir){             NSLog(@"创建文件夹失败！");         }         NSLog(@"创建文件夹成功，文件路径%@",path);     }     NSString *fileName = @"record";     NSString *wavFileName = [NSString stringWithFormat:@"%@.wav", fileName];     NSString *mp3FileName = [NSString stringWithFormat:@"%@.mp3", fileName];          NSString *wavPath = [path stringByAppendingPathComponent:wavFileName];     NSString *mp3Path = [path stringByAppendingPathComponent:mp3FileName];          self.wavPath = wavPath;     self.mp3Path = mp3Path;          NSLog(@"file path:%@",mp3Path);          NSURL *url=[NSURL fileURLWithPath:mp3Path];     return url; }  -(void) startWritingHeaders {     [self cleanFile];     [self getSavePath];     // 写入 WAV 头部    WAVHeader header;    memcpy(header.riff, "RIFF", 4);    header.fileSize = 0;  // 将在录音结束时填充    memcpy(header.wave, "WAVE", 4);    memcpy(header.fmt, "fmt ", 4);    header.fmtSize = 16;    header.formatTag = 1;  // PCM    header.channels = 1;    header.samplesPerSec = 16000;    header.avgBytesPerSec = 16000 * 2;    header.blockAlign = 2;    header.bitsPerSample = 16;    memcpy(header.data, "data", 4);    header.dataSize = 0;  // 将在录音结束时填充          // 创建 WAV 文件API    [[NSFileManager defaultManager] createFileAtPath:self.wavPath contents:nil attributes:nil];    self.audioFileHandle = [NSFileHandle fileHandleForWritingAtPath:self.wavPath];     [self.audioFileHandle writeData:[NSData dataWithBytes:&header length:sizeof(header)]];     // 创建 mp3 文件API     [[NSFileManager defaultManager] createFileAtPath:self.mp3Path contents:nil attributes:nil];     self.audioFileHandle2 = [NSFileHandle fileHandleForWritingAtPath:self.mp3Path]; } - (void) saveAudioFile: (NSData *) data type:(NSString *) type{     if([type isEqualToString:@"wav"]){         // 写入音频数据到 WAV 文件         [self.audioFileHandle writeData:data];     }else{         // 拿到编码过后的数据，保存到本地         [self.audioFileHandle2 writeData:data];     } } @end

利用Lame库编码PCM数据

下载Lame库并导入项目中操作，参考网上文章https://www.cnblogs.com/XYQ-208910/p/7650759.html
lame库的使用主要分成3部分
- 初始化Lame 并设置比特率，位深，通道数，压缩程度
- 传入原始的音频数据，得到编码过后的mp3音频数据
- 结束时刷新lame中还剩的数据，关闭Lame

// //  LameEncoderMp3.m //  SpeechUntil // //  Created by 肖鹏程 on 2024/7/25. //  #import "LameEncoderMp3.h"   @implementation LameEncoderMp3  - (void) settingFormat:(int)sampleRate channels:(int)channels{     // 初始化lame编码器 设置格式      self.lame = lame_init();     lame_set_in_samplerate(self.lame, sampleRate);     lame_set_num_channels(self.lame, channels);     lame_set_brate(self.lame, 16); // 比特率128 kbps     lame_set_mode(self.lame, channels == 1 ? MONO : STEREO);     lame_set_quality(self.lame, 7); // 0 = 最高质量（最慢），9 = 最低质量（最快）     lame_init_params(self.lame);     self.channels = channels;  }; - (NSData *)encodePCMToMP3:(NSData *)pcmData{      // PCM数据的指针和长度     const short *pcmBuffer = (const short *)[pcmData bytes];     int pcmLength = (int)[pcmData length] / sizeof(short);     NSLog(@"pcmLength %lu", [pcmData length]);     // 分配MP3缓冲区     int mp3BufferSize = (int)(1.25 * pcmLength) + 7200;     unsigned char *mp3Buffer = (unsigned char *)malloc(mp3BufferSize);     // 确保mp3Buffer分配成功        if (mp3Buffer == NULL) {            NSLog(@"Failed to allocate memory for MP3 buffer");            return nil;        }     // PCM编码为MP3     // 注意这个是单通道的方法，如果是双通道调用这个lame_encode_buffer_interleaved(     //   lame,     //   recordingData,     //   numSamples / 2,  // 双声道     //   mp3Buffer,     //   mp3BufferSize     );     int mp3Length = lame_encode_buffer(self.lame, (short *)pcmBuffer, (short *)pcmBuffer,pcmLength, mp3Buffer, mp3BufferSize);     if (mp3Length < 0) {         NSLog(@"LAME encoding error: %d", mp3Length);         free(mp3Buffer);         return nil;     }     // 创建MP3数据     NSData *mp3Data = [NSData dataWithBytes:mp3Buffer length:mp3Length];     NSLog(@"mp3Length %lu", [mp3Data length]);     // 清理     free(mp3Buffer);          return mp3Data; }  - (NSData *) closeLame{     // 刷新LAME缓冲区   unsigned char mp3Buffer[7200];   int flushLength = lame_encode_flush(self.lame, mp3Buffer, sizeof(mp3Buffer));     NSData *flushData;   if (flushLength > 0) {       // 将刷新后的数据追加到已有的MP3数据       flushData = [NSData dataWithBytes:mp3Buffer length:flushLength];   } else if (flushLength < 0) {       NSLog(@"LAME flushing error: %d", flushLength);   }   // 关闭     lame_close(self.lame);     return flushData; }  @end