feat : 语音转文字实现

2025-12-26 18:41:58 +08:00
parent e114675eba
commit f6b7755e32
2 changed files with 304 additions and 155 deletions
--- a/hook/useRealtimeRecorderOnce.js
+++ b/hook/useRealtimeRecorderOnce.js
@@ -2,44 +2,138 @@ import {
    ref,
    onUnmounted
 } from 'vue'
 import {
    $api
 } from '../common/globalFunction';
 import config from '@/config'
 export function useRealtimeRecorderOnce() {
    // --- 状态定义 ---
    const isRecording = ref(false)
-    const isProcessing = ref(false) // 新增：处理录音数据状态
+    const isProcessing = ref(false)
    const recordingDuration = ref(0)
    const volumeLevel = ref(0) // 0-100
    const recognizedText = ref('')
-    const audioData = ref(null) // 新增：存储录音数据
+    const audioData = ref(null)
-    const audioDataForDisplay = ref([]) // 新增：用于波形显示的数据
+    const audioDataForDisplay = ref([])
    // --- 内部变量 ---
    let durationTimer = null
    // --- APP/小程序 变量 ---
    let recorderManager = null;
-    let appAudioChunks = []; // 新增：存储APP录音数据块
+    let appAudioChunks = [];
    // --- H5 变量 ---
    let audioContext = null;
-    let scriptProcessor = null;
+    let mediaRecorder = null;
    let mediaStreamSource = null;
    let h5Stream = null;
-    let h5AudioChunks = []; // 新增：存储H5录音数据块
+    let h5AudioChunks = [];
    let analyser = null;
    let dataArray = null;
    // --- 配置项 ---
    const RECORD_CONFIG = {
        duration: 600000,
        sampleRate: 16000,
        numberOfChannels: 1,
-        format: 'pcm',
+        format: 'wav',
        encodeBitRate: 16000,
        frameSize: 4096
    }
    // --- WAV文件头函数 ---
    /**
     * Encode float samples as a 16-bit PCM WAV file held in memory.
     *
     * Writes the 44-byte RIFF/WAVE header followed by every sample converted
     * to a little-endian signed 16-bit integer, in the order given.
     *
     * @param {ArrayLike<number>} samples - Samples expected in [-1, 1]; values outside that range are clamped.
     * @param {number} [sampleRate=16000] - Sample rate written to the header, in Hz.
     * @param {number} [numChannels=1] - Channel count written to the header.
     * @param {number} [bitsPerSample=16] - Bit depth; only 16 is supported.
     * @returns {ArrayBuffer} The header plus the PCM data.
     * @throws {RangeError} If bitsPerSample is anything other than 16.
     */
    const encodeWAV = (samples, sampleRate = 16000, numChannels = 1, bitsPerSample = 16) => {
        // The sample writer below always emits 2 bytes per sample, so any other
        // depth would produce a header that does not match the data (or overrun
        // the buffer). Fail loudly instead of emitting a corrupt file.
        if (bitsPerSample !== 16) {
            throw new RangeError(`encodeWAV only supports 16-bit PCM, got bitsPerSample=${bitsPerSample}`);
        }

        const HEADER_SIZE = 44;
        const bytesPerSample = bitsPerSample / 8;
        const blockAlign = numChannels * bytesPerSample;
        const byteRate = sampleRate * blockAlign;
        const dataSize = samples.length * bytesPerSample;

        const buffer = new ArrayBuffer(HEADER_SIZE + dataSize);
        const view = new DataView(buffer);

        // RIFF chunk descriptor; the size field excludes the 8 bytes of "RIFF" + size.
        writeString(view, 0, 'RIFF');
        view.setUint32(4, 36 + dataSize, true);
        writeString(view, 8, 'WAVE');

        // fmt sub-chunk
        writeString(view, 12, 'fmt ');
        view.setUint32(16, 16, true); // Subchunk1Size (16 for PCM)
        view.setUint16(20, 1, true); // AudioFormat (1 for PCM)
        view.setUint16(22, numChannels, true);
        view.setUint32(24, sampleRate, true);
        view.setUint32(28, byteRate, true);
        view.setUint16(32, blockAlign, true);
        view.setUint16(34, bitsPerSample, true);

        // data sub-chunk
        writeString(view, 36, 'data');
        view.setUint32(40, dataSize, true);

        // Audio samples: clamp to [-1, 1], then scale asymmetrically so that
        // -1 maps to -32768 and +1 maps to 32767.
        let offset = HEADER_SIZE;
        for (let i = 0; i < samples.length; i++) {
            const sample = Math.max(-1, Math.min(1, samples[i]));
            view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
            offset += bytesPerSample;
        }

        return buffer;
    };
    /**
     * Copy a string into a DataView, one byte per UTF-16 code unit.
     *
     * @param {DataView} view - Destination view.
     * @param {number} offset - Byte position in the view where the first character goes.
     * @param {string} string - Text to write; each code unit is stored with setUint8.
     */
    const writeString = (view, offset, string) => {
        const total = string.length;
        let index = 0;
        while (index < total) {
            const code = string.charCodeAt(index);
            view.setUint8(offset + index, code);
            index += 1;
        }
    };
    /**
     * Write float samples into a DataView as little-endian signed 16-bit PCM.
     *
     * @param {DataView} output - Destination view.
     * @param {number} offset - Byte position of the first sample; each sample takes 2 bytes.
     * @param {ArrayLike<number>} input - Samples; values are clamped to [-1, 1] before scaling.
     */
    const floatTo16BitPCM = (output, offset, input) => {
        let position = offset;
        let index = 0;
        while (index < input.length) {
            const clamped = Math.max(-1, Math.min(1, input[index]));
            // Negative values scale by 32768 and non-negative by 32767.
            const scale = clamped < 0 ? 0x8000 : 0x7FFF;
            output.setInt16(position, clamped * scale, true);
            index += 1;
            position += 2;
        }
    };
    // --- 音量计算函数 ---
    /**
     * Map the RMS level of a block of float samples onto a 0-100 volume value.
     *
     * @param {ArrayLike<number>} float32Array - Samples, nominally in [-1, 1].
     * @returns {number} Integer volume in 0-100; 0 for empty input or levels below the gate.
     */
    const calculateVolumeFromFloat32 = (float32Array) => {
        const length = float32Array ? float32Array.length : 0;
        // Without this guard an empty block divides by zero and the resulting
        // NaN slips past the gate below and is returned as the volume.
        if (!(length > 0)) return 0;

        // RMS (root mean square) of the block.
        let sumOfSquares = 0;
        for (let i = 0; i < length; i++) {
            sumOfSquares += float32Array[i] * float32Array[i];
        }
        const rms = Math.sqrt(sumOfSquares / length);

        // Scale to 0-100; the factor of 300 saturates at an RMS of about 0.33.
        // (Original author's note: conversational speech is usually 0.01-0.1 RMS.)
        const volume = Math.min(100, Math.floor(rms * 300));

        // Noise gate: anything below 5 is reported as silence.
        return volume < 5 ? 0 : volume;
    };
    /**
     * Map the RMS level of a block of 16-bit PCM samples onto a 0-100 volume value.
     *
     * @param {ArrayLike<number>} int16Array - Signed 16-bit samples.
     * @returns {number} Integer volume in 0-100; 0 for empty input or levels below the gate.
     */
    const calculateVolumeFromInt16 = (int16Array) => {
        const length = int16Array ? int16Array.length : 0;
        // Without this guard an empty block divides by zero and the resulting
        // NaN slips past the gate below and is returned as the volume.
        if (!(length > 0)) return 0;

        // RMS over samples normalized to [-1, 1).
        let sumOfSquares = 0;
        for (let i = 0; i < length; i++) {
            const normalized = int16Array[i] / 32768;
            sumOfSquares += normalized * normalized;
        }
        const rms = Math.sqrt(sumOfSquares / length);

        // Scale to 0-100; the factor of 300 saturates at an RMS of about 0.33.
        const volume = Math.min(100, Math.floor(rms * 300));

        // Noise gate: anything below 5 is reported as silence.
        return volume < 5 ? 0 : volume;
    };
    /**
     * 开始录音 (入口)
     */
@@ -90,51 +184,75 @@ export function useRealtimeRecorderOnce() {
    }
    /**
-     * H5录音实现
+     * H5录音实现 - 手动构建WAV文件
     */
    const startH5Recording = async () => {
        try {
            // 1. 获取麦克风流
            const stream = await navigator.mediaDevices.getUserMedia({
-                audio: true
+                audio: {
                    sampleRate: 16000,
                    channelCount: 1,
                    echoCancellation: true,
                    noiseSuppression: true,
                    autoGainControl: false
                }
            });
            h5Stream = stream;
-            // 2. 创建 AudioContext
+            // 2. 创建 AudioContext 用于处理音频
            const AudioContext = window.AudioContext || window.webkitAudioContext;
            audioContext = new AudioContext({
-                sampleRate: 16000
+                sampleRate: 16000,
                latencyHint: 'interactive'
            });
-            mediaStreamSource = audioContext.createMediaStreamSource(stream);
+            // 创建音频处理节点
-            scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);
+            const source = audioContext.createMediaStreamSource(stream);
-            scriptProcessor.onaudioprocess = (event) => {
+            // 创建分析器用于音量计算
            analyser = audioContext.createAnalyser();
            analyser.fftSize = 256;
            analyser.smoothingTimeConstant = 0.8;
            dataArray = new Float32Array(analyser.frequencyBinCount);
            source.connect(analyser);
            // 创建脚本处理器用于收集音频数据
            const processor = audioContext.createScriptProcessor(4096, 1, 1);
            // 存储所有音频样本
            let audioSamples = [];
            processor.onaudioprocess = (e) => {
                if (!isRecording.value) return;
-                const inputData = event.inputBuffer.getChannelData(0);
+                // 获取输入数据
                const inputData = e.inputBuffer.getChannelData(0);
-                calculateVolume(inputData, true);
+                // 计算音量
                analyser.getFloatTimeDomainData(dataArray);
                const volume = calculateVolumeFromFloat32(dataArray);
                volumeLevel.value = volume;
-                // 保存音频数据
+                // 收集音频样本
                const buffer = new ArrayBuffer(inputData.length * 2);
                const view = new DataView(buffer);
                for (let i = 0; i < inputData.length; i++) {
-                    let s = Math.max(-1, Math.min(1, inputData[i]));
+                    audioSamples.push(inputData[i]);
                    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
                }
-                // 保存到数组
+                // 存储当前音频数据块
                const buffer = new Float32Array(inputData.length);
                buffer.set(inputData);
                h5AudioChunks.push(buffer);
            };
-            mediaStreamSource.connect(scriptProcessor);
+            source.connect(processor);
-            scriptProcessor.connect(audioContext.destination);
+            processor.connect(audioContext.destination);
-            console.log('H5 录音已启动');
+            console.log('H5 16kHz WAV录音已启动');
        } catch (err) {
-            console.error('H5 录音启动失败:', err);
+            console.error('H5录音启动失败:', err);
            throw err;
        }
    }
@@ -143,14 +261,19 @@ export function useRealtimeRecorderOnce() {
     * 停止H5录音资源
     */
    const stopH5Resources = () => {
-        if (scriptProcessor) scriptProcessor.disconnect();
+        // 断开所有连接
-        if (mediaStreamSource) mediaStreamSource.disconnect();
+        if (audioContext && audioContext.state !== 'closed') {
-        if (audioContext) audioContext.close();
+            audioContext.close();
-        if (h5Stream) h5Stream.getTracks().forEach(track => track.stop());
+        }
        // 停止音轨
        if (h5Stream) {
            h5Stream.getTracks().forEach(track => track.stop());
        }
        scriptProcessor = null;
        mediaStreamSource = null;
        audioContext = null;
        analyser = null;
        dataArray = null;
        h5Stream = null;
    }
@@ -161,24 +284,29 @@ export function useRealtimeRecorderOnce() {
        recorderManager = uni.getRecorderManager();
        recorderManager.onFrameRecorded((res) => {
-            const {
+            const { frameBuffer } = res;
                frameBuffer
            } = res;
            calculateVolume(frameBuffer, false);
            // 保存音频数据
            if (frameBuffer && frameBuffer.byteLength > 0) {
                // 计算音量
                const int16Data = new Int16Array(frameBuffer);
                const volume = calculateVolumeFromInt16(int16Data);
                volumeLevel.value = volume;
                // 保存音频数据
                appAudioChunks.push(frameBuffer);
            }
        });
        recorderManager.onStart(() => {
-            console.log('APP 录音已开始');
+            console.log('APP 16kHz WAV录音已开始');
        });
        recorderManager.onError((err) => {
-            console.error('APP 录音报错:', err);
+            console.error('APP录音报错:', err);
            uni.showToast({
                title: '录音失败: ' + err.errMsg,
                icon: 'none'
            });
            cleanup();
        });
@@ -193,17 +321,12 @@ export function useRealtimeRecorderOnce() {
        isRecording.value = false;
        clearInterval(durationTimer);
        audioDataForDisplay.value = []; // 清空显示数据
        // 停止硬件录音
        stopHardwareResource();
        // 处理录音数据
        await processAudioData();
        // 清理临时数据
        appAudioChunks = [];
        h5AudioChunks = [];
    }
    /**
@@ -250,90 +373,136 @@ export function useRealtimeRecorderOnce() {
        const updateInterval = setInterval(() => {
            if (!isRecording.value) {
                clearInterval(updateInterval);
                audioDataForDisplay.value = [];
                return;
            }
-            // 生成模拟的音频数据显示数据（0-1之间的值）
+            // 生成波形数据，基于当前音量
-            const baseValue = volumeLevel.value / 100; // 基于音量计算基础值
+            const baseValue = volumeLevel.value / 100;
            const data = [];
-            // 生成31个数据点（对应WaveDisplay的31个波形条）
+            // 生成31个数据点
            for (let i = 0; i < 31; i++) {
-                // 模拟波形：中间高，两边低
+                // 使用正弦波生成波形效果，中间高两边低
-                const position = i / 30; // 0到1
+                const position = i / 30;
                const centerDistance = Math.abs(position - 0.5);
-                const waveValue = Math.sin(Date.now() / 100 + i * 0.5) * 0.3 + 0.5;
+                const waveValue = Math.sin(Date.now() / 200 + i * 0.3) * 0.4 + 0.5;
                const volumeFactor = baseValue * 0.8 + 0.2; // 确保最小值为0.2
                const finalValue = waveValue * (1 - centerDistance) * volumeFactor;
-                data.push(Math.max(0.1, Math.min(1, finalValue)));
+                // 音量因子确保最小显示高度
                const volumeFactor = baseValue * 0.7 + 0.3;
                // 综合计算最终值
                let finalValue = waveValue * (1 - centerDistance) * volumeFactor;
                finalValue = Math.max(0.1, Math.min(1, finalValue));
                data.push(finalValue);
            }
            audioDataForDisplay.value = data;
-        }, 100); // 每100ms更新一次
+        }, 50); // 更快的刷新率，更流畅
    }
    /**
-     * 处理录音数据
+     * 处理录音数据并生成WAV文件
     */
    const processAudioData = async () => {
-        if (!isProcessing.value) {
+        if (isProcessing.value) return;
            isProcessing.value = true;
-            try {
+        isProcessing.value = true;
                let audioBlob = null;
-                // #ifdef H5
+        try {
-                // 合并H5录音数据
+            let audioBlob = null;
                if (h5AudioChunks.length > 0) {
                    const totalLength = h5AudioChunks.reduce((acc, chunk) => acc + chunk.byteLength, 0);
                    const combinedBuffer = new ArrayBuffer(totalLength);
                    const combinedView = new Uint8Array(combinedBuffer);
-                    let offset = 0;
+            // #ifdef H5
-                    h5AudioChunks.forEach(chunk => {
+            // H5端：合并所有音频样本并生成WAV
-                        const chunkView = new Uint8Array(chunk);
+            if (h5AudioChunks.length > 0) {
-                        combinedView.set(chunkView, offset);
+                // 合并所有Float32Array
-                        offset += chunk.byteLength;
+                const totalLength = h5AudioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
-                    });
+                const mergedSamples = new Float32Array(totalLength);
-                    audioBlob = new Blob([combinedBuffer], { type: 'audio/pcm' });
+                let offset = 0;
-                }
+                h5AudioChunks.forEach(chunk => {
-                // #endif
+                    mergedSamples.set(chunk, offset);
                    offset += chunk.length;
                });
-                // #ifndef H5
+                // 生成WAV文件
-                // 合并APP录音数据
+                const wavBuffer = encodeWAV(mergedSamples, 16000, 1, 16);
-                if (appAudioChunks.length > 0) {
+                audioBlob = new Blob([wavBuffer], { type: 'audio/wav' });
                    const totalLength = appAudioChunks.reduce((acc, chunk) => acc + chunk.byteLength, 0);
                    const combinedBuffer = new ArrayBuffer(totalLength);
                    const combinedView = new Uint8Array(combinedBuffer);
-                    let offset = 0;
+                console.log(`H5生成WAV文件: ${audioBlob.size} bytes, 时长: ${mergedSamples.length / 16000}秒`);
                    appAudioChunks.forEach(chunk => {
                        const chunkView = new Uint8Array(chunk);
                        combinedView.set(chunkView, offset);
                        offset += chunk.byteLength;
                    });
                    audioBlob = new Blob([combinedBuffer], { type: 'audio/pcm' });
                }
                // #endif
                if (audioBlob) {
                    audioData.value = audioBlob;
                    // 发送到服务器进行识别
                    await sendToASR(audioBlob);
                }
            } catch (error) {
                console.error('处理音频数据失败:', error);
                recognizedText.value = '音频处理失败，请重试';
            } finally {
                isProcessing.value = false;
            }
            // #endif
            // #ifndef H5
            // APP/小程序端：合并Int16数据并生成WAV
            if (appAudioChunks.length > 0) {
                // 合并所有Int16Array
                const totalLength = appAudioChunks.reduce((sum, chunk) => sum + chunk.byteLength / 2, 0);
                const mergedInt16 = new Int16Array(totalLength);
                let offset = 0;
                appAudioChunks.forEach(chunk => {
                    const int16Data = new Int16Array(chunk);
                    mergedInt16.set(int16Data, offset);
                    offset += int16Data.length;
                });
                // 转换为Float32用于生成WAV
                const floatSamples = new Float32Array(mergedInt16.length);
                for (let i = 0; i < mergedInt16.length; i++) {
                    floatSamples[i] = mergedInt16[i] / 32768;
                }
                // 生成WAV文件
                const wavBuffer = encodeWAV(floatSamples, 16000, 1, 16);
                audioBlob = new Blob([wavBuffer], { type: 'audio/wav' });
                console.log(`APP生成WAV文件: ${audioBlob.size} bytes, 时长: ${floatSamples.length / 16000}秒`);
            }
            // #endif
            if (audioBlob && audioBlob.size > 44) { // 确保至少包含WAV头部
                audioData.value = audioBlob;
                // 保存文件用于调试（可选）
                // debugSaveWavFile(audioBlob);
                // 发送到服务器进行识别
                isProcessing.value = false
                await sendToASR(audioBlob);
            } else {
                throw new Error('录音数据为空或无效');
            }
        } catch (error) {
            console.error('处理音频数据失败:', error);
            uni.showToast({
                title: '音频处理失败，请重试',
                icon: 'none'
            });
        } finally {
            isProcessing.value = false;
            appAudioChunks = [];
            h5AudioChunks = [];
        }
    }
    /**
     * Debug helper: trigger a browser download of the given WAV blob.
     *
     * Creates an object URL for the blob, clicks a temporary anchor element
     * attached to document.body, then removes the anchor and revokes the URL.
     *
     * @param {Blob} blob - Audio data to save.
     */
    const debugSaveWavFile = (blob) => {
        const objectUrl = URL.createObjectURL(blob);
        const link = Object.assign(document.createElement('a'), {
            href: objectUrl,
            download: `recording_${Date.now()}.wav`
        });
        const { body } = document;
        // The anchor is attached to the document only for the duration of the click.
        body.appendChild(link);
        link.click();
        body.removeChild(link);
        URL.revokeObjectURL(objectUrl);
        console.log('WAV文件已保存用于调试');
    };
    /**
     * 发送音频到ASR服务器
     */
@@ -341,15 +510,12 @@ export function useRealtimeRecorderOnce() {
        try {
            // 创建FormData
            const formData = new FormData();
-            formData.append('audio', audioBlob, 'recording.pcm');
+            formData.append('file', audioBlob, 'recording.wav');
            // 添加Token
            const token = uni.getStorageSync('token') || '';
            if (token) {
                formData.append('token', token);
            }
-            const asrUrl = `${config.baseUrl}/app/asr/connect`
+            const asrUrl = `${config.baseUrl}/app/speech/asr`
            const response = await fetch(asrUrl, {
                method: 'POST',
@@ -361,40 +527,19 @@ export function useRealtimeRecorderOnce() {
            if (response.ok) {
                const result = await response.json();
-                recognizedText.value = result.text || result.data || '';
+                if(result.code == 200){
                    recognizedText.value = result.data || ''
                }else{
                    $api.msg(result.msg || '识别失败')
                }
            } else {
-                throw new Error(`ASR请求失败: ${response.status}`);
+                const errorText = await response.text();
                throw new Error(`ASR请求失败: ${response.status} - ${errorText}`);
            }
        } catch (error) {
            console.error('ASR识别失败:', error);
            recognizedText.value = '语音识别失败，请重试';
        }
    }
    /**
     * Estimate a 0-100 volume from an audio block and store it in volumeLevel.
     *
     * Only every 10th sample is inspected; the mean absolute value of those
     * samples is scaled and clamped to 100.
     *
     * @param {ArrayLike<number>|ArrayBuffer} data - Float samples when isFloat32 is truthy;
     *   otherwise a buffer that is reinterpreted as signed 16-bit samples.
     * @param {boolean} isFloat32 - Selects the float path or the 16-bit path.
     */
    const calculateVolume = (data, isFloat32) => {
        const STRIDE = 10;
        const samples = isFloat32 ? data : new Int16Array(data);

        let sum = 0;
        let sampledCount = 0;
        for (let i = 0; i < samples.length; i += STRIDE) {
            sum += Math.abs(samples[i]);
            sampledCount += 1;
        }

        // Empty block: report silence rather than dividing by zero (NaN).
        if (sampledCount === 0) {
            volumeLevel.value = 0;
            return;
        }

        // Divide by the number of samples actually visited. Using length / 10
        // under-counts whenever length is not a multiple of 10 and inflates
        // the average (a 5-sample block would be doubled).
        const average = sum / sampledCount;

        // Float samples are scaled by 100 * 3; 16-bit samples treat a mean
        // absolute value of 10000 as full scale.
        const scaled = isFloat32 ? average * 100 * 3 : (average / 10000) * 100;
        volumeLevel.value = Math.min(100, Math.floor(scaled));
    };
@@ -408,7 +553,10 @@ export function useRealtimeRecorderOnce() {
        recordingDuration.value = 0;
        volumeLevel.value = 0;
        audioDataForDisplay.value = [];
-        recorderManager = null;
+        
        if (recorderManager) {
            recorderManager = null;
        }
    }
    onUnmounted(() => {
@@ -425,7 +573,7 @@ export function useRealtimeRecorderOnce() {
        volumeLevel,
        recognizedText,
        audioData,
-        audioDataForDisplay, // 新增：返回给WaveDisplay组件使用
+        audioDataForDisplay,
        startRecording,
        stopRecording,
        cancelRecording
--- a/pages/chat/components/ai-paging.vue
+++ b/pages/chat/components/ai-paging.vue
@@ -300,6 +300,7 @@ const {
 } = useRealtimeRecorderOnce();
 watch(recognizedText, (newText) => {
    console.log(newText,'++++++++')
    if (newText && newText.trim() && !isProcessing.value) {
        setTimeout(() => {
            sendMessage(newText);