修改TTS断句逻辑

2025-12-29 10:28:49 +08:00
parent 9769f486f0
commit 14af7b8b20
1 changed file with 95 additions and 31 deletions
--- a/hook/useAudioSpeak.js
+++ b/hook/useAudioSpeak.js
@@ -112,47 +112,111 @@ export const useAudioSpeak = (config = {}) => {
    if (!text || typeof text !== 'string') return [];
    
    const cleanText = text.replace(/\s+/g, ' ').trim();
+    if (!cleanText) return [];
    
-    // 先按标点粗分
-    const rawChunks = cleanText.split(/([。？！；\n\r]|……)/).filter((t) => t.trim());
-    const mergedChunks = [];
-    let temp = '';
-
-    for (let i = 0; i < rawChunks.length; i++) {
-      const part = rawChunks[i];
-      // 如果是标点，追加到上一句
-      if (/^[。？！；\n\r……]$/.test(part)) {
-        temp += part;
+    const segments = [];
+    
+    // 1. 按完整标点分割成独立的句子（包括中英文标点）
+    // 正则解释：匹配非标点字符 + 标点符号（或者匹配到结尾）
+    const sentenceRegex = /([^。？！；，、\n\r\.\?!;,]+[。？！；，、\n\r\.\?!;,]+|.+$)/g;
+    
+    let currentIndex = 0;
+    let match;
+    const rawSentences = [];
+    
+    while ((match = sentenceRegex.exec(cleanText)) !== null) {
+      const sentence = match[0].trim();
+      if (sentence) {
+        rawSentences.push(sentence);
+      }
+      currentIndex = match.index + match[0].length;
+    }
+    
+    // 处理最后剩余的部分
+    if (currentIndex < cleanText.length) {
+      const remaining = cleanText.substring(currentIndex).trim();
+      if (remaining) {
+        rawSentences.push(remaining);
+      }
+    }
+    
+    // 如果正则没有匹配到，整个文本作为一句话
+    if (rawSentences.length === 0) {
+      rawSentences.push(cleanText);
+    }
+    
+    // 2. 处理每个句子
+    for (const sentence of rawSentences) {
+      if (sentence.length <= maxSegmentLength) {
+        // 句子长度正常，直接作为一个片段
+        segments.push(sentence);
      } else {
-        // 如果当前积累的句子太长（超过50字），先推入队列
-        if (temp.length > 50) {
-          mergedChunks.push(temp);
-          temp = part;
-        } else if (temp.length + part.length < 15) {
-          // 如果当前积累的太短（少于15字），则合并下一句
-          temp += part;
-        } else {
-          // 正常长度，推入
-          if (temp) mergedChunks.push(temp);
-          temp = part;
+        // 句子超长，需要分割
+        console.log('检测到超长句子，需要分割:', sentence);
+        
+        let currentPos = 0;
+        const sentenceLength = sentence.length;
+        
+        while (currentPos < sentenceLength) {
+          // 优先在标点处分割
+          let splitPos = -1;
+          const searchStart = currentPos;
+          const searchEnd = Math.min(currentPos + maxSegmentLength, sentenceLength);
+          
+          // 在搜索范围内找标点
+          for (let i = searchEnd - 1; i > searchStart; i--) {
+            if (/[。？！；，、\n\r\.\?!;,]/u.test(sentence[i])) {
+              splitPos = i + 1; // 包含标点
+              break;
+            }
+          }
+          
+          // 如果没找到标点，在最大限制处分割
+          if (splitPos === -1) {
+            splitPos = searchEnd;
+          }
+          
+          // 确保至少分割出一个字符
+          if (splitPos <= currentPos) {
+            splitPos = currentPos + 1;
+          }
+          
+          const segment = sentence.substring(currentPos, splitPos).trim();
+          if (segment) {
+            segments.push(segment);
+          }
+          
+          currentPos = splitPos;
        }
      }
    }
-    if (temp) mergedChunks.push(temp);
    
-    // 如果还有超过 maxSegmentLength 的，强制分割
+    // 3. 特殊情况：合并以冒号开头的短片段到上一句
    const finalSegments = [];
-    mergedChunks.forEach(segment => {
-      if (segment.length <= maxSegmentLength) {
-        finalSegments.push(segment);
-      } else {
-        // 按字数强制分割
-        for (let i = 0; i < segment.length; i += maxSegmentLength) {
-          finalSegments.push(segment.substring(i, Math.min(i + maxSegmentLength, segment.length)));
+    for (let i = 0; i < segments.length; i++) {
+      const currentSegment = segments[i];
+      
+      // 检查是否以冒号开头且很短（可能是被错误分割的部分）
+      if (i > 0 && 
+          (currentSegment.startsWith('：') || currentSegment.startsWith(':')) && 
+          currentSegment.length < 15 &&
+          !currentSegment.endsWith('。') && 
+          !currentSegment.endsWith('!') && 
+          !currentSegment.endsWith('?')) {
+          
+        // 尝试合并到上一句
+        const previousSegment = finalSegments[finalSegments.length - 1];
+        if (previousSegment && (previousSegment.length + currentSegment.length) <= maxSegmentLength) {
+          finalSegments[finalSegments.length - 1] = previousSegment + currentSegment;
+        } else {
+          finalSegments.push(currentSegment);
        }
+      } else {
+        finalSegments.push(currentSegment);
      }
-    });
+    }
    
+    // 清理：移除空白和空字符串
    return finalSegments.filter(seg => seg && seg.trim());
  }