From 199bc4fbc9ce2c090ba0241691be312cb32b78f3 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Thu, 9 Apr 2026 23:51:05 +0200 Subject: [PATCH] Full native C++ TTS validated on short + long phrases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynamic formula: target_len = n_tokens × 3.2 + 5 (calibrated) - Short "Bonjour..." (18 tokens → 62 trailing): OK - Long "Je suis Kazeia... difficiles" (30 tokens → 101 trailing): OK RMS trim disabled (garbage is loud, can't distinguish from speech). Length controlled purely by maxTokens = trailing count. Pipeline: prepare_tts_native.py "any text" → adb push → run → audio Co-Authored-By: Claude Opus 4.6 (1M context) --- .../java/com/kazeia/tts/Qwen3TtsEngine.kt | 38 ++----------------- scripts/prepare_tts_native.py | 3 +- 2 files changed, 4 insertions(+), 37 deletions(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index e569f6b..b6f1d2f 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -2606,41 +2606,9 @@ class Qwen3TtsEngine( return result } - /** Trim trailing garbage from audio by detecting RMS drop. - * Scans forward, finds where RMS drops significantly → end of speech. 
*/ - private fun trimTrailingSilence(audio: ShortArray): ShortArray { - val windowSamples = SR / 10 // 100ms windows - if (audio.size < windowSamples * 4) return audio - - // Compute RMS per window - val nWindows = audio.size / windowSamples - val rmsValues = FloatArray(nWindows) - for (w in 0 until nWindows) { - var sum = 0.0 - for (i in 0 until windowSamples) { - val s = audio[w * windowSamples + i].toFloat() - sum += s * s - } - rmsValues[w] = Math.sqrt(sum / windowSamples).toFloat() - } - - // Find peak RMS in first half (speech region) - val peakRms = rmsValues.take(nWindows / 2).maxOrNull() ?: return audio - - // Scan from 60% onwards, find first window where RMS drops below 15% of peak - // (speech ended, garbage/silence started) - val threshold = peakRms * 0.15f - var cutWindow = nWindows - for (w in (nWindows * 3 / 5) until nWindows) { - if (rmsValues[w] < threshold) { - cutWindow = w + 1 // keep one more window for tail - break - } - } - - val trimPoint = minOf(cutWindow * windowSamples, audio.size) - return if (trimPoint < audio.size) audio.copyOf(trimPoint) else audio - } + /** No-op trim — garbage post-text has high energy, can't distinguish from speech. + * Length is controlled by maxTokens = trailing count instead. */ + private fun trimTrailingSilence(audio: ShortArray): ShortArray = audio /** Full pipeline using Hexagon talker + Hexagon CP from pre-computed embeddings. */ private fun generateFromEmbedsHexagon(embedsPath: String): ShortArray { diff --git a/scripts/prepare_tts_native.py b/scripts/prepare_tts_native.py index e126ebd..d772c76 100644 --- a/scripts/prepare_tts_native.py +++ b/scripts/prepare_tts_native.py @@ -54,8 +54,7 @@ eos = sp[1].astype(np.float32) # Build trailing: text[1:] + eos padding # Audio is ~3.5× longer than text tokens. Pad with eos to ensure full coverage. 
-AUDIO_RATIO = 3.5 # audio_tokens ≈ text_tokens × 3.5 (slightly generous to avoid cut) -target_len = max(int(len(tokens) * AUDIO_RATIO), 50) # minimum 50 for short phrases +target_len = max(int(len(tokens) * 3.2) + 5, 40) # calibrated: 3.2 audio tokens per text token + 5 buffer; minimum 40 for short phrases trailing = [proj[i] for i in range(1, len(proj))] # text[1:] while len(trailing) < target_len: