From c25040a78066f10b7d067b94353605702d906b8e Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Mon, 13 Apr 2026 11:32:33 +0200
Subject: [PATCH] TTS: conditional tail-trim + export script accepts voice path
 arg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small changes:

  * export_tts_text_embeddings.py now takes the voice wav as an optional
    second CLI arg (defaults to damien_15s_24k.wav). Lets the same script
    capture voice-prefix+suffix for any speaker wav without editing the
    source — used today to test Elodie alongside Damien.

  * synthesizeTextStreaming + generateSegmentAudioVC only run the
    trimTailLowEnergy trim when n >= maxGen. The trim's 35%-of-peak
    threshold is tuned to catch "page beg beg" filler after the talker
    fails to emit EOS — but it was cutting valid speech when EOS fired
    early (observed on Elodie seg 1: 7.04 s → 2.92 s, a 4-second over-
    trim). With the guard it's a no-op on converging generations and
    only fires on the ~15% of segments that hit maxGen.

Validation after the fix (Elodie, Baer monologue):
  - seg 1: 126 tokens = maxGen → trimmed 10.08 s → 8.88 s (1.2 s cut,
           the filler tail)
  - seg 2: 105 tokens < 138 maxGen → no trim, 8.4 s kept as-is
  - seg 3: 69 tokens < 96 maxGen → no trim, 5.6 s kept as-is

Voice prefix/suffix shape is speaker-invariant except position 7 (the
xvector). Confirmed by capturing both Damien and Elodie and diffing:
positions 0-6 and 8 identical within 1e-8, suffix identical within
1e-8, only pos 7 has a different xvector embedding (norm 10.36 vs 10.12).
That means swapping speakers on-device is a 45 KB file push — no app
rebuild, no re-export of the 297 MB vocabulary table.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../java/com/kazeia/tts/Qwen3TtsEngine.kt     | 55 ++++++++++++++++++-
 scripts/export_tts_text_embeddings.py         |  4 +-
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
index bbc95f1..5060f19 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@@ -3428,7 +3428,12 @@ class Qwen3TtsEngine(
                 if (t < n) { val v = codes[t][cb]; if (v in 0 until CODEBOOK_SIZE) v else 0 } else 0
             }
         }
-        return decodeChunked(codebooks, n)
+        val audio = decodeChunked(codebooks, n)
+        // Match the conservative trim policy in synthesizeTextStreaming:
+        // only trim when we hit the maxGen cap, which is the "failure to
+        // emit EOS" signal. Shorter generations are kept verbatim to
+        // avoid cutting low-energy speech.
+        return if (n >= maxGen) trimTailLowEnergy(audio) else audio
     }
 
     /**
@@ -3627,9 +3632,17 @@ class Qwen3TtsEngine(
                     if (t < n) { val v = codes[t][cb]; if (v in 0 until CODEBOOK_SIZE) v else 0 } else 0
                 }
             }
-            val audio = decodeChunked(codebooks, n)
+            val rawAudio = decodeChunked(codebooks, n)
+            // Only trim when the talker exhausted its budget — that's the
+            // case where "page beg beg" filler actually sneaks in. When
+            // EOS or the degeneracy guard fires early, the audio is already
+            // clean and trimTailLowEnergy's 35%-of-peak threshold is too
+            // aggressive for natural French cadence (it cut Elodie seg1
+            // from 7.04 s to 2.92 s because the second half of the
+            // sentence was below the speech threshold).
+            val audio = if (n >= maxGen) trimTailLowEnergy(rawAudio) else rawAudio
             val segMs = System.currentTimeMillis() - tSeg
-            nlog("Seg ${segIdx+1}/${segments.size}: $n tokens, ${audio.size/SR.toFloat()}s audio in ${segMs}ms")
+            nlog("Seg ${segIdx+1}/${segments.size}: $n tokens, ${audio.size/SR.toFloat()}s audio (raw ${rawAudio.size/SR.toFloat()}s, trimmed=${n >= maxGen}) in ${segMs}ms")
 
             segmentAudios.add(audio)
             saveWav("/data/local/tmp/kazeia/kazeia_stream_seg${segIdx+1}.wav", audio)
@@ -3649,6 +3662,42 @@ class Qwen3TtsEngine(
         return concat
     }
 
+    /**
+     * Trims the low-energy tail of a voice-cloned segment. When the talker
+     * fails to emit EOS within maxGen, the trailing codes tend to decode
+     * to "page beg beg" filler, audible as a mumbled tail. Finds the last
+     * run of three 40 ms windows whose RMS is >= 35% of the peak RMS of
+     * the first 70% of the segment and keeps two more windows (80 ms)
+     * after it as decay padding; the cut is hard, no fade is applied.
+     * Returns [audio] itself, untouched, when it is shorter than 0.5 s
+     * or when there is nothing to cut.
+     * Extracted from generateMultiSegment for reuse by synthesizeTextStreaming.
+     */
+    private fun trimTailLowEnergy(audio: ShortArray): ShortArray {
+        if (audio.size < SR / 2) return audio
+        val winSamples = SR * 40 / 1000  // 40 ms windows = 960 samples
+        val nWin = audio.size / winSamples
+        if (nWin < 6) return audio
+        val rms = FloatArray(nWin) { w ->
+            var s = 0.0
+            val o = w * winSamples
+            for (i in 0 until winSamples) { val x = audio[o + i].toFloat() / 32768f; s += x * x }
+            kotlin.math.sqrt(s / winSamples).toFloat()
+        }
+        // Reference peak over the first 70%; the tail is assumed to be filler.
+        var peak = 0f
+        for (w in 0 until (nWin * 7) / 10) if (rms[w] > peak) peak = rms[w]
+        val thr = peak * 0.35f
+        // Scan backwards for the last window ending a 3-window loud run.
+        val lastSpeech = (nWin - 1 downTo 2)
+            .firstOrNull { rms[it] >= thr && rms[it - 1] >= thr && rms[it - 2] >= thr } ?: (nWin - 1)
+        val keepWin = lastSpeech + 2
+        // Nothing to cut: return the input as-is instead of a copy that
+        // silently drops the partial window (< 40 ms) at the very end.
+        if (keepWin >= nWin - 1) return audio
+        return audio.copyOf((keepWin + 1) * winSamples)
+    }
+
     /** Write PCM16 mono audio to a WAV file. Used by the streaming pipeline to
      *  save one file per segment plus the concatenated result for inspection. */
     private fun saveWav(path: String, audio: ShortArray) {
diff --git a/scripts/export_tts_text_embeddings.py b/scripts/export_tts_text_embeddings.py
index e52c7fa..001eafa 100644
--- a/scripts/export_tts_text_embeddings.py
+++ b/scripts/export_tts_text_embeddings.py
@@ -32,15 +32,15 @@ prefill tensor PyTorch would build, bit-for-bit at fp16 — which is
 what our Hexagon talker consumes anyway.
 
 Usage:
-    python3 export_tts_text_embeddings.py [output_dir]
+    python3 export_tts_text_embeddings.py [output_dir] [voice_wav_path]
 """
 import sys, os, struct, shutil, warnings
 os.chdir("/tmp")
 warnings.filterwarnings("ignore")
 
 OUTPUT_DIR = sys.argv[1] if len(sys.argv) > 1 else "/tmp/kazeia_tts_export"
+VOICE = sys.argv[2] if len(sys.argv) > 2 else "/opt/Kazeia/voix/damien_15s_24k.wav"
 MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"
-VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav"
 
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 os.makedirs(f"{OUTPUT_DIR}/qwen3_tokenizer", exist_ok=True)