From c25040a78066f10b7d067b94353605702d906b8e Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Mon, 13 Apr 2026 11:32:33 +0200 Subject: [PATCH] TTS: conditional tail-trim + export script accepts voice path arg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small changes: * export_tts_text_embeddings.py now takes the voice wav as an optional second CLI arg (defaults to damien_15s_24k.wav). Lets the same script capture voice-prefix+suffix for any speaker wav without editing the source — used today to test Elodie alongside Damien. * synthesizeTextStreaming + generateSegmentAudioVC only run the trimTailLowEnergy trim when n >= maxGen. The trim's 35%-of-peak threshold is tuned to catch "page beg beg" filler after the talker fails to emit EOS — but it was cutting valid speech when EOS fired early (observed on Elodie seg 1: 7.04 s → 2.92 s, a 4-second over-trim). With the guard it's a no-op on converging generations and only fires on the ~15% of segments that hit maxGen. Validation after the fix (Elodie, Baer monologue): - seg 1: 126 tokens = maxGen → trimmed 10.08 s → 8.88 s (1.2 s cut, the filler tail) - seg 2: 105 tokens < 138 maxGen → no trim, 8.4 s kept as-is - seg 3: 69 tokens < 96 maxGen → no trim, 5.6 s kept as-is Voice prefix/suffix shape is speaker-invariant except position 7 (the xvector). Confirmed by capturing both Damien and Elodie and diffing: positions 0-6 and 8 identical within 1e-8, suffix identical within 1e-8, only pos 7 has a different xvector embedding (norm 10.36 vs 10.12). That means swapping speakers on-device is a 45 KB file push — no app rebuild, no re-export of the 297 MB vocabulary table. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../java/com/kazeia/tts/Qwen3TtsEngine.kt | 55 ++++++++++++++++++- scripts/export_tts_text_embeddings.py | 4 +- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index bbc95f1..5060f19 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -3428,7 +3428,12 @@ class Qwen3TtsEngine( if (t < n) { val v = codes[t][cb]; if (v in 0 until CODEBOOK_SIZE) v else 0 } else 0 } } - return decodeChunked(codebooks, n) + val audio = decodeChunked(codebooks, n) + // Match the conservative trim policy in synthesizeTextStreaming: + // only trim when we hit the maxGen cap, which is the "failure to + // emit EOS" signal. Shorter generations are kept verbatim to + // avoid cutting low-energy speech. + return if (n >= maxGen) trimTailLowEnergy(audio) else audio } /** @@ -3627,9 +3632,17 @@ class Qwen3TtsEngine( if (t < n) { val v = codes[t][cb]; if (v in 0 until CODEBOOK_SIZE) v else 0 } else 0 } } - val audio = decodeChunked(codebooks, n) + val rawAudio = decodeChunked(codebooks, n) + // Only trim when the talker exhausted its budget — that's the + // case where "page beg beg" filler actually sneaks in. When + // EOS or the degeneracy guard fires early, the audio is already + // clean and trimTailLowEnergy's 35%-of-peak threshold is too + // aggressive for natural French cadence (it cut Elodie seg1 + // from 7.04 s to 2.92 s because the second half of the + // sentence was below the speech threshold). 
+ val audio = if (n >= maxGen) trimTailLowEnergy(rawAudio) else rawAudio val segMs = System.currentTimeMillis() - tSeg - nlog("Seg ${segIdx+1}/${segments.size}: $n tokens, ${audio.size/SR.toFloat()}s audio in ${segMs}ms") + nlog("Seg ${segIdx+1}/${segments.size}: $n tokens, ${audio.size/SR.toFloat()}s audio (raw ${rawAudio.size/SR.toFloat()}s, trimmed=${n >= maxGen}) in ${segMs}ms") segmentAudios.add(audio) saveWav("/data/local/tmp/kazeia/kazeia_stream_seg${segIdx+1}.wav", audio) @@ -3649,6 +3662,42 @@ class Qwen3TtsEngine( return concat } + /** + * Trim low-energy tail from a voice-cloned segment. When the talker + * fails to emit EOS within maxGen, the remaining decoded codes tend + * to be "page beg beg" fillers — audible as a mumbled tail. This + * RMS-based trim finds the last sustained high-energy window and cuts + * after it, with a small fade-out so the last real syllable keeps its + * natural decay. Extracted from generateMultiSegment so the + * synthesizeTextStreaming path can reuse it verbatim. + */ + private fun trimTailLowEnergy(audio: ShortArray): ShortArray { + if (audio.size < SR / 2) return audio + val winSamples = SR * 40 / 1000 // 40 ms windows = 960 samples + val nWin = audio.size / winSamples + if (nWin < 6) return audio + val rms = FloatArray(nWin) + for (w in 0 until nWin) { + var s = 0.0 + val o = w * winSamples + for (i in 0 until winSamples) { val x = audio[o + i].toFloat() / 32768f; s += x * x } + rms[w] = kotlin.math.sqrt(s / winSamples).toFloat() + } + // Reference peak over the first 70% of the segment; the tail is + // assumed to be the degenerate filler region. 
+ var peak = 0f + val refEnd = (nWin * 7) / 10 + for (w in 0 until refEnd) if (rms[w] > peak) peak = rms[w] + val thr = peak * 0.35f + var lastSpeech = nWin - 1 + for (w in nWin - 1 downTo 2) { + if (rms[w] >= thr && rms[w-1] >= thr && rms[w-2] >= thr) { lastSpeech = w; break } + } + val keepWin = (lastSpeech + 2).coerceAtMost(nWin - 1) + val keepSamples = (keepWin + 1) * winSamples + return audio.copyOf(keepSamples) + } + /** Write PCM16 mono audio to a WAV file. Used by the streaming pipeline to * save one file per segment plus the concatenated result for inspection. */ private fun saveWav(path: String, audio: ShortArray) { diff --git a/scripts/export_tts_text_embeddings.py b/scripts/export_tts_text_embeddings.py index e52c7fa..001eafa 100644 --- a/scripts/export_tts_text_embeddings.py +++ b/scripts/export_tts_text_embeddings.py @@ -32,15 +32,15 @@ prefill tensor PyTorch would build, bit-for-bit at fp16 — which is what our Hexagon talker consumes anyway. Usage: - python3 export_tts_text_embeddings.py [output_dir] + python3 export_tts_text_embeddings.py [output_dir] [voice_wav_path] """ import sys, os, struct, shutil, warnings os.chdir("/tmp") warnings.filterwarnings("ignore") OUTPUT_DIR = sys.argv[1] if len(sys.argv) > 1 else "/tmp/kazeia_tts_export" +VOICE = sys.argv[2] if len(sys.argv) > 2 else "/opt/Kazeia/voix/damien_15s_24k.wav" MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc" -VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav" os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(f"{OUTPUT_DIR}/qwen3_tokenizer", exist_ok=True)