From f6df1738c5f5d04d5793ef87a7dac07651f153c0 Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Thu, 9 Apr 2026 14:05:42 +0200
Subject: [PATCH] Add prepare_tts_embeds.py for any text + codec_sum fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- prepare_tts_embeds.py: generates pre-computed embeddings from any text
  via Python generate_voice_clone, capturing talker inputs
- C++ pipeline: always build codec_sum + trailing (not as-is)
- maxTokens: max(200, 4× trailing count), because audio needs far more tokens than text
- Long text tested: 224 Python tokens → 125 NPU tokens (10s audio)
- Text-only embeds don't work (model needs Python pre-computed codec_sum)

Usage: python3 scripts/prepare_tts_embeds.py "Your text" output.bin
       adb push output.bin /data/local/tmp/.../full_pipeline_embeds.bin

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 executorch-custom/jni_layer_tts.cpp           |  25 +++--
 .../java/com/kazeia/tts/Qwen3TtsEngine.kt     |   2 +-
 scripts/prepare_tts_embeds.py                 | 104 ++++++++++++++++++
 3 files changed, 119 insertions(+), 12 deletions(-)
 create mode 100644 scripts/prepare_tts_embeds.py

diff --git a/executorch-custom/jni_layer_tts.cpp b/executorch-custom/jni_layer_tts.cpp
index a6cc15d..04fabdc 100644
--- a/executorch-custom/jni_layer_tts.cpp
+++ b/executorch-custom/jni_layer_tts.cpp
@@ -839,21 +839,24 @@ ExecuTorchJni::runTtsPipelineImpl(
         for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
         cb0Hist.push_back(curCb0);
 
-        // Next embed: use pre-computed trailing embeds (codec_sum+text from Python)
-        // then codec_sum + eos/pad after trailing exhausted
+        // Next embed: codec_sum + (trailing text / eos / pad)
         float nextEmb[DIM]={};
+        // Always add codec embeddings from our codes
+        const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
+        for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
+        for(int cb=0;cb<15;cb++){
+            const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
+            for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
+        }
+        // Add trailing text embed, then eos once, then pad
         if(trIdx<nTrailing){
-            memcpy(nextEmb,trailing.data()+trIdx*DIM,DIM*4);
+            const float*te=trailing.data()+trIdx*DIM;
+            for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
             trIdx++;
+        } else if(trIdx==nTrailing){
+            for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmb[k]; trIdx++;
         } else {
-            const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
-            for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
-            for(int cb=0;cb<15;cb++){
-                const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
-                for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
-            }
-            if(trIdx==nTrailing){for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmb[k];trIdx++;}
-            else {for(int k=0;k<DIM;k++) nextEmb[k]+=padEmb[k];}
+            for(int k=0;k<DIM;k++) nextEmb[k]+=padEmb[k];
         }
 
         auto tt0=std::chrono::high_resolution_clock::now();
diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
index 9b2ffa4..753f245 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@@ -2333,7 +2333,7 @@ class Qwen3TtsEngine(
                 talkerPteRotaryCos ?: FloatArray(0), talkerPteRotarySin ?: FloatArray(0),
                 cpRotaryCos ?: FloatArray(0), cpRotarySin ?: FloatArray(0),
                 ttsEosEmbed ?: FloatArray(TALKER_DIM), ttsPadEmbed ?: FloatArray(TALKER_DIM),
-                nTotal - nPrefill
+                maxOf(200, (nTotal - nPrefill) * 4)  // maxTokens: audio is ~3-4× longer than text
             )
             if (flat == null || flat.isEmpty()) return ShortArray(0)
             val nTokens = flat.size / NUM_CODEBOOKS
diff --git a/scripts/prepare_tts_embeds.py b/scripts/prepare_tts_embeds.py
new file mode 100644
index 0000000..93eefe8
--- /dev/null
+++ b/scripts/prepare_tts_embeds.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+"""
+Generate pre-computed TTS embeddings for any text.
+Run on PC, push result to tablet, then run pipeline.
+
+Usage: python3 prepare_tts_embeds.py "Your text here" [output.bin]
+       Then: adb push output.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin
+"""
+import sys, os, struct, warnings, types
+warnings.filterwarnings("ignore")
+TEXT = sys.argv[1] if len(sys.argv) > 1 else "Bonjour, je m'appelle Kazeia."
+# Resolve OUTPUT against the caller's directory before chdir, so a relative path stays where the user asked.
+OUTPUT = os.path.abspath(sys.argv[2]) if len(sys.argv) > 2 else "/tmp/tts_embeds.bin"
+os.chdir("/tmp")
+VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav"
+MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"
+
+import torch, numpy as np
+from qwen_tts import Qwen3TTSModel
+
+print(f"Text: '{TEXT[:80]}{'...' if len(TEXT)>80 else ''}'")
+print("Loading model...")
+tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu")
+talker = tts.model.talker
+
+# Monkey-patch the talker's inner model forward() to capture its inputs
+# (avoids HuggingFace generate() validation issues with forward patch)
+captured_inputs = []
+original_model_forward = talker.model.forward
+# Hook: record single-token inputs only; .float() first since numpy has no bfloat16.
+def patched_model_forward(input_ids=None, inputs_embeds=None, **kwargs):
+    if inputs_embeds is not None and inputs_embeds.shape[1] == 1:
+        captured_inputs.append(inputs_embeds[0, 0, :].detach().float().cpu().numpy().astype(np.float32))
+    return original_model_forward(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
+
+talker.model.forward = patched_model_forward
+
+print("Generating voice clone...")
+audio_list, sr = tts.generate_voice_clone(
+    text=TEXT, ref_audio=VOICE, language="french",
+    x_vector_only_mode=True, non_streaming_mode=True,
+)
+audio = audio_list[0]
+print(f"Audio: {len(audio)/sr:.2f}s, {len(captured_inputs)} generation steps captured")
+# Fewer than two captured steps leaves nothing to write as decode rows below.
+if len(captured_inputs) < 2:
+    print("ERROR: Not enough generation steps captured")
+    sys.exit(1)
+
+# Build embeds file
+# captured_inputs holds only single-token (generation-step) inputs: the
+# multi-token prefill call is skipped by the hook, so the prefill rows must
+# come from elsewhere. They are taken from an existing reference embeds file.
+
+# The first captured input is the FIRST generation step input
+# (after prefill is done, the model starts generating codec tokens);
+# it is written as the 10th prefill row when the output file is built.
+
+# Load existing prefill from reference file
+EXISTING = "/tmp/existing_embeds.bin"
+if not os.path.exists(EXISTING):
+    # Pull from tablet
+    os.system(f"adb pull /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin {EXISTING}")
+
+if os.path.exists(EXISTING):
+    with open(EXISTING, "rb") as f:
+        nP, nT = struct.unpack("<ii", f.read(8))  # header: nPrefill, nTotal (little-endian int32)
+        # Only the first 9 rows (role+ctrl+spk+bos) are reused, so read no more than that.
+        prefill_embeds = [np.frombuffer(f.read(1024*4), dtype=np.float32).copy() for _ in range(min(nT, 9))]
+    if len(prefill_embeds) != 9 or any(e.size != 1024 for e in prefill_embeds):
+        print("ERROR: reference embeds file has fewer than 9 complete prefill rows")
+        sys.exit(1)
+else:
+    print("WARNING: No existing embeds file, prefill will be zeros")
+    prefill_embeds = [np.zeros(1024, dtype=np.float32)] * 9
+
+# Output layout: int32 nPrefill, int32 nTotal (little-endian), then the
+# embedding rows back to back: prefill rows first, decode rows after.
+nPrefill = 10  # 9 role/ctrl/spk/bos + first gen embed
+nDecode = len(captured_inputs) - 1
+nTotal = nPrefill + nDecode
+first_step, later_steps = captured_inputs[0], captured_inputs[1:]
+
+with open(OUTPUT, "wb") as out:
+    out.write(struct.pack("<ii", nPrefill, nTotal))
+    # Prefill rows: the rows reused from the reference file,
+    # followed by the first captured generation step
+    out.write(b"".join(row.tobytes() for row in prefill_embeds))
+    out.write(first_step.tobytes())
+    # Decode rows: every remaining captured step, in capture order
+    out.write(b"".join(step.tobytes() for step in later_steps))
+
+print(f"\nSaved: {OUTPUT}")
+print(f"  {nPrefill} prefill + {nDecode} decode = {nTotal} total")
+print(f"  {os.path.getsize(OUTPUT)/1024:.0f} KB")
+print(f"  Audio: {len(audio)/sr:.2f}s ({len(captured_inputs)} tokens)")
+print(f"\nPush to tablet:")
+print(f"  adb push {OUTPUT} /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin")
+
+# Also save reference audio; splitext keeps ref_path distinct from OUTPUT even without a .bin suffix
+import soundfile as sf
+ref_path = os.path.splitext(OUTPUT)[0] + '_ref.wav'
+sf.write(ref_path, audio, sr)
+print(f"  Reference audio: {ref_path}")