From f6df1738c5f5d04d5793ef87a7dac07651f153c0 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Thu, 9 Apr 2026 14:05:42 +0200 Subject: [PATCH] Add prepare_tts_embeds.py for any text + codec_sum fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - prepare_tts_embeds.py: generates pre-computed embeddings from any text via Python generate_voice_clone, capturing talker inputs - C++ pipeline: always build codec_sum + trailing (not as-is) - maxTokens: 4× trailing count (audio >> text tokens) - Long text tested: 224 Python tokens → 125 NPU tokens (10s audio) - Text-only embeds don't work (model needs Python pre-computed codec_sum) Usage: python3 scripts/prepare_tts_embeds.py "Your text" output.bin adb push output.bin /data/local/tmp/.../full_pipeline_embeds.bin Co-Authored-By: Claude Opus 4.6 (1M context) --- executorch-custom/jni_layer_tts.cpp | 25 +++-- .../java/com/kazeia/tts/Qwen3TtsEngine.kt | 2 +- scripts/prepare_tts_embeds.py | 104 ++++++++++++++++++ 3 files changed, 119 insertions(+), 12 deletions(-) create mode 100644 scripts/prepare_tts_embeds.py diff --git a/executorch-custom/jni_layer_tts.cpp b/executorch-custom/jni_layer_tts.cpp index a6cc15d..04fabdc 100644 --- a/executorch-custom/jni_layer_tts.cpp +++ b/executorch-custom/jni_layer_tts.cpp @@ -839,21 +839,24 @@ ExecuTorchJni::runTtsPipelineImpl( for(int i=0;i 1 else "Bonjour, je m'appelle Kazeia." +OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/tts_embeds.bin" +VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav" +MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc" + +import torch, numpy as np +from qwen_tts import Qwen3TTSModel + +print(f"Text: '{TEXT[:80]}{'...' if len(TEXT)>80 else ''}'") +print("Loading model...") +tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu") +talker = tts.model.talker + +# Monkey-patch the talker's inner model __call__ to capture inputs +# (avoids HuggingFace generate() validation issues with forward patch) +captured_inputs = [] +original_model_forward = talker.model.forward + +def patched_model_forward(input_ids=None, inputs_embeds=None, **kwargs): + if inputs_embeds is not None and inputs_embeds.shape[1] == 1: + captured_inputs.append(inputs_embeds[0, 0, :].detach().cpu().numpy().astype(np.float32)) + return original_model_forward(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs) + +talker.model.forward = patched_model_forward + +print("Generating voice clone...") +audio_list, sr = tts.generate_voice_clone( + text=TEXT, ref_audio=VOICE, language="french", + x_vector_only_mode=True, non_streaming_mode=True, +) +audio = audio_list[0] +print(f"Audio: {len(audio)/sr:.2f}s, {len(captured_inputs)} generation steps captured") + +if len(captured_inputs) < 2: + print("ERROR: Not enough generation steps captured") + sys.exit(1) + +# Build embeds file +# Prefill: first 10 captured are prefill steps, rest are decode +# Actually, captured_inputs only has single-token inputs (generation, not prefill) +# We need the prefill embeddings too. Load from existing structure. + +# The first captured input is the FIRST generation step input +# (after prefill is done, the model starts generating codec tokens) +# Prefill inputs are multi-token and not captured + +# Load existing prefill from reference file +EXISTING = "/tmp/existing_embeds.bin" +if not os.path.exists(EXISTING): + # Pull from tablet + os.system(f"adb pull /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin {EXISTING}") + +if os.path.exists(EXISTING): + with open(EXISTING, "rb") as f: + nP = struct.unpack("