#!/usr/bin/env python3
"""
Export everything the tablet needs to build TTS prefill embeds for arbitrary
LLM text, offline, without talking to a PC.

Outputs (pushed to /data/local/tmp/kazeia/models/qwen3-tts-npu/):

  - text_embeds_full_fp16.bin : 151936 × 1024 fp16 = 311 MB
      Pre-projected text embeddings for the full Qwen3 vocab. Per-token
      lookup on-device replaces a lookup + FC1 + SiLU + FC2 + bias. Same
      numbers PyTorch produces for text_projection(text_embedding(id)).

  - damien_voice_prefix.bin : 9 × 1024 fp32 = 36 KB
      The fixed voice-cloning prefix (positions 0..8) for speaker Damien,
      captured from a real voice-clone run. Positions 0..6 = role/control
      tokens, position 7 = xvector (L2 norm ~10), position 8 = trailing
      voice-marker. Same for every phrase uttered by this speaker, so we
      capture once here and reuse indefinitely on-device.

  - damien_voice_suffix.bin : 2 × 1024 fp32 = 8 KB
      The fixed voice-cloning SUFFIX (last 2 positions of the prefill) that
      Python emits AFTER the text tokens. Verified bit-identical across
      segments of different texts → invariant closure marker for the
      voice-clone conditioning. Without it the talker misreads the end of
      the text and produces garbled output.

  - qwen3_tokenizer/ : tokenizer files copied from HF snapshot
      tokenizer.json, vocab.json, merges.txt, tokenizer_config.json,
      special_tokens_map.json (each copied only when present in the
      snapshot). Kotlin BPE implementation reads vocab + merges at init.

The combination lets the tablet build, for any text, the exact same prefill
tensor PyTorch would build, bit-for-bit at fp16 — which is what our Hexagon
talker consumes anyway.

Usage:
    python3 export_tts_text_embeddings.py [output_dir]
"""
import sys
import os
import struct
import shutil
import warnings

# These run before the heavy imports below on purpose: the warning filter has
# to be installed before torch / qwen_tts are imported to silence their
# import-time warnings.
os.chdir("/tmp")
warnings.filterwarnings("ignore")

# Resolved AFTER the chdir, so a relative argument still lands under /tmp as
# it always did; making it absolute keeps the paths printed at the end
# (including the adb commands) usable from whatever directory the user is in.
OUTPUT_DIR = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else "/tmp/kazeia_tts_export")
MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"
VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav"

# Number of leading / trailing prefill rows kept as the speaker prefix/suffix.
PREFIX_ROWS = 9
SUFFIX_ROWS = 2

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/qwen3_tokenizer", exist_ok=True)

import torch
import numpy as np
from qwen_tts import Qwen3TTSModel

print("Loading Qwen3-TTS model (~30s, CPU)...")
tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu")
talker = tts.model.talker

# ---- 1. Full projected text embeddings ----
# Evaluate text_projection(text_embedding.weight) for EVERY vocab entry.
# Batching keeps peak memory bounded; fp32 matmul then fp16 store preserves
# precision up to the final quantization step.
print("\n[1/3] Precomputing projected embeddings for full vocab...")
vocab_size = talker.model.text_embedding.weight.shape[0]
print(f" Vocab size: {vocab_size}")

BATCH = 4096
out_path = f"{OUTPUT_DIR}/text_embeds_full_fp16.bin"

with torch.no_grad():
    W_emb = talker.model.text_embedding.weight          # [vocab, 2048]
    fc1_w = talker.text_projection.linear_fc1.weight    # [2048, 2048]
    fc1_b = talker.text_projection.linear_fc1.bias      # [2048]
    fc2_w = talker.text_projection.linear_fc2.weight    # [1024, 2048]
    fc2_b = talker.text_projection.linear_fc2.bias      # [1024]

    # The file is headerless: row i starts at byte i * 1024 * 2.
    with open(out_path, "wb") as f:
        for start in range(0, vocab_size, BATCH):
            end = min(start + BATCH, vocab_size)
            x = W_emb[start:end].float()                     # [b, 2048]
            h = torch.nn.functional.linear(x, fc1_w, fc1_b)  # [b, 2048]
            h = torch.nn.functional.silu(h)                  # [b, 2048]
            y = torch.nn.functional.linear(h, fc2_w, fc2_b)  # [b, 1024]
            f.write(y.to(torch.float16).numpy().tobytes())
            # Progress line every 4th batch only.
            if start % (BATCH * 4) == 0:
                print(f" {end}/{vocab_size} ({end*100//vocab_size}%)", flush=True)

sz_mb = os.path.getsize(out_path) / (1024*1024)
print(f" -> {out_path} ({sz_mb:.1f} MB)")

# Sanity check: re-read one token's row from disk, project it live through
# the module itself, and report the difference. Informational only — the
# script does not abort on a large diff.
print("\n Sanity check (token 1043 = 'Bonjour'):")
with torch.no_grad():
    live = talker.text_projection(talker.model.text_embedding(torch.tensor([1043])))[0].float().numpy()
with open(out_path, "rb") as f:
    f.seek(1043 * 1024 * 2)
    stored = np.frombuffer(f.read(1024 * 2), dtype=np.float16).astype(np.float32)
diff = float(np.abs(live - stored).max())
print(f" max abs diff live vs stored fp16: {diff:.2e} (expect < 1e-3)")

# ---- 2. Damien voice prefix (positions 0..8) and suffix (last 2) ----
# Run a voice-clone and capture the multi-token prefill call, then keep the
# first 9 and last 2 rows. Those are fixed per speaker — same for every
# phrase — so one capture suffices for the app's lifetime.
print(f"\n[2/3] Capturing Damien voice prefix from {VOICE}...")

# Holds at most one array: the embeddings of the first 3-D forward call,
# which this script treats as the prefill. Later calls are passed through
# untouched, since nothing downstream reads them.
captured_prefill = []
original_forward = talker.model.forward


def patched(input_ids=None, inputs_embeds=None, **kwargs):
    """Record the first 3-D ``inputs_embeds`` seen, then defer to the real forward."""
    if not captured_prefill and inputs_embeds is not None and inputs_embeds.dim() == 3:
        # Batch element 0 only; astype() makes an fp32 copy detached from torch.
        captured_prefill.append(inputs_embeds[0].detach().cpu().numpy().astype(np.float32))
    return original_forward(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)


talker.model.forward = patched
try:
    # Any short sentence works — we only keep the leading/trailing positions,
    # which are text-invariant.
    _ = tts.generate_voice_clone(
        text="Bonjour, je suis Kazeia.",
        ref_audio=VOICE,
        language="french",
        x_vector_only_mode=True,
        non_streaming_mode=True,
    )
finally:
    # Always undo the monkey-patch, even if generation fails.
    talker.model.forward = original_forward

if not captured_prefill:
    raise RuntimeError("No prefill captured: talker.model.forward was never called with 3-D inputs_embeds")

prefill = captured_prefill[0]
nP = prefill.shape[0]
print(f" Prefill size: {nP} tokens")
if nP < PREFIX_ROWS + SUFFIX_ROWS:
    raise RuntimeError(
        f"Prefill has only {nP} rows; need at least {PREFIX_ROWS + SUFFIX_ROWS} "
        "to extract a non-overlapping voice prefix and suffix"
    )

prefix_9 = prefill[:PREFIX_ROWS]          # [9, 1024]
suffix_2 = prefill[nP - SUFFIX_ROWS:nP]   # [2, 1024]


def write_voice_rows(path, rows):
    """Write ``rows`` (2-D float32 array) to ``path`` and print the file size.

    NOTE(review): the original ``struct.pack("<...")`` call was truncated in
    the copy of the source this was rebuilt from. The layout used here — two
    little-endian uint32 values (row count, row width) followed by the raw
    fp32 rows — is a reconstruction. Confirm it against the on-device reader
    before shipping files produced by this script.
    """
    with open(path, "wb") as f:
        f.write(struct.pack("<II", rows.shape[0], rows.shape[1]))
        f.write(rows.tobytes())
    # Size is read after the file is closed so it reflects flushed content.
    print(f" -> {path} ({os.path.getsize(path)} bytes)")


prefix_path = f"{OUTPUT_DIR}/damien_voice_prefix.bin"
write_voice_rows(prefix_path, prefix_9)

suffix_path = f"{OUTPUT_DIR}/damien_voice_suffix.bin"
write_voice_rows(suffix_path, suffix_2)

norms_pref = [float(np.linalg.norm(row)) for row in prefix_9]
norms_suff = [float(np.linalg.norm(row)) for row in suffix_2]
print(f" Prefix norms: {[f'{n:.2f}' for n in norms_pref]} (pos 7 = xvector ~10, others ~1.6-1.8)")
print(f" Suffix norms: {[f'{n:.2f}' for n in norms_suff]}")

# ---- 3. Tokenizer files ----
# Copy the HF tokenizer artefacts so a Kotlin BPE can reproduce Python
# encode() bit-for-bit. Files missing from the snapshot are skipped with a
# message rather than treated as an error.
print(f"\n[3/3] Copying tokenizer to {OUTPUT_DIR}/qwen3_tokenizer/...")
for name in ("tokenizer.json", "vocab.json", "merges.txt",
             "tokenizer_config.json", "special_tokens_map.json"):
    src = os.path.join(MODEL, name)
    if os.path.exists(src):
        shutil.copy(src, f"{OUTPUT_DIR}/qwen3_tokenizer/{name}")
        print(f" {name} ({os.path.getsize(src)} bytes)")
    else:
        print(f" (skipped, not present: {name})")

print("\n=== DONE ===")
print(f"Files ready in {OUTPUT_DIR}/")
print("\nPush to tablet:")
print(f" adb push {OUTPUT_DIR}/text_embeds_full_fp16.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/")
print(f" adb push {OUTPUT_DIR}/damien_voice_prefix.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/")
# The suffix is required on-device as well; it was missing from this list.
print(f" adb push {OUTPUT_DIR}/damien_voice_suffix.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/")
print(f" adb push {OUTPUT_DIR}/qwen3_tokenizer /data/local/tmp/kazeia/models/qwen3-tts-npu/")