kazeia/scripts/export_tts_text_embeddings.py

#!/usr/bin/env python3
"""
Export everything the tablet needs to build TTS prefill embeds for arbitrary
LLM text, offline, without talking to a PC.

Outputs (pushed to /data/local/tmp/kazeia/models/qwen3-tts-npu/):
  - text_embeds_full_fp16.bin    : 151936 × 1024 fp16 ≈ 311 MB (≈ 297 MiB)
       Raw fp16 rows with no header; row i starts at byte offset i × 2048.
       Pre-projected text embeddings for the full Qwen3 vocab. Per-token
       lookup on-device replaces a lookup + FC1 + SiLU + FC2 + bias. Same
       numbers PyTorch produces for text_projection(text_embedding(id)).

  - damien_voice_prefix.bin      : 9 × 1024 fp32 = 36 KB (+ 8-byte header)
       File layout: two little-endian int32 (rows, cols), then the fp32 rows.
       The fixed voice-cloning prefix (positions 0..8) for speaker Damien,
       captured from a real voice-clone run. Positions 0..6 = role/control
       tokens, position 7 = xvector (L2 norm ~10), position 8 = trailing
       voice-marker. Same for every phrase uttered by this speaker, so we
       capture once here and reuse indefinitely on-device.

  - damien_voice_suffix.bin      : 2 × 1024 fp32 = 8 KB (+ 8-byte header)
       File layout: two little-endian int32 (rows, cols), then the fp32 rows.
       The fixed voice-cloning SUFFIX (last 2 positions of the prefill)
       that Python emits AFTER the text tokens. Verified bit-identical
       across segments of different texts → invariant closure marker
       for the voice-clone conditioning. Without it the talker misreads
       the end of the text and produces garbled output.

  - qwen3_tokenizer/             : tokenizer files copied from HF snapshot
       tokenizer.json, vocab.json, merges.txt, tokenizer_config.json,
       special_tokens_map.json (any file missing from the snapshot is skipped).
       Kotlin BPE implementation reads vocab + merges at init.

The combination lets the tablet build, for any text, the exact same
prefill tensor PyTorch would build, bit-for-bit at fp16 — which is
what our Hexagon talker consumes anyway.

Usage:
    python3 export_tts_text_embeddings.py [output_dir]
"""
import sys, os, struct, shutil, warnings
# Resolve the output directory against the caller's working directory BEFORE
# the chdir below. Otherwise a relative `output_dir` argument would silently
# be created under /tmp instead of where the user asked for it.
OUTPUT_DIR = os.path.abspath(
    sys.argv[1] if len(sys.argv) > 1 else "/tmp/kazeia_tts_export"
)

# The rest of the script runs with /tmp as its working directory.
# NOTE(review): the reason for this chdir is not visible in this file — confirm
# whether it is still needed.
os.chdir("/tmp")
# Suppress every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")

# Local Hugging Face snapshot directory: model weights are loaded from here and
# the tokenizer files are copied out of it.
MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"
# Reference recording passed as `ref_audio` to the voice-clone run.
VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav"

# Create the export directory and its tokenizer sub-directory up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "qwen3_tokenizer"), exist_ok=True)

import torch, numpy as np
from qwen_tts import Qwen3TTSModel

# Load the TTS model from the local snapshot directory only
# (local_files_only=True), placed on CPU.
print("Loading Qwen3-TTS model (~30s, CPU)...")
tts = Qwen3TTSModel.from_pretrained(
    MODEL,
    device_map="cpu",
    local_files_only=True,
)
# The talker sub-module: later sections read its embedding / projection
# weights and temporarily hook its inner model's forward().
talker = tts.model.talker

# ---- 1. Full projected text embeddings ----
# Evaluate FC2(SiLU(FC1(text_embedding.weight))) for EVERY vocab entry and
# stream the result to disk as raw fp16 rows (no header).
# Batching keeps peak memory bounded; the whole projection is computed in
# fp32 and only the final store is quantized to fp16.
print("\n[1/3] Precomputing projected embeddings for full vocab...")
vocab_size = talker.model.text_embedding.weight.shape[0]
print(f"  Vocab size: {vocab_size}")
BATCH = 4096
out_path = f"{OUTPUT_DIR}/text_embeds_full_fp16.bin"


def _fp32(t):
    """Return tensor *t* upcast to float32, passing None through unchanged."""
    return None if t is None else t.float()


with torch.no_grad():
    W_emb = talker.model.text_embedding.weight               # [vocab, 2048]
    # Upcast the projection parameters as well as the inputs: mixing an fp32
    # input with lower-precision weights raises a dtype mismatch in
    # F.linear. On an fp32 model .float() is a no-op.
    fc1_w = _fp32(talker.text_projection.linear_fc1.weight)  # [2048, 2048]
    fc1_b = _fp32(talker.text_projection.linear_fc1.bias)    # [2048]
    fc2_w = _fp32(talker.text_projection.linear_fc2.weight)  # [1024, 2048]
    fc2_b = _fp32(talker.text_projection.linear_fc2.bias)    # [1024]
    with open(out_path, "wb") as f:
        for start in range(0, vocab_size, BATCH):
            end = min(start + BATCH, vocab_size)
            x = W_emb[start:end].float()                          # [b, 2048]
            h = torch.nn.functional.linear(x, fc1_w, fc1_b)       # [b, 2048]
            h = torch.nn.functional.silu(h)                       # [b, 2048]
            y = torch.nn.functional.linear(h, fc2_w, fc2_b)       # [b, 1024]
            # Explicit little-endian fp16, matching the "<i" headers used by
            # the other export files; no copy is made on little-endian hosts.
            f.write(y.to(torch.float16).numpy().astype("<f2", copy=False).tobytes())
            # Report progress every fourth batch.
            if start % (BATCH * 4) == 0:
                print(f"  {end}/{vocab_size}  ({end*100//vocab_size}%)", flush=True)
# Size is reported in MiB (bytes / 1024^2) even though the label says MB.
sz_mb = os.path.getsize(out_path) / (1024*1024)
print(f"  -> {out_path} ({sz_mb:.1f} MB)")

# Sanity check: read one stored row back from disk and compare it against a
# live projection of the same token id through the model. The result is only
# printed; a large difference does not abort the script.
print("\n  Sanity check (token 1043 = 'Bonjour'):")
with torch.no_grad():
    probe_ids = torch.tensor([1043])
    probe_embed = talker.model.text_embedding(probe_ids)
    live_row = talker.text_projection(probe_embed)[0].float().numpy()
# Row 1043 starts at 1043 rows × 1024 values × 2 bytes; read 1024 fp16 values.
stored_row = np.fromfile(
    out_path,
    dtype=np.float16,
    count=1024,
    offset=1043 * 1024 * 2,
).astype(np.float32)
max_abs_err = float(np.max(np.abs(live_row - stored_row)))
print(f"    max abs diff live vs stored fp16: {max_abs_err:.2e}  (expect < 1e-3)")

# ---- 2. Damien voice prefix (positions 0..8) ----
# Run a voice-clone and capture the first call that passes a 3-D
# `inputs_embeds` (the prefill), then keep its first 9 and last 2 rows.
# Those are fixed per speaker — same for every phrase — so one capture
# suffices for the app's lifetime.
print(f"\n[2/3] Capturing Damien voice prefix from {VOICE}...")
PREFIX_LEN = 9
SUFFIX_LEN = 2
prefill_capture = []  # receives exactly one [nP, hidden] fp32 array
original_forward = talker.model.forward


def patched(input_ids=None, inputs_embeds=None, **kwargs):
    """Record the first 3-D `inputs_embeds`, then defer to the real forward.

    Only the first qualifying call is kept: later calls are not needed for
    the prefix/suffix, and accumulating them would let row indices run
    across call boundaries.
    """
    if not prefill_capture and inputs_embeds is not None and inputs_embeds.dim() == 3:
        # Batch element 0 only; .copy() detaches the snapshot from the
        # tensor's storage so later in-place edits cannot alter it.
        prefill_capture.append(
            inputs_embeds[0].detach().float().cpu().numpy().copy()
        )
    return original_forward(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)


talker.model.forward = patched
try:
    # Any short sentence works — we only keep positions 0..8 (and the last
    # two), which are text-invariant.
    _ = tts.generate_voice_clone(
        text="Bonjour, je suis Kazeia.", ref_audio=VOICE, language="french",
        x_vector_only_mode=True, non_streaming_mode=True,
    )
finally:
    # Always undo the monkeypatch, even if generation raised.
    talker.model.forward = original_forward

if not prefill_capture:
    raise RuntimeError(
        "voice-clone run never called talker.model.forward with 3-D inputs_embeds; "
        "nothing was captured"
    )
prefill = prefill_capture[0]
nP = prefill.shape[0]
print(f"  Prefill size: {nP} tokens")
if nP < PREFIX_LEN + SUFFIX_LEN:
    raise RuntimeError(
        f"prefill has only {nP} rows; need at least {PREFIX_LEN + SUFFIX_LEN} "
        "to slice a non-overlapping prefix and suffix"
    )
prefix_9 = prefill[:PREFIX_LEN].copy()       # first 9 rows of the prefill
suffix_2 = prefill[nP - SUFFIX_LEN:].copy()  # last 2 rows of the prefill

def _write_embed_rows(path, rows):
    """Write a 2-D array to *path* as a small self-describing binary file.

    Layout: int32 row count, int32 column count (both little-endian),
    followed by the rows as little-endian float32 in row-major order.
    The header is taken from the array's actual shape so it can never
    disagree with the payload.
    """
    rows = np.asarray(rows)
    n_rows, n_cols = rows.shape
    with open(path, "wb") as f:
        f.write(struct.pack("<ii", n_rows, n_cols))
        f.write(rows.astype("<f4").tobytes())


prefix_path = f"{OUTPUT_DIR}/damien_voice_prefix.bin"
_write_embed_rows(prefix_path, prefix_9)
print(f"  prefix -> {prefix_path} ({os.path.getsize(prefix_path)} bytes)")

suffix_path = f"{OUTPUT_DIR}/damien_voice_suffix.bin"
_write_embed_rows(suffix_path, suffix_2)
print(f"  suffix -> {suffix_path} ({os.path.getsize(suffix_path)} bytes)")

# Per-row L2 norms, printed so the operator can eyeball the capture against
# the expected magnitudes quoted in the message below.
norms_pref = [float(np.linalg.norm(row)) for row in prefix_9]
norms_suff = [float(np.linalg.norm(row)) for row in suffix_2]
print(f"  Prefix norms: {[f'{n:.2f}' for n in norms_pref]}  (pos 7 = xvector ~10, others ~1.6-1.8)")
print(f"  Suffix norms: {[f'{n:.2f}' for n in norms_suff]}")

# ---- 3. Tokenizer files ----
# Mirror the HF tokenizer artefacts from the model snapshot into the export
# directory so a Kotlin BPE can reproduce Python encode() bit-for-bit.
# Files absent from the snapshot are reported and skipped, not treated as
# errors.
print(f"\n[3/3] Copying tokenizer to {OUTPUT_DIR}/qwen3_tokenizer/...")
TOKENIZER_FILES = (
    "tokenizer.json",
    "vocab.json",
    "merges.txt",
    "tokenizer_config.json",
    "special_tokens_map.json",
)
tokenizer_dir = f"{OUTPUT_DIR}/qwen3_tokenizer"
for filename in TOKENIZER_FILES:
    snapshot_file = os.path.join(MODEL, filename)
    if not os.path.exists(snapshot_file):
        print(f"  (skipped, not present: {filename})")
        continue
    shutil.copy(snapshot_file, f"{tokenizer_dir}/{filename}")
    print(f"  {filename} ({os.path.getsize(snapshot_file)} bytes)")

# Final summary: where the artefacts are and the adb commands that deploy
# them. Every exported artefact is listed — including the voice suffix, which
# the module docstring says the talker needs to avoid garbled output.
DEVICE_DIR = "/data/local/tmp/kazeia/models/qwen3-tts-npu/"
print("\n=== DONE ===")
print(f"Files ready in {OUTPUT_DIR}/")
print("\nPush to tablet:")
for artefact in (
    "text_embeds_full_fp16.bin",
    "damien_voice_prefix.bin",
    "damien_voice_suffix.bin",
    "qwen3_tokenizer",
):
    print(f"  adb push {OUTPUT_DIR}/{artefact} {DEVICE_DIR}")