kazeia/scripts/prepare_tts_native.py

#!/usr/bin/env python3
"""
Generate text-only TTS embeddings for FULL C++ native pipeline.
No Python model generation needed — just tokenize + text_projection.

Usage: python3 prepare_tts_native.py "Your text here" [output.bin]
       adb push output.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin

Formula: trailing = text_proj[1:] + eos padding, up to max(int(n_tokens × 3.5), 50) entries total
         maxTokens = trailing_count (cut after trailing exhausted)
"""
import os
import struct
import subprocess
import sys
import warnings
# Command-line configuration.
#   argv[1]: text to synthesize (optional, defaults to a short French phrase)
#   argv[2]: output file path (optional, defaults to /tmp/tts_native.bin)
TEXT = sys.argv[1] if len(sys.argv) > 1 else "Bonjour, je m'appelle Kazeia."
# Resolve a user-supplied output path against the invoking directory *before*
# the chdir below. Otherwise a relative name such as "output.bin" would be
# written under /tmp, and the "adb push" hint printed at the end would point
# at a file that does not exist from the caller's directory.
OUTPUT = os.path.abspath(sys.argv[2]) if len(sys.argv) > 2 else "/tmp/tts_native.bin"
GOLDEN_PREFILL = "/tmp/existing_embeds.bin"  # Must exist (captured on-device once)
MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"

# The rest of the script works from /tmp and silences all Python warnings.
os.chdir("/tmp")
warnings.filterwarnings("ignore")

import torch, numpy as np
from qwen_tts import Qwen3TTSModel

# Echo the input text; anything past 80 characters is replaced by "...".
_ellipsis = "..." if len(TEXT) > 80 else ""
print(f"Text: '{TEXT[:80]}{_ellipsis}'")

# Load model (only its tokenizer and the talker's text projection are used).
tts = Qwen3TTSModel.from_pretrained(
    MODEL,
    device_map="cpu",
    local_files_only=True,
)
tokenizer = tts.processor.tokenizer
talker = tts.model.talker

# Tokenize + project
# The text is encoded without special tokens, embedded with the talker's text
# embedding table, and passed through text_projection. `proj` ends up as a
# float32 NumPy array with one row per token.
tokens = tokenizer.encode(TEXT, add_special_tokens=False)
if not tokens:
    # With no tokens there is no text[0] to combine with codec_bos, and the
    # embedding lookup would fail with an unrelated-looking tensor error.
    sys.exit("Error: text produced no tokens; nothing to synthesize.")
with torch.no_grad():
    token_ids = torch.tensor([tokens], dtype=torch.long)
    projected = talker.text_projection(talker.get_text_embeddings()(token_ids))[0]
    proj = projected.numpy().astype(np.float32)
print(f"Tokens: {len(tokens)}")

# Load golden prefill[0:9] (captured on-device, text-independent)
# File layout as read here: two little-endian int32 values (nP, nT) followed
# by nT vectors of 1024 float32 values each.
if not os.path.exists(GOLDEN_PREFILL):
    # Use an argument list (no shell) so the local path is never interpreted
    # by a shell, and check=True so a failed pull is reported here instead of
    # surfacing later as a missing-file error.
    try:
        subprocess.run(
            [
                "adb",
                "pull",
                "/data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin",
                GOLDEN_PREFILL,
            ],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as err:
        sys.exit(f"Error: could not pull golden prefill via adb: {err}")
with open(GOLDEN_PREFILL, "rb") as f:
    header = f.read(8)
    if len(header) != 8:
        sys.exit(f"Error: {GOLDEN_PREFILL} is too short to contain a header.")
    nP, nT = struct.unpack("<ii", header)
    golden = [np.frombuffer(f.read(1024 * 4), dtype=np.float32).copy() for _ in range(nT)]

# Only the first 9 vectors are consumed later; make sure they are complete so
# a truncated capture is caught before any output is produced.
if len(golden) < 9 or any(vec.size != 1024 for vec in golden[:9]):
    sys.exit(
        f"Error: {GOLDEN_PREFILL} does not hold 9 complete prefill embeddings "
        f"(header: nPrefill={nP}, nTotal={nT})."
    )

# Load codec_bos embedding
# `ce` is viewed as a table of 1024-wide rows; CODEC_BOS is the row index
# that gets added to the first text embedding when the file is built.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects.
# Keep it only if /tmp/ce.npy really is an object array from a trusted
# source; a plain numeric array loads without it.
CODEC_BOS = 2149
ce = np.load("/tmp/ce.npy", allow_pickle=True)
ce = ce.reshape(-1, 1024)

# Load eos embedding
# The special-token file must hold exactly 3 × 1024 values; row 1 is used as
# the eos padding vector.
sp = np.load("/tmp/tts_special.npy")
sp = sp.reshape(3, 1024)
eos = sp[1, :].astype(np.float32)

# Build trailing: text[1:] + eos padding
# Audio is ~3.5× longer than text tokens, so the trailing list is padded with
# the eos vector up to that length (never fewer than 50 entries, which covers
# short phrases).
AUDIO_RATIO = 3.5
target_len = max(50, int(AUDIO_RATIO * len(tokens)))

# Every projected token except the first; text[0] is placed in the prefill.
trailing = list(proj[1:])
trailing.extend([eos] * max(0, target_len - len(trailing)))

# Build file
# Layout (little-endian): int32 nPrefill, int32 nTotal, then nTotal vectors
# written back to back as float32:
#   golden prefill[0..8], prefill[9] = text[0] + codec_bos, then trailing.
nPrefill = 10  # 9 golden vectors + 1 text-dependent vector
nTotal = nPrefill + len(trailing)

if len(golden) < 9:
    sys.exit(f"Error: need 9 golden prefill embeddings, found {len(golden)}.")

# Prefill[9] = text[0] + codec_bos
prefill_last = proj[0] + ce[CODEC_BOS]

# Assemble everything before opening OUTPUT so that an indexing or shape error
# cannot leave a truncated file behind. np.stack rejects vectors of differing
# shape, and the float32 cast guarantees 4 bytes per element even if `ce` was
# stored with a wider dtype (the sum would otherwise be written as float64).
payload = np.stack([*golden[:9], prefill_last, *trailing]).astype(np.float32)

with open(OUTPUT, "wb") as f:
    f.write(struct.pack("<ii", nPrefill, nTotal))
    f.write(payload.tobytes())

# Summary: trailing composition, a rough duration estimate (0.08 s per
# trailing entry), the output size, and the command to deploy the file.
n_text = len(tokens) - 1
n_eos = len(trailing) - n_text
audio_est = 0.08 * len(trailing)
size_kb = os.path.getsize(OUTPUT) / 1024
print(f"Trailing: {len(trailing)} ({n_text} text + {n_eos} eos)")
print(f"Audio: ~{audio_est:.1f}s estimated")
print(f"Saved: {OUTPUT} ({size_kb:.0f}KB)")
print(f"\nadb push {OUTPUT} /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin")