#!/usr/bin/env python3 """ Generate text-only TTS embeddings for FULL C++ native pipeline. No Python model generation needed — just tokenize + text_projection. Usage: python3 prepare_tts_native.py "Your text here" [output.bin] adb push output.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin Formula: trailing = text_proj[1:] + eos_padding(n_tokens × 4 total) maxTokens = trailing_count (cut after trailing exhausted) """ import sys, os, struct, warnings os.chdir("/tmp") warnings.filterwarnings("ignore") TEXT = sys.argv[1] if len(sys.argv) > 1 else "Bonjour, je m'appelle Kazeia." OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/tts_native.bin" GOLDEN_PREFILL = "/tmp/existing_embeds.bin" # Must exist (captured on-device once) MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc" import torch, numpy as np from qwen_tts import Qwen3TTSModel print(f"Text: '{TEXT[:80]}{'...' if len(TEXT)>80 else ''}'") # Load model (just for tokenizer + text_projection) tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu") talker = tts.model.talker tokenizer = tts.processor.tokenizer # Tokenize + project tokens = tokenizer.encode(TEXT, add_special_tokens=False) with torch.no_grad(): proj = talker.text_projection( talker.get_text_embeddings()(torch.tensor([tokens])) )[0].numpy().astype(np.float32) print(f"Tokens: {len(tokens)}") # Load golden prefill[0:9] (captured on-device, text-independent) if not os.path.exists(GOLDEN_PREFILL): os.system(f"adb pull /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin {GOLDEN_PREFILL}") with open(GOLDEN_PREFILL, "rb") as f: nP = struct.unpack("