#!/usr/bin/env python3 """ Generate text-only TTS embeddings for FULL C++ native pipeline. No Python model generation needed — just tokenize + text_projection. Usage: python3 prepare_tts_native.py "Your text here" [output.bin] adb push output.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin Mirrors Python qwen_tts protocol exactly: trailing = text_proj[1:] (no eos padding — C++ adds 1×eos then pad_embed itself) Stop = natural codec_eos_token_id (handled in C++) """ import sys, os, struct, warnings os.chdir("/tmp") warnings.filterwarnings("ignore") TEXT = sys.argv[1] if len(sys.argv) > 1 else "Bonjour, je m'appelle Kazeia." OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/tts_native.bin" GOLDEN_PREFILL = "/tmp/existing_embeds.bin" MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc" MAX_SEGMENT_TOKENS = 20 # ~70 audio steps + prefill = ~80, fits KV_LEN=100 with margin import torch, numpy as np, re from qwen_tts import Qwen3TTSModel print(f"Text: '{TEXT[:80]}{'...' if len(TEXT)>80 else ''}'") tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu") talker = tts.model.talker tokenizer = tts.processor.tokenizer # Load golden prefill + codec/eos if not os.path.exists(GOLDEN_PREFILL): os.system(f"adb pull /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin {GOLDEN_PREFILL}") with open(GOLDEN_PREFILL, "rb") as f: nP = struct.unpack(" max_tokens and current: segments.append(current.strip()) current = part else: current = test if current.strip(): segments.append(current.strip()) return [s for s in segments if s.strip()] def make_segment(text_segment): """Build embeds for one segment. Mirrors Python qwen_tts: trailing = text_proj[1:] (no padding). C++ then adds 1×eos after exhausting trailing, then pad_embed, and stops on natural EOS. """ tokens = tokenizer.encode(text_segment, add_special_tokens=False) with torch.no_grad(): proj = talker.text_projection( talker.get_text_embeddings()(torch.tensor([tokens])) )[0].numpy().astype(np.float32) trailing = [proj[i] for i in range(1, len(proj))] # text[1:], no eos here return { 'tokens': len(tokens), 'proj0': proj[0], 'trailing': trailing, } # Split text into segments segments = split_text(TEXT, MAX_SEGMENT_TOKENS) print(f"Segments: {len(segments)}") for i, s in enumerate(segments): n = len(tokenizer.encode(s, add_special_tokens=False)) print(f" [{i}] ({n} tok) '{s[:60]}{'...' if len(s)>60 else ''}'") # Generate embeds per segment seg_data = [make_segment(s) for s in segments] if len(seg_data) == 1: # Single segment: legacy format s = seg_data[0] nPrefill = 10 nTotal = nPrefill + len(s['trailing']) with open(OUTPUT, "wb") as f: f.write(struct.pack("