kazeia/scripts/prepare_tts_native.py

85 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Generate text-only TTS embeddings for FULL C++ native pipeline.
No Python model generation needed — just tokenize + text_projection.
Usage: python3 prepare_tts_native.py "Your text here" [output.bin]
adb push output.bin /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin
Formula: trailing = text_proj[1:], padded with eos up to max(int(n_tokens × 3.5), 50) entries in total
maxTokens = trailing_count (cut after trailing exhausted)
"""
import sys, os, struct, warnings
# --- Command-line arguments and fixed paths ---------------------------------
# TEXT: the sentence to synthesize (argv[1]); a French sample is the default.
TEXT = sys.argv[1] if len(sys.argv) > 1 else "Bonjour, je m'appelle Kazeia."
# OUTPUT: destination .bin (argv[2]). It is made absolute *before* the chdir
# below so that a relative name such as "output.bin" is created in the
# caller's directory (as the usage text implies) instead of silently landing
# in /tmp, and so that the `adb push` hint printed at the end is valid.
OUTPUT = os.path.abspath(sys.argv[2]) if len(sys.argv) > 2 else "/tmp/tts_native.bin"
GOLDEN_PREFILL = "/tmp/existing_embeds.bin"  # Must exist (captured on-device once)
# Local Hugging Face snapshot directory of the TTS model.
MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"

# NOTE(review): the reason for running from /tmp is not visible in this file;
# the chdir is kept, only moved after the path resolution above.
os.chdir("/tmp")
# Silence all Python warnings for the rest of the run.
warnings.filterwarnings("ignore")
import torch, numpy as np
from qwen_tts import Qwen3TTSModel
print(f"Text: '{TEXT[:80]}{'...' if len(TEXT)>80 else ''}'")

# Load the model on CPU from the local snapshot only (no network access).
# Only its tokenizer and the talker's text-embedding / text_projection layers
# are used below.
tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu")
talker = tts.model.talker
tokenizer = tts.processor.tokenizer

# Tokenize the raw text without special tokens.
tokens = tokenizer.encode(TEXT, add_special_tokens=False)
if not tokens:
    # Without this guard an empty id list fails much later (embedding lookup
    # or the proj[0] access when the file is written) with an obscure error.
    sys.exit("Error: the text produced no tokens; nothing to synthesize.")

# Embed the token ids and project them; gradients are never needed here.
with torch.no_grad():
    token_ids = torch.tensor([tokens], dtype=torch.long)  # batch of one
    embedded = talker.get_text_embeddings()(token_ids)
    # [0] drops the batch dimension added above. .float() is a no-op for a
    # float32 model and keeps .numpy() working if the weights were loaded in a
    # reduced-precision dtype that NumPy cannot represent.
    proj = talker.text_projection(embedded)[0].float().numpy().astype(np.float32)
print(f"Tokens: {len(tokens)}")
# Load the golden prefill vectors (captured on-device; per the original note
# they are text-independent). The file is read as:
#   int32 LE nPrefill, int32 LE nTotal, then nTotal vectors of 1024 float32.
import subprocess

if not os.path.exists(GOLDEN_PREFILL):
    # Argument list + check=True: the path is never parsed by a shell, and a
    # failed pull stops here instead of surfacing as a confusing open() error.
    subprocess.run(
        [
            "adb",
            "pull",
            "/data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin",
            GOLDEN_PREFILL,
        ],
        check=True,
    )

with open(GOLDEN_PREFILL, "rb") as f:
    header = f.read(8)
    if len(header) != 8:
        raise ValueError(f"{GOLDEN_PREFILL}: file too short to contain the 8-byte header")
    nP, nT = struct.unpack("<ii", header)
    golden = []
    for _ in range(nT):
        chunk = f.read(1024 * 4)
        if len(chunk) != 1024 * 4:
            # Truncated file: keep only complete vectors.
            break
        # .copy() detaches the array from the read-only bytes buffer.
        golden.append(np.frombuffer(chunk, dtype=np.float32).copy())

# The output writer copies the first 9 vectors; fail early and clearly if the
# capture does not contain them.
if len(golden) < 9:
    raise ValueError(
        f"{GOLDEN_PREFILL}: expected at least 9 complete prefill vectors, "
        f"found {len(golden)} (header says nTotal={nT})"
    )
# Codec embedding table, viewed as rows of 1024 values; row CODEC_BOS is the
# codec begin-of-sequence embedding added to the first text token later on.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects from
# the file; if ce.npy is a plain numeric array the flag can be dropped.
CODEC_BOS = 2149
ce = np.load("/tmp/ce.npy", allow_pickle=True)
ce = ce.reshape(-1, 1024)

# Special-token embeddings: exactly 3 rows of 1024 values; row 1 is used as
# the eos embedding (float32 copy).
sp = np.load("/tmp/tts_special.npy")
sp = sp.reshape(3, 1024)
eos = np.array(sp[1], dtype=np.float32)
# Trailing sequence: every projected text token after the first, followed by
# eos filler. The author's estimate is ~3.5 audio tokens per text token
# (slightly generous to avoid a cut), with a floor of 50 for short phrases.
AUDIO_RATIO = 3.5
target_len = int(len(tokens) * AUDIO_RATIO)
if target_len < 50:
    target_len = 50

trailing = list(proj[1:])  # text[1:]
missing = target_len - len(trailing)
if missing > 0:
    # The same eos vector is repeated; it is only read when written out.
    trailing.extend([eos] * missing)
# Build the output file.
# Layout: int32 LE nPrefill, int32 LE nTotal, then nTotal embeddings, each
# written as raw float32 values.
nPrefill = 10
nTotal = nPrefill + len(trailing)

if len(golden) < nPrefill - 1:
    raise ValueError(
        f"need {nPrefill - 1} golden prefill vectors, only {len(golden)} loaded"
    )


def _f32_bytes(vec):
    """Return *vec* serialized as raw float32 bytes."""
    return np.asarray(vec, dtype=np.float32).tobytes()


with open(OUTPUT, "wb") as f:
    f.write(struct.pack("<ii", nPrefill, nTotal))
    # Golden prefill[0..8]: the first 9 captured vectors, copied as-is.
    for vec in golden[:nPrefill - 1]:
        f.write(_f32_bytes(vec))
    # Prefill[9] = text[0] + codec_bos. The sum is cast explicitly: if ce.npy
    # is not float32, NumPy promotes the result and an uncast tobytes() would
    # write the wrong number of bytes, shifting every following vector.
    f.write(_f32_bytes(proj[0] + ce[CODEC_BOS]))
    # Trailing
    for e in trailing:
        f.write(_f32_bytes(e))

# Summary. NOTE(review): 0.08 s per trailing entry is the script's own
# estimate; confirm against the codec frame rate.
audio_est = len(trailing) * 0.08
print(f"Trailing: {len(trailing)} ({len(tokens)-1} text + {len(trailing)-len(tokens)+1} eos)")
print(f"Audio: ~{audio_est:.1f}s estimated")
print(f"Saved: {OUTPUT} ({os.path.getsize(OUTPUT)/1024:.0f}KB)")
print(f"\nadb push {OUTPUT} /data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin")