#!/usr/bin/env python3
"""
Test CP ExecuTorch NPU quality vs Python reference.

1. Run full Python TTS pipeline, capturing CP inputs (hidden + cb0_emb)
2. Send those to CP ET runner on NPU via TCP (adb forward)
3. Compare NPU codes vs Python codes
4. Save both for tablet decoding
"""
import os
import socket
import struct
import sys
import time
import types
import warnings

# These two side effects deliberately run BEFORE numpy/torch are imported.
os.chdir("/tmp")  # avoid numpy import issues
warnings.filterwarnings("ignore")

import torch
import numpy as np

MODEL = "/home/alf/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/5d83992436eae1d760afd27aff78a71d676296fc"
VOICE = "/opt/Kazeia/voix/damien_15s_24k.wav"
TEXT = "Bonjour, je m'appelle Kazeia."
CP_ET_PORT = 5556

# Size of one runner reply: 15 ints (60 bytes) + 1 float (4 bytes).
CP_RESP_BYTES = 64

print("Loading model...")
from qwen_tts import Qwen3TTSModel

tts = Qwen3TTSModel.from_pretrained(MODEL, local_files_only=True, device_map="cpu")
talker = tts.model.talker
cp = talker.code_predictor

# ── Monkey-patch code_predictor.generate to capture inputs + outputs ──
# One dict per code_predictor.generate call: "hidden", "cb0_emb", "py_codes".
captured_frames = []

# Keep the original as whatever callable `cp.generate` currently is and call it
# WITHOUT an explicit self. For a normal bound method this is identical to
# calling `__func__(cp, ...)`, and it stays correct if `generate` is not a
# plain bound method (where passing `self_cp` explicitly would be an extra
# positional argument).
original_cp_generate = cp.generate


def patched_cp_generate(self_cp, **kwargs):
    """Record the inputs and outputs of one code-predictor call, then return its result.

    Args:
        self_cp: The code predictor instance this function is bound to
            (unused; the saved original callable is already bound).
        **kwargs: Forwarded unchanged to the original ``generate``. Must
            contain ``inputs_embeds``.

    Returns:
        Whatever the original ``generate`` returns, unmodified.

    Raises:
        ValueError: If ``inputs_embeds`` was not passed as a keyword argument.
    """
    ie = kwargs.get("inputs_embeds")
    if ie is None:
        raise ValueError(
            "patched code_predictor.generate requires an 'inputs_embeds' keyword argument"
        )

    # Per the original author's note, inputs_embeds is [1, 2, 1024] =
    # [past_hidden, cb0_emb]; only positions 0 and 1 of batch item 0 are read.
    hidden = ie[0, 0, :].detach().cpu().numpy().astype(np.float32)
    cb0_emb = ie[0, 1, :].detach().cpu().numpy().astype(np.float32)

    result = original_cp_generate(**kwargs)

    # Per the original author's note, result.sequences is [1, 15] = CB1-CB15 codes.
    py_codes = result.sequences[0].tolist()
    captured_frames.append({
        "hidden": hidden,
        "cb0_emb": cb0_emb,
        "py_codes": py_codes,  # CB1-CB15 from Python
    })
    return result


def _recv_exact(conn, nbytes, frame_idx):
    """Read exactly ``nbytes`` from ``conn``; ``recv`` may return short reads.

    Raises:
        RuntimeError: If the peer closes the connection before ``nbytes``
            bytes have arrived (``frame_idx`` is reported in the message).
    """
    buf = b""
    while len(buf) < nbytes:
        chunk = conn.recv(nbytes - len(buf))
        if not chunk:
            raise RuntimeError(f"Socket closed at frame {frame_idx}")
        buf += chunk
    return buf


# Bind the patch
cp.generate = types.MethodType(patched_cp_generate, cp)

# ── Run full Python pipeline ──
print(f"Generating: '{TEXT}'")
audio_list, sr = tts.generate_voice_clone(
    text=TEXT,
    ref_audio=VOICE,
    language="french",
    x_vector_only_mode=True,
    non_streaming_mode=True,
)
audio = audio_list[0]
print(f"Python: {len(audio)} samples, {len(audio)/sr:.2f}s, {len(captured_frames)} frames captured")

import soundfile as sf

sf.write("/opt/Kazeia/kazeia_PY_REF.wav", audio, sr)
print("Saved: kazeia_PY_REF.wav")

# ── Extract CB0 codes from captured data ──
# CB0 comes from talker's codec_head, but we need it from the generation output.
# We can reverse-lookup from cb0_emb: find closest embedding in talker's embedding table.
talker_emb = talker.get_input_embeddings()
emb_weight = talker_emb.weight.detach().cpu().numpy()  # [vocab_size, 1024]

cb0_codes = []
for frame in captured_frames:
    # Squared L2 distance from every embedding row to the captured cb0_emb;
    # the nearest row index is taken as the CB0 code.
    diffs = np.sum((emb_weight - frame["cb0_emb"]) ** 2, axis=1)
    cb0 = int(np.argmin(diffs))
    cb0_codes.append(cb0)
    if diffs[cb0] > 1e-6:
        print(f"  WARNING: cb0 lookup imprecise, min_diff={diffs[cb0]:.6f}")
print(f"CB0 codes (first 5): {cb0_codes[:5]}")

# ── Send to CP ET runner on NPU ──
print(f"\nConnecting to CP ET runner on port {CP_ET_PORT}...")
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(30)
sock.connect(("127.0.0.1", CP_ET_PORT))
print("Connected!")

npu_codes_all = []
total_npu_ms = 0
mismatches = 0

for i, frame in enumerate(captured_frames):
    # Send hidden_in + cb0_emb = 2*1024*4 = 8192 bytes
    payload = frame["hidden"].tobytes() + frame["cb0_emb"].tobytes()
    sock.sendall(payload)

    # Read response: 15 ints (60 bytes) + 1 float (4 bytes) = 64 bytes
    resp = _recv_exact(sock, CP_RESP_BYTES, i)
    npu_codes = list(struct.unpack("<15i", resp[:60]))

    # NOTE(review): the available source is cut off mid-statement at this point:
    #     timing = struct.unpack("
    # The rest of the loop body (timing decode, presumably the NPU-vs-Python
    # comparison that uses npu_codes_all / total_npu_ms / mismatches, and the
    # saving step) is not visible and has NOT been reconstructed here.
    # TODO: restore it from the original file, and make sure `sock` is closed
    # once the loop finishes.