From 09d36f20251aacca6b56c2bcb6213de409607af8 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Thu, 9 Apr 2026 23:00:37 +0200 Subject: [PATCH] Root cause found + on-device embed capture + KV=100 restored MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: embeds must come from SAME NPU model instance. Python fp32 embeds cause divergence on NPU fp16 after ~20 steps. Solution: Java pipeline captures embeds on-device during generation. Captured embeds work perfectly with C++ pipeline (validated "bon"). - Added capture mode: touch /data/local/tmp/kazeia/capture_mode - Embeds saved to captured_embeds.bin (same format as pipeline input) - KV_LEN restored to 100 (KV=64 lost role tokens → quality loss) - C++ uses pre-computed embeds as-is (no double codec_sum) Production path: Java pipeline RTF 1.8 for new texts (good quality) Replay path: C++ pipeline RTF 1.26 with captured embeds Co-Authored-By: Claude Opus 4.6 (1M context) --- .../java/com/kazeia/tts/Qwen3TtsEngine.kt | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index a8dcf65..ea05d6d 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -1067,10 +1067,13 @@ class Qwen3TtsEngine( nlog("PTE pipeline: prefill=${prefill.size}, trailing=${trailingEmbeds.size}") + // Capture mode: save all talker inputs for reuse with C++ pipeline + val capturedEmbeds = mutableListOf() + // ===== PREFILL ===== val tPrefill = System.currentTimeMillis() for (step in prefill.indices) { - // Unmask position + capturedEmbeds.add(prefill[step].clone()) // capture prefill input val maskIdx = TALKER_PTE_KV_LEN - 1 - minOf(pos, TALKER_PTE_KV_LEN - 1) if (maskIdx >= 0) maskData[maskIdx] = 0f @@ -1139,6 +1142,8 @@ class Qwen3TtsEngine( else -> sumEmb(codecSum, padE) } + capturedEmbeds.add(nextEmbed.clone()) // capture decode input + // 3. Talker step val maskIdx = TALKER_PTE_KV_LEN - 1 - minOf(pos, TALKER_PTE_KV_LEN - 1) if (maskIdx >= 0) maskData[maskIdx] = 0f @@ -1192,6 +1197,26 @@ class Qwen3TtsEngine( val n = allCodes.size nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)") + + // Save captured embeds for C++ pipeline reuse + if (capturedEmbeds.isNotEmpty()) { + try { + val capPath = "/data/local/tmp/kazeia/captured_embeds.bin" + val nPrefill = prefill.size + val fos = java.io.FileOutputStream(capPath) + val hdr = java.nio.ByteBuffer.allocate(8).order(java.nio.ByteOrder.LITTLE_ENDIAN) + hdr.putInt(nPrefill); hdr.putInt(capturedEmbeds.size) + fos.write(hdr.array()) + for (emb in capturedEmbeds) { + val buf = java.nio.ByteBuffer.allocate(TALKER_DIM * 4).order(java.nio.ByteOrder.LITTLE_ENDIAN) + for (v in emb) buf.putFloat(v) + fos.write(buf.array()) + } + fos.close() + nlog("Captured ${capturedEmbeds.size} embeds → $capPath") + } catch (e: Exception) { nlog("Capture save failed: ${e.message}") } + } + return allCodes.toTypedArray() } @@ -2299,8 +2324,10 @@ class Qwen3TtsEngine( nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)") val allCodes: Array + // Check if capture mode requested (force Java path to capture embeds) + val captureMode = File("/data/local/tmp/kazeia/capture_mode").exists() // Native C++ pipeline using SAME Java Module instances (no quality loss) - if (talkerPteModule != null && cpPteModule != null) { + if (!captureMode && talkerPteModule != null && cpPteModule != null) { // C++ loop on Java's Module instances — same QNN compilation, no JNI overhead val prefillFlat = FloatArray(nPrefill * TALKER_DIM) for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM) @@ -2449,9 +2476,13 @@ class Qwen3TtsEngine( return Pair(hidden, logits) } + // Capture embeds for C++ reuse + val capturedEmbeds = mutableListOf() + // ===== PREFILL ===== val tPrefill = System.currentTimeMillis() for (step in prefillEmbeds.indices) { + capturedEmbeds.add(prefillEmbeds[step].clone()) val (h, logits) = talkerStep(prefillEmbeds[step]) pastHidden = h if (step == prefillEmbeds.size - 1) { @@ -2480,12 +2511,12 @@ class Qwen3TtsEngine( if (trailingIdx < trailingEmbeds.size) { nextEmbed = trailingEmbeds[trailingIdx]; trailingIdx++ } else { - // Build from codec embeddings + trailing text (pad) val codecSum = FloatArray(TALKER_DIM) addEmb(codecSum, codecEmb(codes[0])) for (cb in 1 until NUM_CODEBOOKS) addEmb(codecSum, cpEmb(cb - 1, codes[cb])) nextEmbed = sumEmb(codecSum, padE) } + capturedEmbeds.add(nextEmbed.clone()) val tTalker0 = System.currentTimeMillis() val (h, logits) = talkerStep(nextEmbed) @@ -2506,7 +2537,26 @@ class Qwen3TtsEngine( val n = allCodes.size nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)") - nlog("CB0 Java: [${generatedCb0.joinToString(",")}]") + + // Save captured embeds + if (capturedEmbeds.isNotEmpty()) { + try { + val capPath = "/data/local/tmp/kazeia/captured_embeds.bin" + val nPrefill = prefillEmbeds.size + val fos = java.io.FileOutputStream(capPath) + val hdr = java.nio.ByteBuffer.allocate(8).order(java.nio.ByteOrder.LITTLE_ENDIAN) + hdr.putInt(nPrefill); hdr.putInt(capturedEmbeds.size) + fos.write(hdr.array()) + for (emb in capturedEmbeds) { + val buf = java.nio.ByteBuffer.allocate(TALKER_DIM * 4).order(java.nio.ByteOrder.LITTLE_ENDIAN) + for (v in emb) buf.putFloat(v) + fos.write(buf.array()) + } + fos.close() + nlog("Captured ${capturedEmbeds.size} embeds → $capPath (${nPrefill} prefill + ${capturedEmbeds.size - nPrefill} decode)") + } catch (e: Exception) { nlog("Capture save failed: ${e.message}") } + } + return allCodes.toTypedArray() }