diff --git a/executorch-custom/tts_pipeline_jni.cpp b/executorch-custom/tts_pipeline_jni.cpp
index 6b2ddec..b967209 100644
--- a/executorch-custom/tts_pipeline_jni.cpp
+++ b/executorch-custom/tts_pipeline_jni.cpp
@@ -143,11 +143,8 @@ Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring
     env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
     if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
     gState->loaded=true;
-    // Warmup both
-    {auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
-    {auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
     auto t1=std::chrono::high_resolution_clock::now();
-    LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
+    LOGI("Models loaded: %.0fms (no warmup — first forward will be slower)",std::chrono::duration<float,std::milli>(t1-t0).count());
     return JNI_TRUE;
 }
 
@@ -289,8 +286,9 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
         }
     }
     auto tP1=std::chrono::high_resolution_clock::now();
-    LOGI("Prefill: %.0fms, %d steps, cb0=%d",
-         std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
+    LOGI("Prefill: %.0fms, %d steps, cb0=%d, hidden[0:4]=[%.6f,%.6f,%.6f,%.6f]",
+         std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0,
+         hidden[0],hidden[1],hidden[2],hidden[3]);
 
     if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
 
@@ -315,14 +313,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
         for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
         cb0History.push_back(currentCb0);
 
-        // Build next talker input
+        // Build next talker input: pre-computed trailing embeds are used as-is; once they run out, the input is built from the generated codes
         float nextEmb[DIM]={};
         if(trailingIdx<nTrailing){
-            // Pre-computed decode embed from file: use as-is (already contains codec+text)
             memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
             trailingIdx++;
         } else {
-            // After trailing exhausted: build from our codes + eos/pad
             const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
             for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
             for(int cb=0;cb<15;cb++){
@@ -347,7 +343,7 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
         totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
         pos++;
 
-        // Sample next cb0 (suppress non-codec, repetition penalty)
+        // Next cb0: suppress non-codec, repetition penalty, top-k sampling
         for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
         std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
         for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
@@ -365,6 +361,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
     }
 
     int nTokens=(int)allCodes.size()/NUM_CB;
+    // Log the CB0 code sequence for quality comparison (stops appending once ~2000 chars are buffered)
+    {
+        char buf[2048]={};int off=0;
+        for(int i=0;i<(int)cb0History.size()&&off<2000;i++) off+=snprintf(buf+off,2048-off,"%d,",cb0History[i]);
+        LOGI("CB0 sequence: [%s]",buf);
+    }
     auto T1=std::chrono::high_resolution_clock::now();
     LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
          nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),
diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
index 0d4a163..0d6bc7e 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@@ -226,14 +226,32 @@ class Qwen3TtsEngine(
                     nlog("Speech decoder on HTP")
                 }
 
-                // Load CP .pte JNI BEFORE talker Hexagon (must grab CDSP first for skel path)
+                // Set ADSP library path for QNN HTP skel libs (needed by both Java and C++ paths)
+                android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
+
+                // Try native C++ pipeline first (single QNN instance, no Java overhead)
                 run {
                     val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
-                    if (etModel.exists() && cpPteModule == null) {
+                    val talkerPte = File("/data/local/tmp/kazeia/models/talker_transformer_fp16.pte")
+                    if (etModel.exists() && talkerPte.exists()) {
                         try {
-                            // Set ADSP library path so FastRPC can find skel libs in app's native dir
-                            android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
-                            nlog("ADSP_LIBRARY_PATH=$nativeLibDir")
+                            val tn = System.currentTimeMillis()
+                            nativePipelineReady = TtsPipeline.nativeInit(
+                                talkerPte.absolutePath, etModel.absolutePath
+                            )
+                            nlog("Native C++ pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
+                        } catch (e: Exception) {
+                            nlog("Native pipeline init failed: ${e.message}")
+                        }
+                    }
+                }
+
+                // Fallback: load Java .pte modules (only if the native pipeline is not ready)
+                run {
+                    val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
+                    if (!nativePipelineReady && etModel.exists() && cpPteModule == null) {
+                        try {
+                            nlog("Loading Java .pte modules (native unavailable)...")
                             val t0 = System.currentTimeMillis()
                             cpPteModule = org.pytorch.executorch.Module.load(
                                 etModel.absolutePath,
@@ -309,18 +327,7 @@ class Qwen3TtsEngine(
                                     nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
                                 } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
 
-                                // Init native C++ pipeline (loads models with own ExecuTorch runtime)
-                                try {
-                                    val tn = System.currentTimeMillis()
-                                    nativePipelineReady = TtsPipeline.nativeInit(
-                                        "/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
-                                        "/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
-                                    )
-                                    nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
-                                } catch (e: Exception) {
-                                    nlog("Native pipeline init failed: ${e.message}")
-                                    nativePipelineReady = false
-                                }
+                                // Native pipeline already initialized above
                             }
                         } catch (e: Exception) {
                             nlog("Talker .pte JNI failed: ${e.message}")
@@ -2172,16 +2179,20 @@ class Qwen3TtsEngine(
      * Runs talker ONNX → CP → VQ decode → speech decoder.
      */
     fun generateFromEmbeds(embedsPath: String): ShortArray {
-        if (!loaded || (talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
-            nlog("generateFromEmbeds: talker not loaded (pte=${talkerPteModule != null}, hex=$useHexagonTalker, onnx=${talkerKv != null})")
+        if (!loaded || (!nativePipelineReady && talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
+            nlog("generateFromEmbeds: no talker (native=$nativePipelineReady, pte=${talkerPteModule != null}, hex=$useHexagonTalker)")
             return ShortArray(0)
         }
-        if (useHexagonTalker) {
-            return generateFromEmbedsHexagon(embedsPath)
+        // Priority: native C++ > Java .pte > Hexagon > ONNX CPU
+        if (nativePipelineReady) {
+            return generateFromEmbedsPte(embedsPath)
         }
         if (talkerPteModule != null && cpPteModule != null) {
             return generateFromEmbedsPte(embedsPath)
         }
+        if (useHexagonTalker) {
+            return generateFromEmbedsHexagon(embedsPath)
+        }
         nlog("Full pipeline from: $embedsPath")
         val t0 = System.currentTimeMillis()
 
@@ -2287,10 +2298,10 @@ class Qwen3TtsEngine(
         nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
 
         val allCodes: Array<IntArray>
-        // Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
-        // Java pipeline: RTF 1.8, validated quality
-        // TODO: share QNN context between Java and C++ for same quality at C++ speed
-        if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
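+        // Native C++ run disabled: QNN HTP compilation is not deterministic between loads, so two
+        // instances of the same .pte give slightly different hidden states → trembling. Keep the Java
+        // pipeline (same QNN instance, validated quality). NOTE(review): the Java .pte load is guarded by !nativePipelineReady — confirm modules exist here when native init succeeded.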
+        if (false && nativePipelineReady) {
             // Native C++ pipeline — zero Java overhead
             val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
             for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
@@ -2309,6 +2320,22 @@ class Qwen3TtsEngine(
                 }
             }
 
+            // Ensure all data arrays are loaded
+            val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
+            if (codecEmbedding == null) codecEmbedding = loadNpy("$mpath/codec_embedding.npy")
+            if (cpEmbeddings == null) cpEmbeddings = loadNpy("$mpath/code_predictor_embeddings.npy")
+            if (cpRotaryCos == null) cpRotaryCos = loadNpy("$mpath/cp_kv_v2/cp_rotary_cos.npy")
+            if (cpRotarySin == null) cpRotarySin = loadNpy("$mpath/cp_kv_v2/cp_rotary_sin.npy")
+            if (talkerPteRotaryCos == null) talkerPteRotaryCos = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_cos.npy")
+            if (talkerPteRotarySin == null) talkerPteRotarySin = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_sin.npy")
+            if (ttsEosEmbed == null || ttsPadEmbed == null) {
+                val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
+                val sp = loadNpy("$mpath/tts_special_embeds.npy")
+                ttsBosEmbed = sp.sliceArray(0 until TALKER_DIM)
+                ttsEosEmbed = sp.sliceArray(TALKER_DIM until 2 * TALKER_DIM)
+                ttsPadEmbed = sp.sliceArray(2 * TALKER_DIM until 3 * TALKER_DIM)
+            }
+
             nlog("Running native C++ pipeline...")
             val flat = TtsPipeline.nativeRun(
                 prefillFlat, nPrefill,
@@ -2480,6 +2507,7 @@ class Qwen3TtsEngine(
 
         val n = allCodes.size
         nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)")
+        nlog("CB0 Java: [${generatedCb0.joinToString(",")}]")
         return allCodes.toTypedArray()
     }