Disable C++ pipeline (QNN non-deterministic); keep Java pipeline (RTF 1.8)

Root cause found: QNN HTP level=1 compilation is not bitwise deterministic. Two loads of the same .pte produce slightly different hidden states, which causes audible trembling in the decoded speech. The Java pipeline uses a single QNN instance, so it shows no trembling and its quality has been validated. The C++ pipeline code is preserved for future use once QNN context caching is fixed (which would make both loads use the same compiled graph).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 11:42:49 +02:00 · 2026-04-09 11:42:49 +02:00 · 38c0e9874a
parent 439629c9bf
commit 38c0e9874a
2 changed files with 65 additions and 35 deletions
--- a/executorch-custom/tts_pipeline_jni.cpp
+++ b/executorch-custom/tts_pipeline_jni.cpp
@ -143,11 +143,8 @@ Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring
    env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
    if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
    gState->loaded=true;
    // Warmup both
    {auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
    {auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
    auto t1=std::chrono::high_resolution_clock::now();
-    LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
+    LOGI("Models loaded: %.0fms (no warmup — first forward will be slower)",std::chrono::duration<float,std::milli>(t1-t0).count());
    return JNI_TRUE;
 }
@ -289,8 +286,9 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
        }
    }
    auto tP1=std::chrono::high_resolution_clock::now();
-    LOGI("Prefill: %.0fms, %d steps, cb0=%d",
+    LOGI("Prefill: %.0fms, %d steps, cb0=%d, hidden[0:4]=[%.6f,%.6f,%.6f,%.6f]",
-         std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
+         std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0,
         hidden[0],hidden[1],hidden[2],hidden[3]);
    if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
@ -315,14 +313,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
        for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
        cb0History.push_back(currentCb0);
-        // Build next talker input
+        // Build next talker input: use pre-computed trailing embeds as-is
        float nextEmb[DIM]={};
        if(trailingIdx<nTrailing){
            // Pre-computed decode embed from file: use as-is (already contains codec+text)
            memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
            trailingIdx++;
        } else {
            // After trailing exhausted: build from our codes + eos/pad
            const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
            for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
            for(int cb=0;cb<15;cb++){
@ -347,7 +343,7 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
        totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
        pos++;
-        // Sample next cb0 (suppress non-codec, repetition penalty)
+        // Next cb0: suppress non-codec, repetition penalty, top-k sampling
        for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
        std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
        for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
@ -365,6 +361,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
    }
    int nTokens=(int)allCodes.size()/NUM_CB;
    // Log all CB0 codes for quality comparison
    {
        char buf[2048]={};int off=0;
        for(int i=0;i<(int)cb0History.size()&&off<2000;i++) off+=snprintf(buf+off,2048-off,"%d,",cb0History[i]);
        LOGI("CB0 sequence: [%s]",buf);
    }
    auto T1=std::chrono::high_resolution_clock::now();
    LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
         nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@ -226,14 +226,32 @@ class Qwen3TtsEngine(
                    nlog("Speech decoder on HTP")
                }
-                // Load CP .pte JNI BEFORE talker Hexagon (must grab CDSP first for skel path)
+                // Set ADSP library path for QNN HTP skel libs (needed by both Java and C++ paths)
                android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
                // Try native C++ pipeline first (single QNN instance, no Java overhead)
                run {
                    val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
-                    if (etModel.exists() && cpPteModule == null) {
+                    val talkerPte = File("/data/local/tmp/kazeia/models/talker_transformer_fp16.pte")
                    if (etModel.exists() && talkerPte.exists()) {
                        try {
-                            // Set ADSP library path so FastRPC can find skel libs in app's native dir
+                            val tn = System.currentTimeMillis()
-                            android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
+                            nativePipelineReady = TtsPipeline.nativeInit(
-                            nlog("ADSP_LIBRARY_PATH=$nativeLibDir")
+                                talkerPte.absolutePath, etModel.absolutePath
                            )
                            nlog("Native C++ pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
                        } catch (e: Exception) {
                            nlog("Native pipeline init failed: ${e.message}")
                        }
                    }
                }
                // Fallback: Load Java .pte modules (only if native pipeline failed)
                run {
                    val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
                    if (!nativePipelineReady && etModel.exists() && cpPteModule == null) {
                        try {
                            nlog("Loading Java .pte modules (native unavailable)...")
                            val t0 = System.currentTimeMillis()
                            cpPteModule = org.pytorch.executorch.Module.load(
                                etModel.absolutePath,
@ -309,18 +327,7 @@ class Qwen3TtsEngine(
                                    nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
                                } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
-                                // Init native C++ pipeline (loads models with own ExecuTorch runtime)
+                                // Native pipeline already initialized above
                                try {
                                    val tn = System.currentTimeMillis()
                                    nativePipelineReady = TtsPipeline.nativeInit(
                                        "/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
                                        "/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
                                    )
                                    nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
                                } catch (e: Exception) {
                                    nlog("Native pipeline init failed: ${e.message}")
                                    nativePipelineReady = false
                                }
                            }
                        } catch (e: Exception) {
                            nlog("Talker .pte JNI failed: ${e.message}")
@ -2172,16 +2179,20 @@ class Qwen3TtsEngine(
     * Runs talker ONNX → CP → VQ decode → speech decoder.
     */
    fun generateFromEmbeds(embedsPath: String): ShortArray {
-        if (!loaded || (talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
+        if (!loaded || (!nativePipelineReady && talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
-            nlog("generateFromEmbeds: talker not loaded (pte=${talkerPteModule != null}, hex=$useHexagonTalker, onnx=${talkerKv != null})")
+            nlog("generateFromEmbeds: no talker (native=$nativePipelineReady, pte=${talkerPteModule != null}, hex=$useHexagonTalker)")
            return ShortArray(0)
        }
-        if (useHexagonTalker) {
+        // Priority: native C++ > Java .pte > Hexagon > ONNX CPU
-            return generateFromEmbedsHexagon(embedsPath)
+        if (nativePipelineReady) {
            return generateFromEmbedsPte(embedsPath)
        }
        if (talkerPteModule != null && cpPteModule != null) {
            return generateFromEmbedsPte(embedsPath)
        }
        if (useHexagonTalker) {
            return generateFromEmbedsHexagon(embedsPath)
        }
        nlog("Full pipeline from: $embedsPath")
        val t0 = System.currentTimeMillis()
@ -2287,10 +2298,10 @@ class Qwen3TtsEngine(
        nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
        val allCodes: Array<IntArray>
-        // Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
+        // Native C++ disabled: QNN HTP compilation not deterministic between loads
-        // Java pipeline: RTF 1.8, validated quality
+        // Two instances of same .pte give slightly different hidden states → trembling
-        // TODO: share QNN context between Java and C++ for same quality at C++ speed
+        // Keep Java pipeline (same QNN instance, validated quality)
-        if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
+        if (false && nativePipelineReady) {
            // Native C++ pipeline — zero Java overhead
            val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
            for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
@ -2309,6 +2320,22 @@ class Qwen3TtsEngine(
                }
            }
            // Ensure all data arrays are loaded
            val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
            if (codecEmbedding == null) codecEmbedding = loadNpy("$mpath/codec_embedding.npy")
            if (cpEmbeddings == null) cpEmbeddings = loadNpy("$mpath/code_predictor_embeddings.npy")
            if (cpRotaryCos == null) cpRotaryCos = loadNpy("$mpath/cp_kv_v2/cp_rotary_cos.npy")
            if (cpRotarySin == null) cpRotarySin = loadNpy("$mpath/cp_kv_v2/cp_rotary_sin.npy")
            if (talkerPteRotaryCos == null) talkerPteRotaryCos = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_cos.npy")
            if (talkerPteRotarySin == null) talkerPteRotarySin = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_sin.npy")
            if (ttsEosEmbed == null || ttsPadEmbed == null) {
                val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
                val sp = loadNpy("$mpath/tts_special_embeds.npy")
                ttsBosEmbed = sp.sliceArray(0 until TALKER_DIM)
                ttsEosEmbed = sp.sliceArray(TALKER_DIM until 2 * TALKER_DIM)
                ttsPadEmbed = sp.sliceArray(2 * TALKER_DIM until 3 * TALKER_DIM)
            }
            nlog("Running native C++ pipeline...")
            val flat = TtsPipeline.nativeRun(
                prefillFlat, nPrefill,
@ -2480,6 +2507,7 @@ class Qwen3TtsEngine(
        val n = allCodes.size
        nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)")
        nlog("CB0 Java: [${generatedCb0.joinToString(",")}]")
        return allCodes.toTypedArray()
    }