diff --git a/executorch-custom/tts_pipeline_jni.cpp b/executorch-custom/tts_pipeline_jni.cpp index 6b2ddec..b967209 100644 --- a/executorch-custom/tts_pipeline_jni.cpp +++ b/executorch-custom/tts_pipeline_jni.cpp @@ -143,11 +143,8 @@ Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp); if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;} gState->loaded=true; - // Warmup both - {auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();} - {auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();} auto t1=std::chrono::high_resolution_clock::now(); - LOGI("Loaded+warmup: %.0fms",std::chrono::duration(t1-t0).count()); + LOGI("Models loaded: %.0fms (no warmup — first forward will be slower)",std::chrono::duration(t1-t0).count()); return JNI_TRUE; } @@ -289,8 +286,9 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun( } } auto tP1=std::chrono::high_resolution_clock::now(); - LOGI("Prefill: %.0fms, %d steps, cb0=%d", - std::chrono::duration(tP1-tP0).count(), nPrefill, currentCb0); + LOGI("Prefill: %.0fms, %d steps, cb0=%d, hidden[0:4]=[%.6f,%.6f,%.6f,%.6f]", + std::chrono::duration(tP1-tP0).count(), nPrefill, currentCb0, + hidden[0],hidden[1],hidden[2],hidden[3]); if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);} @@ -315,14 +313,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun( for(int i=0;i(tt1-tt0).count(); pos++; - // Sample next cb0 (suppress non-codec, repetition penalty) + // Next cb0: suppress non-codec, repetition penalty, top-k sampling for(int j=CB_SIZE;j seen(cb0History.begin(),cb0History.end()); for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f; @@ -365,6 +361,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun( } int nTokens=(int)allCodes.size()/NUM_CB; + // Log all CB0 codes for quality comparison + { + char buf[2048]={};int off=0; + for(int i=0;i<(int)cb0History.size()&&off<2000;i++) off+=snprintf(buf+off,2048-off,"%d,",cb0History[i]); + LOGI("CB0 sequence: [%s]",buf); + } auto T1=std::chrono::high_resolution_clock::now(); LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms", nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1), diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index 0d4a163..0d6bc7e 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -226,14 +226,32 @@ class Qwen3TtsEngine( nlog("Speech decoder on HTP") } - // Load CP .pte JNI BEFORE talker Hexagon (must grab CDSP first for skel path) + // Set ADSP library path for QNN HTP skel libs (needed by both Java and C++ paths) + android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true) + + // Try native C++ pipeline first (single QNN instance, no Java overhead) run { val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte") - if (etModel.exists() && cpPteModule == null) { + val talkerPte = File("/data/local/tmp/kazeia/models/talker_transformer_fp16.pte") + if (etModel.exists() && talkerPte.exists()) { try { - // Set ADSP library path so FastRPC can find skel libs in app's native dir - android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true) - nlog("ADSP_LIBRARY_PATH=$nativeLibDir") + val tn = System.currentTimeMillis() + nativePipelineReady = TtsPipeline.nativeInit( + talkerPte.absolutePath, etModel.absolutePath + ) + nlog("Native C++ pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)") + } catch (e: Exception) { + nlog("Native pipeline init failed: ${e.message}") + } + } + } + + // Fallback: Load Java .pte modules (only if native pipeline failed) + run { + val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte") + if (!nativePipelineReady && etModel.exists() && cpPteModule == null) { + try { + nlog("Loading Java .pte modules (native unavailable)...") val t0 = System.currentTimeMillis() cpPteModule = org.pytorch.executorch.Module.load( etModel.absolutePath, @@ -309,18 +327,7 @@ class Qwen3TtsEngine( nlog("CP warmup: ${System.currentTimeMillis() - cw}ms") } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") } - // Init native C++ pipeline (loads models with own ExecuTorch runtime) - try { - val tn = System.currentTimeMillis() - nativePipelineReady = TtsPipeline.nativeInit( - "/data/local/tmp/kazeia/models/talker_transformer_fp16.pte", - "/data/local/tmp/kazeia/models/cp_transformer_fp16.pte" - ) - nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)") - } catch (e: Exception) { - nlog("Native pipeline init failed: ${e.message}") - nativePipelineReady = false - } + // Native pipeline already initialized above } } catch (e: Exception) { nlog("Talker .pte JNI failed: ${e.message}") @@ -2172,16 +2179,20 @@ class Qwen3TtsEngine( * Runs talker ONNX → CP → VQ decode → speech decoder. */ fun generateFromEmbeds(embedsPath: String): ShortArray { - if (!loaded || (talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) { - nlog("generateFromEmbeds: talker not loaded (pte=${talkerPteModule != null}, hex=$useHexagonTalker, onnx=${talkerKv != null})") + if (!loaded || (!nativePipelineReady && talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) { + nlog("generateFromEmbeds: no talker (native=$nativePipelineReady, pte=${talkerPteModule != null}, hex=$useHexagonTalker)") return ShortArray(0) } - if (useHexagonTalker) { - return generateFromEmbedsHexagon(embedsPath) + // Priority: native C++ > Java .pte > Hexagon > ONNX CPU + if (nativePipelineReady) { + return generateFromEmbedsPte(embedsPath) } if (talkerPteModule != null && cpPteModule != null) { return generateFromEmbedsPte(embedsPath) } + if (useHexagonTalker) { + return generateFromEmbedsHexagon(embedsPath) + } nlog("Full pipeline from: $embedsPath") val t0 = System.currentTimeMillis() @@ -2287,10 +2298,10 @@ class Qwen3TtsEngine( nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)") val allCodes: Array - // Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance) - // Java pipeline: RTF 1.8, validated quality - // TODO: share QNN context between Java and C++ for same quality at C++ speed - if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above + // Native C++ disabled: QNN HTP compilation not deterministic between loads + // Two instances of same .pte give slightly different hidden states → trembling + // Keep Java pipeline (same QNN instance, validated quality) + if (false && nativePipelineReady) { // Native C++ pipeline — zero Java overhead val prefillFlat = FloatArray(nPrefill * TALKER_DIM) for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM) @@ -2309,6 +2320,22 @@ class Qwen3TtsEngine( } } + // Ensure all data arrays are loaded + val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu" + if (codecEmbedding == null) codecEmbedding = loadNpy("$mpath/codec_embedding.npy") + if (cpEmbeddings == null) cpEmbeddings = loadNpy("$mpath/code_predictor_embeddings.npy") + if (cpRotaryCos == null) cpRotaryCos = loadNpy("$mpath/cp_kv_v2/cp_rotary_cos.npy") + if (cpRotarySin == null) cpRotarySin = loadNpy("$mpath/cp_kv_v2/cp_rotary_sin.npy") + if (talkerPteRotaryCos == null) talkerPteRotaryCos = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_cos.npy") + if (talkerPteRotarySin == null) talkerPteRotarySin = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_sin.npy") + if (ttsEosEmbed == null || ttsPadEmbed == null) { + val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu" + val sp = loadNpy("$mpath/tts_special_embeds.npy") + ttsBosEmbed = sp.sliceArray(0 until TALKER_DIM) + ttsEosEmbed = sp.sliceArray(TALKER_DIM until 2 * TALKER_DIM) + ttsPadEmbed = sp.sliceArray(2 * TALKER_DIM until 3 * TALKER_DIM) + } + nlog("Running native C++ pipeline...") val flat = TtsPipeline.nativeRun( prefillFlat, nPrefill, @@ -2480,6 +2507,7 @@ class Qwen3TtsEngine( val n = allCodes.size nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)") + nlog("CB0 Java: [${generatedCb0.joinToString(",")}]") return allCodes.toTypedArray() }