Disable C++ pipeline (QNN non-deterministic), keep Java pipeline (RTF 1.8)

Root cause found: QNN HTP level=1 compilation is not bitwise
deterministic. Two loads of the same .pte produce slightly different
hidden states → audible trembling in decoded speech.

The Java pipeline uses a single QNN instance, so there is no trembling
and its quality is validated. The C++ pipeline code is preserved for
future use once QNN context caching is fixed (which would make both
loads use the same compiled graph).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-09 11:42:49 +02:00
parent 439629c9bf
commit 38c0e9874a
2 changed files with 65 additions and 35 deletions

View File

@ -143,11 +143,8 @@ Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring
env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
gState->loaded=true;
// Warmup both
{auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
{auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
auto t1=std::chrono::high_resolution_clock::now();
LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
LOGI("Models loaded: %.0fms (no warmup — first forward will be slower)",std::chrono::duration<float,std::milli>(t1-t0).count());
return JNI_TRUE;
}
@ -289,8 +286,9 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
}
}
auto tP1=std::chrono::high_resolution_clock::now();
LOGI("Prefill: %.0fms, %d steps, cb0=%d",
std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
LOGI("Prefill: %.0fms, %d steps, cb0=%d, hidden[0:4]=[%.6f,%.6f,%.6f,%.6f]",
std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0,
hidden[0],hidden[1],hidden[2],hidden[3]);
if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
@ -315,14 +313,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
cb0History.push_back(currentCb0);
// Build next talker input
// Build next talker input: use pre-computed trailing embeds as-is
float nextEmb[DIM]={};
if(trailingIdx<nTrailing){
// Pre-computed decode embed from file: use as-is (already contains codec+text)
memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
trailingIdx++;
} else {
// After trailing exhausted: build from our codes + eos/pad
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
for(int cb=0;cb<15;cb++){
@ -347,7 +343,7 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
pos++;
// Sample next cb0 (suppress non-codec, repetition penalty)
// Next cb0: suppress non-codec, repetition penalty, top-k sampling
for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
@ -365,6 +361,12 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
}
int nTokens=(int)allCodes.size()/NUM_CB;
// Log all CB0 codes for quality comparison
{
char buf[2048]={};int off=0;
for(int i=0;i<(int)cb0History.size()&&off<2000;i++) off+=snprintf(buf+off,2048-off,"%d,",cb0History[i]);
LOGI("CB0 sequence: [%s]",buf);
}
auto T1=std::chrono::high_resolution_clock::now();
LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),

View File

@ -226,14 +226,32 @@ class Qwen3TtsEngine(
nlog("Speech decoder on HTP")
}
// Load CP .pte JNI BEFORE talker Hexagon (must grab CDSP first for skel path)
// Set ADSP library path for QNN HTP skel libs (needed by both Java and C++ paths)
android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
// Try native C++ pipeline first (single QNN instance, no Java overhead)
run {
val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
if (etModel.exists() && cpPteModule == null) {
val talkerPte = File("/data/local/tmp/kazeia/models/talker_transformer_fp16.pte")
if (etModel.exists() && talkerPte.exists()) {
try {
// Set ADSP library path so FastRPC can find skel libs in app's native dir
android.system.Os.setenv("ADSP_LIBRARY_PATH", "$nativeLibDir;/data/local/tmp/kazeia/qnn_libs;/vendor/dsp/cdsp;/vendor/dsp", true)
nlog("ADSP_LIBRARY_PATH=$nativeLibDir")
val tn = System.currentTimeMillis()
nativePipelineReady = TtsPipeline.nativeInit(
talkerPte.absolutePath, etModel.absolutePath
)
nlog("Native C++ pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
} catch (e: Exception) {
nlog("Native pipeline init failed: ${e.message}")
}
}
}
// Fallback: Load Java .pte modules (only if native pipeline failed)
run {
val etModel = File("/data/local/tmp/kazeia/models/cp_transformer_fp16.pte")
if (!nativePipelineReady && etModel.exists() && cpPteModule == null) {
try {
nlog("Loading Java .pte modules (native unavailable)...")
val t0 = System.currentTimeMillis()
cpPteModule = org.pytorch.executorch.Module.load(
etModel.absolutePath,
@ -309,18 +327,7 @@ class Qwen3TtsEngine(
nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
} catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
// Init native C++ pipeline (loads models with own ExecuTorch runtime)
try {
val tn = System.currentTimeMillis()
nativePipelineReady = TtsPipeline.nativeInit(
"/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
"/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
)
nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
} catch (e: Exception) {
nlog("Native pipeline init failed: ${e.message}")
nativePipelineReady = false
}
// Native pipeline already initialized above
}
} catch (e: Exception) {
nlog("Talker .pte JNI failed: ${e.message}")
@ -2172,16 +2179,20 @@ class Qwen3TtsEngine(
* Runs talker ONNX CP VQ decode speech decoder.
*/
fun generateFromEmbeds(embedsPath: String): ShortArray {
if (!loaded || (talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
nlog("generateFromEmbeds: talker not loaded (pte=${talkerPteModule != null}, hex=$useHexagonTalker, onnx=${talkerKv != null})")
if (!loaded || (!nativePipelineReady && talkerPteModule == null && !useHexagonTalker && (talkerKv == null || !talkerUsesCosSin))) {
nlog("generateFromEmbeds: no talker (native=$nativePipelineReady, pte=${talkerPteModule != null}, hex=$useHexagonTalker)")
return ShortArray(0)
}
if (useHexagonTalker) {
return generateFromEmbedsHexagon(embedsPath)
// Priority: native C++ > Java .pte > Hexagon > ONNX CPU
if (nativePipelineReady) {
return generateFromEmbedsPte(embedsPath)
}
if (talkerPteModule != null && cpPteModule != null) {
return generateFromEmbedsPte(embedsPath)
}
if (useHexagonTalker) {
return generateFromEmbedsHexagon(embedsPath)
}
nlog("Full pipeline from: $embedsPath")
val t0 = System.currentTimeMillis()
@ -2287,10 +2298,10 @@ class Qwen3TtsEngine(
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
val allCodes: Array<IntArray>
// Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
// Java pipeline: RTF 1.8, validated quality
// TODO: share QNN context between Java and C++ for same quality at C++ speed
if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
// Native C++ disabled: QNN HTP compilation not deterministic between loads
// Two instances of same .pte give slightly different hidden states → trembling
// Keep Java pipeline (same QNN instance, validated quality)
if (false && nativePipelineReady) {
// Native C++ pipeline — zero Java overhead
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
@ -2309,6 +2320,22 @@ class Qwen3TtsEngine(
}
}
// Ensure all data arrays are loaded
val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
if (codecEmbedding == null) codecEmbedding = loadNpy("$mpath/codec_embedding.npy")
if (cpEmbeddings == null) cpEmbeddings = loadNpy("$mpath/code_predictor_embeddings.npy")
if (cpRotaryCos == null) cpRotaryCos = loadNpy("$mpath/cp_kv_v2/cp_rotary_cos.npy")
if (cpRotarySin == null) cpRotarySin = loadNpy("$mpath/cp_kv_v2/cp_rotary_sin.npy")
if (talkerPteRotaryCos == null) talkerPteRotaryCos = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_cos.npy")
if (talkerPteRotarySin == null) talkerPteRotarySin = loadNpy("/data/local/tmp/kazeia/models/talker_pte_rotary_sin.npy")
if (ttsEosEmbed == null || ttsPadEmbed == null) {
val mpath = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
val sp = loadNpy("$mpath/tts_special_embeds.npy")
ttsBosEmbed = sp.sliceArray(0 until TALKER_DIM)
ttsEosEmbed = sp.sliceArray(TALKER_DIM until 2 * TALKER_DIM)
ttsPadEmbed = sp.sliceArray(2 * TALKER_DIM until 3 * TALKER_DIM)
}
nlog("Running native C++ pipeline...")
val flat = TtsPipeline.nativeRun(
prefillFlat, nPrefill,
@ -2480,6 +2507,7 @@ class Qwen3TtsEngine(
val n = allCodes.size
nlog("Generated $n tokens | Talker(PTE): ${totalTalkerMs}ms (${totalTalkerMs/maxOf(n,1)}ms/step) | CP(PTE): ${totalCpMs}ms (${totalCpMs/maxOf(n,1)}ms/step)")
nlog("CB0 Java: [${generatedCb0.joinToString(",")}]")
return allCodes.toTypedArray()
}