Fix C++ pipeline eos/pad + disable for quality (keep Java default)

- Fixed trailing embed handling (use pre-computed as-is) - Added eos/pad embed params to nativeRun - Improved C++ PRNG for sampling - Disabled native pipeline: slight quality regression vs Java (two separate QNN instances give different numerical results) - Java pipeline (RTF 1.8) kept as default for validated quality Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 10:53:19 +02:00 · 2026-04-09 10:53:19 +02:00 · 8e536094df
parent 3b01302cfb
commit 8e536094df
2 changed files with 29 additions and 23 deletions
--- a/executorch-custom/tts_pipeline_jni.cpp
+++ b/executorch-custom/tts_pipeline_jni.cpp
@ -57,27 +57,30 @@ static int argmax_head(const float*h,const float*W,int vocab,int dim){
    return best;
 }
-// Top-k sampling with temperature
+// Top-k sampling with temperature (Java-compatible PRNG)
 // State word of the 64-bit linear congruential generator used by next_rand().
 // Fixed seed, so the sequence is identical on every process start.
 static uint64_t g_rng_state = 0x12345678ABCDEF01ULL;

 // Advances g_rng_state by one LCG step (state * multiplier + increment,
 // wrapping modulo 2^64) and returns the top 31 bits of the new state scaled
 // to a float.
 //
 // The result lies in [0, 1]: 1.0f is reachable because both the largest
 // 31-bit numerator and the 0x7FFFFFFF divisor round to 2^31 in float.
 //
 // NOTE(review): the multiplier/increment here are not the 48-bit constants
 // documented for java.util.Random, so this will not reproduce Java's random
 // sequence bit-for-bit — confirm what "Java-compatible" is meant to cover.
 // Not synchronized: concurrent callers would race on g_rng_state.
 static float next_rand() {
     constexpr uint64_t kMultiplier = 6364136223846793005ULL;
     constexpr uint64_t kIncrement  = 1442695040888963407ULL;

     g_rng_state = g_rng_state * kMultiplier + kIncrement;

     // Keep the high bits; the low bits of a power-of-two-modulus LCG have
     // short periods.
     const uint64_t high31 = (g_rng_state >> 33) & 0x7FFFFFFF;
     const float scale = (float)0x7FFFFFFF;
     return (float)high31 / scale;
 }
 static int sample_topk(const float* logits, int vocab, float temp, int k) {
    // Find top-k
    struct IV { int i; float v; };
    std::vector<IV> topk(k, {0, -FLT_MAX});
    for (int i = 0; i < vocab; i++) {
        if (logits[i] > topk[k-1].v) {
            topk[k-1] = {i, logits[i]};
            // Bubble up
            for (int j = k-2; j >= 0; j--) {
                if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
                else break;
            }
        }
    }
    // Softmax with temperature
    float maxv = topk[0].v;
    float sum = 0;
    for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
-    // Sample
+    float r = next_rand() * sum;
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0;
    for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
    return topk[0].i;
@ -312,27 +315,27 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
        for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
        cb0History.push_back(currentCb0);
-        // Build next talker input: sum codec embeddings
+        // Build next talker input
        float nextEmb[DIM]={};
-        // cb0 embedding
+        if(trailingIdx<nTrailing){
            // Pre-computed decode embed from file: use as-is (already contains codec+text)
            memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
            trailingIdx++;
        } else {
            // After trailing exhausted: build from our codes + eos/pad
            const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
            for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
        // cb1-15 embeddings
            for(int cb=0;cb<15;cb++){
                const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
                for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
            }
-        // Add trailing text, then eos, then pad (matches Python/Kotlin pipeline)
+            if(trailingIdx==nTrailing){
        if(trailingIdx<nTrailing){
            const float*te=trailingData.data()+trailingIdx*DIM;
            for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
            trailingIdx++;
        } else if(trailingIdx==nTrailing){
                for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmbed[k];
                trailingIdx++;
            } else {
                for(int k=0;k<DIM;k++) nextEmb[k]+=padEmbed[k];
            }
        }
        // Talker step
        int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@ -2287,7 +2287,10 @@ class Qwen3TtsEngine(
        nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
        val allCodes: Array<IntArray>
-        if (nativePipelineReady) {
+        // Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
        // Java pipeline: RTF 1.8, validated quality
        // TODO: share QNN context between Java and C++ for same quality at C++ speed
        if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
            // Native C++ pipeline — zero Java overhead
            val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
            for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)