Fix: use pre-computed embeds as-is (no double codec_sum)

Pre-computed embeds from Python already contain codec_sum+text, so using them as-is works correctly. Once they are exhausted, fall back to our codec_sum + pad. Long text: 191 tokens, 15.28s audio, RTF 1.27.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:10:23 +02:00 · 2026-04-09 14:10:23 +02:00 · 24157c0a68
parent f6df1738c5
commit 24157c0a68
1 changed files with 9 additions and 13 deletions
--- a/executorch-custom/jni_layer_tts.cpp
+++ b/executorch-custom/jni_layer_tts.cpp
@ -839,23 +839,19 @@ ExecuTorchJni::runTtsPipelineImpl(
        for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
        cb0Hist.push_back(curCb0);

-        // Next embed: codec_sum + (trailing text / eos / pad)
+        // Next embed: pre-computed from Python (complete: codec_sum+text)
+        // After exhausted: codec_sum(our codes) + pad
        float nextEmb[DIM]={};
-        // Always add codec embeddings from our codes
+        if(trIdx<nTrailing){
+            memcpy(nextEmb,trailing.data()+trIdx*DIM,DIM*4);
+            trIdx++;
+        } else {
            const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
            for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
            for(int cb=0;cb<15;cb++){
                const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
                for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
            }
-        // Add trailing text embed, then eos once, then pad
-        if(trIdx<nTrailing){
-            const float*te=trailing.data()+trIdx*DIM;
-            for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
-            trIdx++;
-        } else if(trIdx==nTrailing){
-            for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmb[k]; trIdx++;
-        } else {
            for(int k=0;k<DIM;k++) nextEmb[k]+=padEmb[k];
        }