diff --git a/executorch-custom/jni_layer_tts.cpp b/executorch-custom/jni_layer_tts.cpp index 78e6604..7783d0c 100644 --- a/executorch-custom/jni_layer_tts.cpp +++ b/executorch-custom/jni_layer_tts.cpp @@ -712,7 +712,7 @@ ExecuTorchJni::runTtsPipelineImpl( auto eosEmb=copyArr(jEos),padEmb=copyArr(jPad); int tkvElem=T_KV*T_KV_LEN*T_HD; - // tK/tV not needed — KV copied directly from output to input each step + std::vector tKV(T_L*2*tkvElem,0); // intermediate KV buffer (avoids output overwrite race) float mask[T_KV_LEN]; for(int i=0;i allCodes,cb0Hist; @@ -749,20 +749,19 @@ ExecuTorchJni::runTtsPipelineImpl( memcpy(tInMask, mask, T_KV_LEN*4); memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4); memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4); - // KV: copy directly from PREVIOUS output to input (skip intermediate buffer) - if(pos > 0) { - for(int i=0;iget_output(2+i*2).toTensor().const_data_ptr(), tkvElem*4); - memcpy(tInKV[i*2+1], tMethod->get_output(3+i*2).toTensor().const_data_ptr(), tkvElem*4); - } + for(int i=0;iexecute(); if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;} memcpy(hidden, tMethod->get_output(0).toTensor().const_data_ptr(), DIM*4); memcpy(logits, tMethod->get_output(1).toTensor().const_data_ptr(), VOCAB*4); - // KV NOT copied to tK/tV — read from output directly next step + for(int i=0;iget_output(2+i*2).toTensor().const_data_ptr(), tkvElem*4); + memcpy(tKV.data()+(i*2+1)*tkvElem, tMethod->get_output(3+i*2).toTensor().const_data_ptr(), tkvElem*4); + } pos++; }; diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index 315bc82..98263f9 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -209,7 +209,7 @@ class Qwen3TtsEngine( return session } - // Speech decoder V2: CPU ONNX (GPU tested: no gain, +300ms overhead) + // Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile) val v2Path = "$path/v2_pre_conv" if (File("$v2Path/model.onnx").exists()) { nlog("Loading V2 speech decoder (CPU ONNX)...")