Fix KV buffer + revert HTP decoder (BigVGAN too complex for HTP)

- Restored the intermediate KV buffer for the talker (copying KV directly
  from output to input caused trembling, because the output buffers were
  overwritten during execute())
- BigVGAN compilation for HTP takes >5 min, so the HTP decoder is not viable and was reverted
- Resulting real-time factor (RTF) is 1.35, with clean audio quality

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-09 12:37:50 +02:00
parent 985fd9cff9
commit 4dcc4bb8b3
2 changed files with 9 additions and 10 deletions

View File

@@ -712,7 +712,7 @@ ExecuTorchJni::runTtsPipelineImpl(
auto eosEmb=copyArr(jEos),padEmb=copyArr(jPad); auto eosEmb=copyArr(jEos),padEmb=copyArr(jPad);
int tkvElem=T_KV*T_KV_LEN*T_HD; int tkvElem=T_KV*T_KV_LEN*T_HD;
// tK/tV not needed — KV copied directly from output to input each step std::vector<float> tKV(T_L*2*tkvElem,0); // intermediate KV buffer (avoids output overwrite race)
float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f; float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
float hidden[DIM]={},logits[VOCAB]={}; float hidden[DIM]={},logits[VOCAB]={};
std::vector<int> allCodes,cb0Hist; std::vector<int> allCodes,cb0Hist;
@@ -749,20 +749,19 @@ ExecuTorchJni::runTtsPipelineImpl(
memcpy(tInMask, mask, T_KV_LEN*4); memcpy(tInMask, mask, T_KV_LEN*4);
memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4); memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4);
memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4); memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4);
// KV: copy directly from PREVIOUS output to input (skip intermediate buffer) for(int i=0;i<T_L;i++){
if(pos > 0) { memcpy(tInKV[i*2], tKV.data()+(i*2)*tkvElem, tkvElem*4);
for(int i=0;i<T_L;i++){ memcpy(tInKV[i*2+1], tKV.data()+(i*2+1)*tkvElem, tkvElem*4);
memcpy(tInKV[i*2], tMethod->get_output(2+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
memcpy(tInKV[i*2+1], tMethod->get_output(3+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
}
} }
// (pos==0: first call, input KV is already zeros from prepare_input_tensors)
auto status = tMethod->execute(); auto status = tMethod->execute();
if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;} if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;}
memcpy(hidden, tMethod->get_output(0).toTensor().const_data_ptr<float>(), DIM*4); memcpy(hidden, tMethod->get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
memcpy(logits, tMethod->get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4); memcpy(logits, tMethod->get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
// KV NOT copied to tK/tV — read from output directly next step for(int i=0;i<T_L;i++){
memcpy(tKV.data()+(i*2)*tkvElem, tMethod->get_output(2+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
memcpy(tKV.data()+(i*2+1)*tkvElem, tMethod->get_output(3+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
}
pos++; pos++;
}; };

View File

@@ -209,7 +209,7 @@ class Qwen3TtsEngine(
return session return session
} }
// Speech decoder V2: CPU ONNX (GPU tested: no gain, +300ms overhead) // Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile)
val v2Path = "$path/v2_pre_conv" val v2Path = "$path/v2_pre_conv"
if (File("$v2Path/model.onnx").exists()) { if (File("$v2Path/model.onnx").exists()) {
nlog("Loading V2 speech decoder (CPU ONNX)...") nlog("Loading V2 speech decoder (CPU ONNX)...")