Fix KV buffer + revert HTP decoder (BigVGAN too complex for HTP)
- Restored the intermediate KV buffer for the talker: copying KV directly from output to input caused trembling audio because the output buffer was overwritten during execute().
- BigVGAN HTP compilation takes more than 5 minutes, so it is not viable.
- RTF is 1.35 with clean audio quality.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
985fd9cff9
commit
4dcc4bb8b3
|
|
@ -712,7 +712,7 @@ ExecuTorchJni::runTtsPipelineImpl(
|
||||||
auto eosEmb=copyArr(jEos),padEmb=copyArr(jPad);
|
auto eosEmb=copyArr(jEos),padEmb=copyArr(jPad);
|
||||||
|
|
||||||
int tkvElem=T_KV*T_KV_LEN*T_HD;
|
int tkvElem=T_KV*T_KV_LEN*T_HD;
|
||||||
// tK/tV not needed — KV copied directly from output to input each step
|
std::vector<float> tKV(T_L*2*tkvElem,0); // intermediate KV buffer (avoids output overwrite race)
|
||||||
float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
|
float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
|
||||||
float hidden[DIM]={},logits[VOCAB]={};
|
float hidden[DIM]={},logits[VOCAB]={};
|
||||||
std::vector<int> allCodes,cb0Hist;
|
std::vector<int> allCodes,cb0Hist;
|
||||||
|
|
@ -749,20 +749,19 @@ ExecuTorchJni::runTtsPipelineImpl(
|
||||||
memcpy(tInMask, mask, T_KV_LEN*4);
|
memcpy(tInMask, mask, T_KV_LEN*4);
|
||||||
memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4);
|
memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4);
|
||||||
memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4);
|
memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4);
|
||||||
// KV: copy directly from PREVIOUS output to input (skip intermediate buffer)
|
|
||||||
if(pos > 0) {
|
|
||||||
for(int i=0;i<T_L;i++){
|
for(int i=0;i<T_L;i++){
|
||||||
memcpy(tInKV[i*2], tMethod->get_output(2+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
|
memcpy(tInKV[i*2], tKV.data()+(i*2)*tkvElem, tkvElem*4);
|
||||||
memcpy(tInKV[i*2+1], tMethod->get_output(3+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
|
memcpy(tInKV[i*2+1], tKV.data()+(i*2+1)*tkvElem, tkvElem*4);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// (pos==0: first call, input KV is already zeros from prepare_input_tensors)
|
|
||||||
|
|
||||||
auto status = tMethod->execute();
|
auto status = tMethod->execute();
|
||||||
if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;}
|
if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;}
|
||||||
memcpy(hidden, tMethod->get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
|
memcpy(hidden, tMethod->get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
|
||||||
memcpy(logits, tMethod->get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
|
memcpy(logits, tMethod->get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
|
||||||
// KV NOT copied to tK/tV — read from output directly next step
|
for(int i=0;i<T_L;i++){
|
||||||
|
memcpy(tKV.data()+(i*2)*tkvElem, tMethod->get_output(2+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
|
||||||
|
memcpy(tKV.data()+(i*2+1)*tkvElem, tMethod->get_output(3+i*2).toTensor().const_data_ptr<float>(), tkvElem*4);
|
||||||
|
}
|
||||||
pos++;
|
pos++;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -209,7 +209,7 @@ class Qwen3TtsEngine(
|
||||||
return session
|
return session
|
||||||
}
|
}
|
||||||
|
|
||||||
// Speech decoder V2: CPU ONNX (GPU tested: no gain, +300ms overhead)
|
// Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile)
|
||||||
val v2Path = "$path/v2_pre_conv"
|
val v2Path = "$path/v2_pre_conv"
|
||||||
if (File("$v2Path/model.onnx").exists()) {
|
if (File("$v2Path/model.onnx").exists()) {
|
||||||
nlog("Loading V2 speech decoder (CPU ONNX)...")
|
nlog("Loading V2 speech decoder (CPU ONNX)...")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue