Optimize CP+talker: eliminate prepare_input_tensors per step

Cache input tensor pointers after the first prepare_input_tensors call,
then memcpy directly into them on all subsequent steps.

Eliminates ~14000 mallocs per pipeline run (986 CP + 58 talker calls).
Generation: 4640ms → 4007ms (-633ms), total RTF: 1.6 → 1.51

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-09 12:16:38 +02:00
parent e647911329
commit 14f7e5b05f
1 changed file with 42 additions and 23 deletions

View File

@ -728,21 +728,30 @@ ExecuTorchJni::runTtsPipelineImpl(
Method* tMethod = tMethodRes.get();
Method* cMethod = cMethodRes.get();
// Talker step: prepare_input_tensors + memcpy + execute (like cp_et_runner)
// Talker: prepare once, cache pointers, reuse for all 58+ steps
{auto prep=executorch::extension::prepare_input_tensors(*tMethod);}
float* tInEmb = tMethod->mutable_input(0).toTensor().mutable_data_ptr<float>();
float* tInMask = tMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
float* tInCos = tMethod->mutable_input(2).toTensor().mutable_data_ptr<float>();
float* tInSin = tMethod->mutable_input(3).toTensor().mutable_data_ptr<float>();
float* tInKV[T_L*2];
for(int i=0;i<T_L;i++){
tInKV[i*2] = tMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>();
tInKV[i*2+1] = tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>();
}
auto talkerStep = [&](const float* emb) {
int pi=std::min(pos,249);
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
if(mi>=0) mask[mi]=0.0f;
auto prep = executorch::extension::prepare_input_tensors(*tMethod);
if(!prep.ok()){ET_LOG(Error,"Talker prep fail");return;}
memcpy(tMethod->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
memcpy(tMethod->mutable_input(1).toTensor().mutable_data_ptr<float>(), mask, T_KV_LEN*4);
memcpy(tMethod->mutable_input(2).toTensor().mutable_data_ptr<float>(), tCos.data()+pi*T_HD, T_HD*4);
memcpy(tMethod->mutable_input(3).toTensor().mutable_data_ptr<float>(), tSin.data()+pi*T_HD, T_HD*4);
memcpy(tInEmb, emb, DIM*4);
memcpy(tInMask, mask, T_KV_LEN*4);
memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4);
memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4);
for(int i=0;i<T_L;i++){
memcpy(tMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), tK.data()+i*tkvElem, tkvElem*4);
memcpy(tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), tV.data()+i*tkvElem, tkvElem*4);
memcpy(tInKV[i*2], tK.data()+i*tkvElem, tkvElem*4);
memcpy(tInKV[i*2+1], tV.data()+i*tkvElem, tkvElem*4);
}
auto status = tMethod->execute();
if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;}
@ -755,27 +764,37 @@ ExecuTorchJni::runTtsPipelineImpl(
pos++;
};
// CP step: 17 autoregressive steps using Method directly
auto cpStep = [&](const float* h, int cb0, int* codes) {
// CP step: 17 autoregressive steps with cached input pointers
// prepare_input_tensors called ONCE, then reuse pointers for all 17×58 steps
int ckvElem=C_KV*C_KV_LEN*C_HD;
std::vector<float> ckv(C_L*2*ckvElem,0);
{auto prep=executorch::extension::prepare_input_tensors(*cMethod);} // first alloc
// Cache input data pointers (stable after prepare)
float* cpInEmb = cMethod->mutable_input(0).toTensor().mutable_data_ptr<float>();
float* cpInMask = cMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
float* cpInCos = cMethod->mutable_input(2).toTensor().mutable_data_ptr<float>();
float* cpInSin = cMethod->mutable_input(3).toTensor().mutable_data_ptr<float>();
float* cpInKV[C_L*2];
for(int i=0;i<C_L;i++){
cpInKV[i*2] = cMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>();
cpInKV[i*2+1] = cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>();
}
auto cpStep = [&](const float* h, int cb0, int* codes) {
memset(ckv.data(), 0, ckv.size()*4); // reset KV caches
for(int step=0;step<17;step++){
const float*emb;
if(step==0) emb=h;
else if(step==1) emb=codecEmb.data()+std::min(std::max(cb0,0),VOCAB-1)*DIM;
else emb=cpEmbs.data()+((long)(step-2)*CB_SIZE+std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM;
auto prep=executorch::extension::prepare_input_tensors(*cMethod);
if(!prep.ok()) break;
memcpy(cMethod->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
float*mp=cMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
for(int p=0;p<C_KV_LEN;p++) mp[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
memcpy(cMethod->mutable_input(2).toTensor().mutable_data_ptr<float>(), cCos.data()+step*C_HD, C_HD*4);
memcpy(cMethod->mutable_input(3).toTensor().mutable_data_ptr<float>(), cSin.data()+step*C_HD, C_HD*4);
for(int i=0;i<C_L;i++){
memcpy(cMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), ckv.data()+(i*2)*ckvElem, ckvElem*4);
memcpy(cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), ckv.data()+(i*2+1)*ckvElem, ckvElem*4);
}
// Write directly to cached pointers (no prepare_input_tensors!)
memcpy(cpInEmb, emb, DIM*4);
for(int p=0;p<C_KV_LEN;p++) cpInMask[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
memcpy(cpInCos, cCos.data()+step*C_HD, C_HD*4);
memcpy(cpInSin, cSin.data()+step*C_HD, C_HD*4);
for(int i=0;i<C_L*2;i++) memcpy(cpInKV[i], ckv.data()+i*ckvElem, ckvElem*4);
auto status=cMethod->execute();
if(status!=Error::Ok) break;
const float*ho=cMethod->get_output(0).toTensor().const_data_ptr<float>();