Optimize CP+talker: eliminate prepare_input_tensors per step
Cache input tensor pointers after first prepare_input_tensors call, then memcpy directly into them for all subsequent steps. Eliminates ~14000 mallocs per pipeline run (986 CP + 58 talker calls). Generation: 4640ms → 4007ms (-633ms), total RTF: 1.6 → 1.51 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e647911329
commit
14f7e5b05f
|
|
@ -728,21 +728,30 @@ ExecuTorchJni::runTtsPipelineImpl(
|
|||
Method* tMethod = tMethodRes.get();
|
||||
Method* cMethod = cMethodRes.get();
|
||||
|
||||
// Talker step: prepare_input_tensors + memcpy + execute (like cp_et_runner)
|
||||
// Talker: prepare once, cache pointers, reuse for all 58+ steps
|
||||
{auto prep=executorch::extension::prepare_input_tensors(*tMethod);}
|
||||
float* tInEmb = tMethod->mutable_input(0).toTensor().mutable_data_ptr<float>();
|
||||
float* tInMask = tMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
|
||||
float* tInCos = tMethod->mutable_input(2).toTensor().mutable_data_ptr<float>();
|
||||
float* tInSin = tMethod->mutable_input(3).toTensor().mutable_data_ptr<float>();
|
||||
float* tInKV[T_L*2];
|
||||
for(int i=0;i<T_L;i++){
|
||||
tInKV[i*2] = tMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>();
|
||||
tInKV[i*2+1] = tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>();
|
||||
}
|
||||
|
||||
auto talkerStep = [&](const float* emb) {
|
||||
int pi=std::min(pos,249);
|
||||
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
||||
if(mi>=0) mask[mi]=0.0f;
|
||||
|
||||
auto prep = executorch::extension::prepare_input_tensors(*tMethod);
|
||||
if(!prep.ok()){ET_LOG(Error,"Talker prep fail");return;}
|
||||
memcpy(tMethod->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
|
||||
memcpy(tMethod->mutable_input(1).toTensor().mutable_data_ptr<float>(), mask, T_KV_LEN*4);
|
||||
memcpy(tMethod->mutable_input(2).toTensor().mutable_data_ptr<float>(), tCos.data()+pi*T_HD, T_HD*4);
|
||||
memcpy(tMethod->mutable_input(3).toTensor().mutable_data_ptr<float>(), tSin.data()+pi*T_HD, T_HD*4);
|
||||
memcpy(tInEmb, emb, DIM*4);
|
||||
memcpy(tInMask, mask, T_KV_LEN*4);
|
||||
memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4);
|
||||
memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4);
|
||||
for(int i=0;i<T_L;i++){
|
||||
memcpy(tMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), tK.data()+i*tkvElem, tkvElem*4);
|
||||
memcpy(tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), tV.data()+i*tkvElem, tkvElem*4);
|
||||
memcpy(tInKV[i*2], tK.data()+i*tkvElem, tkvElem*4);
|
||||
memcpy(tInKV[i*2+1], tV.data()+i*tkvElem, tkvElem*4);
|
||||
}
|
||||
auto status = tMethod->execute();
|
||||
if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;}
|
||||
|
|
@ -755,27 +764,37 @@ ExecuTorchJni::runTtsPipelineImpl(
|
|||
pos++;
|
||||
};
|
||||
|
||||
// CP step: 17 autoregressive steps using Method directly
|
||||
auto cpStep = [&](const float* h, int cb0, int* codes) {
|
||||
// CP step: 17 autoregressive steps with cached input pointers
|
||||
// prepare_input_tensors called ONCE, then reuse pointers for all 17×58 steps
|
||||
int ckvElem=C_KV*C_KV_LEN*C_HD;
|
||||
std::vector<float> ckv(C_L*2*ckvElem,0);
|
||||
{auto prep=executorch::extension::prepare_input_tensors(*cMethod);} // first alloc
|
||||
// Cache input data pointers (stable after prepare)
|
||||
float* cpInEmb = cMethod->mutable_input(0).toTensor().mutable_data_ptr<float>();
|
||||
float* cpInMask = cMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
|
||||
float* cpInCos = cMethod->mutable_input(2).toTensor().mutable_data_ptr<float>();
|
||||
float* cpInSin = cMethod->mutable_input(3).toTensor().mutable_data_ptr<float>();
|
||||
float* cpInKV[C_L*2];
|
||||
for(int i=0;i<C_L;i++){
|
||||
cpInKV[i*2] = cMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>();
|
||||
cpInKV[i*2+1] = cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>();
|
||||
}
|
||||
|
||||
auto cpStep = [&](const float* h, int cb0, int* codes) {
|
||||
memset(ckv.data(), 0, ckv.size()*4); // reset KV caches
|
||||
for(int step=0;step<17;step++){
|
||||
const float*emb;
|
||||
if(step==0) emb=h;
|
||||
else if(step==1) emb=codecEmb.data()+std::min(std::max(cb0,0),VOCAB-1)*DIM;
|
||||
else emb=cpEmbs.data()+((long)(step-2)*CB_SIZE+std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM;
|
||||
|
||||
auto prep=executorch::extension::prepare_input_tensors(*cMethod);
|
||||
if(!prep.ok()) break;
|
||||
memcpy(cMethod->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
|
||||
float*mp=cMethod->mutable_input(1).toTensor().mutable_data_ptr<float>();
|
||||
for(int p=0;p<C_KV_LEN;p++) mp[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
|
||||
memcpy(cMethod->mutable_input(2).toTensor().mutable_data_ptr<float>(), cCos.data()+step*C_HD, C_HD*4);
|
||||
memcpy(cMethod->mutable_input(3).toTensor().mutable_data_ptr<float>(), cSin.data()+step*C_HD, C_HD*4);
|
||||
for(int i=0;i<C_L;i++){
|
||||
memcpy(cMethod->mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), ckv.data()+(i*2)*ckvElem, ckvElem*4);
|
||||
memcpy(cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), ckv.data()+(i*2+1)*ckvElem, ckvElem*4);
|
||||
}
|
||||
// Write directly to cached pointers (no prepare_input_tensors!)
|
||||
memcpy(cpInEmb, emb, DIM*4);
|
||||
for(int p=0;p<C_KV_LEN;p++) cpInMask[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
|
||||
memcpy(cpInCos, cCos.data()+step*C_HD, C_HD*4);
|
||||
memcpy(cpInSin, cSin.data()+step*C_HD, C_HD*4);
|
||||
for(int i=0;i<C_L*2;i++) memcpy(cpInKV[i], ckv.data()+i*ckvElem, ckvElem*4);
|
||||
|
||||
auto status=cMethod->execute();
|
||||
if(status!=Error::Ok) break;
|
||||
const float*ho=cMethod->get_output(0).toTensor().const_data_ptr<float>();
|
||||
|
|
|
|||
Loading…
Reference in New Issue