diff --git a/executorch-custom/jni_layer_tts.cpp b/executorch-custom/jni_layer_tts.cpp index 16a091e..b5fa121 100644 --- a/executorch-custom/jni_layer_tts.cpp +++ b/executorch-custom/jni_layer_tts.cpp @@ -728,21 +728,30 @@ ExecuTorchJni::runTtsPipelineImpl( Method* tMethod = tMethodRes.get(); Method* cMethod = cMethodRes.get(); - // Talker step: prepare_input_tensors + memcpy + execute (like cp_et_runner) + // Talker: prepare once, cache pointers, reuse for all 58+ steps + {auto prep=executorch::extension::prepare_input_tensors(*tMethod);} + float* tInEmb = tMethod->mutable_input(0).toTensor().mutable_data_ptr(); + float* tInMask = tMethod->mutable_input(1).toTensor().mutable_data_ptr(); + float* tInCos = tMethod->mutable_input(2).toTensor().mutable_data_ptr(); + float* tInSin = tMethod->mutable_input(3).toTensor().mutable_data_ptr(); + float* tInKV[T_L*2]; + for(int i=0;imutable_input(4+i*2).toTensor().mutable_data_ptr(); + tInKV[i*2+1] = tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr(); + } + auto talkerStep = [&](const float* emb) { int pi=std::min(pos,249); int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1); if(mi>=0) mask[mi]=0.0f; - auto prep = executorch::extension::prepare_input_tensors(*tMethod); - if(!prep.ok()){ET_LOG(Error,"Talker prep fail");return;} - memcpy(tMethod->mutable_input(0).toTensor().mutable_data_ptr(), emb, DIM*4); - memcpy(tMethod->mutable_input(1).toTensor().mutable_data_ptr(), mask, T_KV_LEN*4); - memcpy(tMethod->mutable_input(2).toTensor().mutable_data_ptr(), tCos.data()+pi*T_HD, T_HD*4); - memcpy(tMethod->mutable_input(3).toTensor().mutable_data_ptr(), tSin.data()+pi*T_HD, T_HD*4); + memcpy(tInEmb, emb, DIM*4); + memcpy(tInMask, mask, T_KV_LEN*4); + memcpy(tInCos, tCos.data()+pi*T_HD, T_HD*4); + memcpy(tInSin, tSin.data()+pi*T_HD, T_HD*4); for(int i=0;imutable_input(4+i*2).toTensor().mutable_data_ptr(), tK.data()+i*tkvElem, tkvElem*4); - memcpy(tMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr(), tV.data()+i*tkvElem, tkvElem*4); + memcpy(tInKV[i*2], tK.data()+i*tkvElem, tkvElem*4); + memcpy(tInKV[i*2+1], tV.data()+i*tkvElem, tkvElem*4); } auto status = tMethod->execute(); if(status!=Error::Ok){ET_LOG(Error,"Talker exec fail: %d",(int)status);return;} @@ -755,27 +764,37 @@ ExecuTorchJni::runTtsPipelineImpl( pos++; }; - // CP step: 17 autoregressive steps using Method directly + // CP step: 17 autoregressive steps with cached input pointers + // prepare_input_tensors called ONCE, then reuse pointers for all 17×58 steps + int ckvElem=C_KV*C_KV_LEN*C_HD; + std::vector ckv(C_L*2*ckvElem,0); + {auto prep=executorch::extension::prepare_input_tensors(*cMethod);} // first alloc + // Cache input data pointers (stable after prepare) + float* cpInEmb = cMethod->mutable_input(0).toTensor().mutable_data_ptr(); + float* cpInMask = cMethod->mutable_input(1).toTensor().mutable_data_ptr(); + float* cpInCos = cMethod->mutable_input(2).toTensor().mutable_data_ptr(); + float* cpInSin = cMethod->mutable_input(3).toTensor().mutable_data_ptr(); + float* cpInKV[C_L*2]; + for(int i=0;imutable_input(4+i*2).toTensor().mutable_data_ptr(); + cpInKV[i*2+1] = cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr(); + } + auto cpStep = [&](const float* h, int cb0, int* codes) { - int ckvElem=C_KV*C_KV_LEN*C_HD; - std::vector ckv(C_L*2*ckvElem,0); + memset(ckv.data(), 0, ckv.size()*4); // reset KV caches for(int step=0;step<17;step++){ const float*emb; if(step==0) emb=h; else if(step==1) emb=codecEmb.data()+std::min(std::max(cb0,0),VOCAB-1)*DIM; else emb=cpEmbs.data()+((long)(step-2)*CB_SIZE+std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM; - auto prep=executorch::extension::prepare_input_tensors(*cMethod); - if(!prep.ok()) break; - memcpy(cMethod->mutable_input(0).toTensor().mutable_data_ptr(), emb, DIM*4); - float*mp=cMethod->mutable_input(1).toTensor().mutable_data_ptr(); - for(int p=0;p=C_KV_LEN-1-step)?0.0f:-1e9f; - memcpy(cMethod->mutable_input(2).toTensor().mutable_data_ptr(), cCos.data()+step*C_HD, C_HD*4); - memcpy(cMethod->mutable_input(3).toTensor().mutable_data_ptr(), cSin.data()+step*C_HD, C_HD*4); - for(int i=0;imutable_input(4+i*2).toTensor().mutable_data_ptr(), ckv.data()+(i*2)*ckvElem, ckvElem*4); - memcpy(cMethod->mutable_input(5+i*2).toTensor().mutable_data_ptr(), ckv.data()+(i*2+1)*ckvElem, ckvElem*4); - } + // Write directly to cached pointers (no prepare_input_tensors!) + memcpy(cpInEmb, emb, DIM*4); + for(int p=0;p=C_KV_LEN-1-step)?0.0f:-1e9f; + memcpy(cpInCos, cCos.data()+step*C_HD, C_HD*4); + memcpy(cpInSin, cSin.data()+step*C_HD, C_HD*4); + for(int i=0;iexecute(); if(status!=Error::Ok) break; const float*ho=cMethod->get_output(0).toTensor().const_data_ptr();