diff --git a/executorch-custom/cp_et_runner.cpp b/executorch-custom/cp_et_runner.cpp new file mode 100644 index 0000000..7ebbf5b --- /dev/null +++ b/executorch-custom/cp_et_runner.cpp @@ -0,0 +1,222 @@ +/** + * TTS Code Predictor Runner — ExecuTorch .pte on NPU HTP. + * Based on executor_runner.cpp but with socket IPC for the app. + * Same protocol as the GGUF CP runner. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +DEFINE_string(model_path, "", "Path to .pte file"); +DEFINE_string(sock_path, "/data/local/tmp/kazeia/cp_et.sock", "Socket path"); +DEFINE_int32(tcp_port, 8790, "TCP port (0=disabled, use unix socket)"); +DEFINE_string(heads_path, "/data/local/tmp/kazeia/models/cp_heads.bin", "Heads file"); +DEFINE_string(embs_path, "/data/local/tmp/kazeia/models/cp_codec_embs.bin", "Codec embs file"); +DEFINE_string(cos_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_cos.npy", "Cos file"); +DEFINE_string(sin_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_sin.npy", "Sin file"); + +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +static const int N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16, N_L=5; + +static bool read_exact(int fd,void*buf,size_t n){ + size_t d=0;while(dmethod_meta("forward"); + ET_CHECK_MSG(method_meta.ok(), "Failed to get method meta"); + + std::vector> planned_bufs; + std::vector> planned_spans; + size_t n_planned = method_meta->num_memory_planned_buffers(); + for (size_t id = 0; id < n_planned; id++) { + size_t sz = (size_t)method_meta->memory_planned_buffer_size(id).get(); + planned_bufs.push_back(std::make_unique(sz)); + planned_spans.push_back({planned_bufs.back().get(), sz}); + } + HierarchicalAllocator planned_memory({planned_spans.data(), planned_spans.size()}); + MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator); + + // Load method + auto method = program->load_method("forward", &memory_manager); + ET_CHECK_MSG(method.ok(), "Failed to load method: 0x%x", (int)method.error()); + + auto meta = method->method_meta(); + fprintf(stderr, "CP_ET: %zu inputs, %zu outputs\n", meta.num_inputs(), meta.num_outputs()); + + // Load heads, embeddings, rotary + float* heads = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4); + float* embs_data = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4); + FILE* fh = fopen(FLAGS_heads_path.c_str(), "rb"); + if (fh) { fread(heads, 4, N_CB*N_VOCAB*N_EMBD, fh); fclose(fh); } + FILE* fe = fopen(FLAGS_embs_path.c_str(), "rb"); + if (fe) { fread(embs_data, 4, N_CB*N_VOCAB*N_EMBD, fe); fclose(fe); } + float* rcos = load_npy(FLAGS_cos_path.c_str(), 17*HD); + float* rsin = load_npy(FLAGS_sin_path.c_str(), 17*HD); + + // Socket setup — TCP if tcp_port > 0, else Unix domain socket + int srv; + if (FLAGS_tcp_port > 0) { + srv = socket(AF_INET, SOCK_STREAM, 0); + int opt = 1; setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + struct sockaddr_in taddr = {}; taddr.sin_family = AF_INET; + taddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + taddr.sin_port = htons(FLAGS_tcp_port); + bind(srv, (struct sockaddr*)&taddr, sizeof(taddr)); + listen(srv, 2); + fprintf(stderr, "CP_ET READY on tcp://127.0.0.1:%d\n", FLAGS_tcp_port); + } else { + unlink(FLAGS_sock_path.c_str()); + srv = socket(AF_UNIX, SOCK_STREAM, 0); + struct sockaddr_un addr = {}; addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, FLAGS_sock_path.c_str(), sizeof(addr.sun_path)-1); + bind(srv, (struct sockaddr*)&addr, sizeof(addr)); + chmod(FLAGS_sock_path.c_str(), 0666); + listen(srv, 1); + fprintf(stderr, "CP_ET READY on %s\n", FLAGS_sock_path.c_str()); + } + + while (true) { + int cli = accept(srv, nullptr, nullptr); + if (cli < 0) break; + + float input[2 * N_EMBD]; + while (read_exact(cli, input, sizeof(input))) { + auto t0 = std::chrono::high_resolution_clock::now(); + float* hidden_in = input; + float* cb0_emb = input + N_EMBD; + int kv_elem = N_KV * KV_LEN * HD; + std::vector kv(N_L * 2 * kv_elem, 0.0f); + int codes[N_CB] = {}; + float* emb = hidden_in; + + for (int step = 0; step < 17; step++) { + if (step == 1) emb = cb0_emb; + else if (step >= 2) emb = embs_data + ((step-2)*N_VOCAB + codes[step-2]) * N_EMBD; + + // Prepare input tensors (allocates buffers matching the method's expectations) + auto prep = executorch::extension::prepare_input_tensors(method.get()); + if (!prep.ok()) { fprintf(stderr, "prep fail %d\n", step); break; } + + // Copy our data into the prepared tensors + // Input 0: emb [1,1,1024] + memcpy(method->mutable_input(0).toTensor().mutable_data_ptr(), emb, N_EMBD*4); + // Input 1: mask [1,1,1,16] + float* mp = method->mutable_input(1).toTensor().mutable_data_ptr(); + for (int p = 0; p < KV_LEN; p++) mp[p] = (p >= KV_LEN-1-step) ? 0.0f : -1e9f; + // Input 2: cos [1,1,128] + memcpy(method->mutable_input(2).toTensor().mutable_data_ptr(), rcos+step*HD, HD*4); + // Input 3: sin [1,1,128] + memcpy(method->mutable_input(3).toTensor().mutable_data_ptr(), rsin+step*HD, HD*4); + // Inputs 4-13: KV caches [1,8,16,128] + for (int l = 0; l < N_L; l++) { + memcpy(method->mutable_input(4+l*2).toTensor().mutable_data_ptr(), + kv.data()+(l*2)*kv_elem, kv_elem*4); + memcpy(method->mutable_input(5+l*2).toTensor().mutable_data_ptr(), + kv.data()+(l*2+1)*kv_elem, kv_elem*4); + } + + auto status = method->execute(); + if (status != Error::Ok) { + fprintf(stderr, "exec fail step %d: %d\n", step, (int)status); + break; + } + + // Get hidden output + const float* h = method->get_output(0).toTensor().const_data_ptr(); + + // Head argmax on CPU + if (step >= 1 && step-1 < N_CB) { + int cb = step - 1; + const float* W = heads + cb * N_VOCAB * N_EMBD; + int best = 0; float bv = -1e30f; + for (int j = 0; j < N_VOCAB; j++) { + float dot = 0; + for (int k = 0; k < N_EMBD; k++) dot += h[k] * W[j*N_EMBD+k]; + if (dot > bv) { bv = dot; best = j; } + } + codes[cb] = best; + } + + // Update KV caches from outputs + for (int l = 0; l < N_L; l++) { + const float* ko = method->get_output(1+l*2).toTensor().const_data_ptr(); + const float* vo = method->get_output(2+l*2).toTensor().const_data_ptr(); + memcpy(kv.data()+(l*2)*kv_elem, ko, kv_elem*4); + memcpy(kv.data()+(l*2+1)*kv_elem, vo, kv_elem*4); + } + } + + auto t1 = std::chrono::high_resolution_clock::now(); + float ms = std::chrono::duration(t1-t0).count(); + write_exact(cli, codes, sizeof(codes)); + write_exact(cli, &ms, sizeof(ms)); + } + close(cli); + } + + free(heads); free(embs_data); free(rcos); free(rsin); + close(srv); unlink(FLAGS_sock_path.c_str()); + return 0; +} diff --git a/executorch-custom/cp_et_test_client.cpp b/executorch-custom/cp_et_test_client.cpp new file mode 100644 index 0000000..063ae0a --- /dev/null +++ b/executorch-custom/cp_et_test_client.cpp @@ -0,0 +1,122 @@ +/** + * CP ET Test Client — reads batch input file, sends to cp_et_runner socket, + * collects output codes. Runs ON DEVICE as root to avoid adb forward issues. + * + * Usage: cp_et_test_client --input=/path/input.bin --output=/path/output.bin + * --sock_path=/data/local/tmp/kazeia/cp_et.sock + * + * Input format: int32 n_frames, then per frame: float32[1024] hidden + float32[1024] cb0_emb + * Output format: int32 n_frames, then per frame: int32[15] codes + float32 timing_ms + */ +#include +#include +#include +#include +#include +#include +#include + +static bool read_exact(int fd, void* buf, size_t n) { + size_t d = 0; + while (d < n) { + ssize_t r = read(fd, (char*)buf + d, n - d); + if (r <= 0) return false; + d += r; + } + return true; +} + +static bool write_exact(int fd, const void* buf, size_t n) { + size_t d = 0; + while (d < n) { + ssize_t r = write(fd, (const char*)buf + d, n - d); + if (r <= 0) return false; + d += r; + } + return true; +} + +int main(int argc, char** argv) { + const char* input_path = nullptr; + const char* output_path = nullptr; + const char* sock_path = "/data/local/tmp/kazeia/cp_et.sock"; + + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--input=", 8) == 0) input_path = argv[i] + 8; + else if (strncmp(argv[i], "--output=", 9) == 0) output_path = argv[i] + 9; + else if (strncmp(argv[i], "--sock_path=", 12) == 0) sock_path = argv[i] + 12; + } + + if (!input_path || !output_path) { + fprintf(stderr, "Usage: %s --input=in.bin --output=out.bin [--sock_path=...]\n", argv[0]); + return 1; + } + + // Read input file + FILE* fin = fopen(input_path, "rb"); + if (!fin) { fprintf(stderr, "Cannot open %s\n", input_path); return 1; } + + int32_t n_frames; + fread(&n_frames, 4, 1, fin); + fprintf(stderr, "Frames: %d\n", n_frames); + + const int N_EMBD = 1024; + float* inputs = (float*)malloc(n_frames * 2 * N_EMBD * sizeof(float)); + fread(inputs, sizeof(float), n_frames * 2 * N_EMBD, fin); + fclose(fin); + + // Connect to socket + int sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { perror("socket"); return 1; } + + struct sockaddr_un addr = {}; + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1); + + if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + perror("connect"); + return 1; + } + fprintf(stderr, "Connected to %s\n", sock_path); + + // Process frames + FILE* fout = fopen(output_path, "wb"); + fwrite(&n_frames, 4, 1, fout); + + float total_ms = 0; + for (int i = 0; i < n_frames; i++) { + float* frame = inputs + i * 2 * N_EMBD; + + // Send 8192 bytes + if (!write_exact(sock, frame, 2 * N_EMBD * sizeof(float))) { + fprintf(stderr, "Write failed at frame %d\n", i); + break; + } + + // Read 64 bytes: 15 ints + 1 float + int32_t codes[15]; + float timing; + if (!read_exact(sock, codes, sizeof(codes))) { + fprintf(stderr, "Read codes failed at frame %d\n", i); + break; + } + if (!read_exact(sock, &timing, sizeof(timing))) { + fprintf(stderr, "Read timing failed at frame %d\n", i); + break; + } + + fwrite(codes, sizeof(int32_t), 15, fout); + fwrite(&timing, sizeof(float), 1, fout); + total_ms += timing; + + fprintf(stderr, " Frame %d: %.1fms codes=[%d,%d,%d,...]\n", + i, timing, codes[0], codes[1], codes[2]); + } + + fclose(fout); + close(sock); + free(inputs); + + fprintf(stderr, "Done! Total: %.0fms (%.1fms/frame)\n", total_ms, total_ms / n_frames); + return 0; +} diff --git a/executorch-custom/tts_pipeline_jni.cpp b/executorch-custom/tts_pipeline_jni.cpp new file mode 100644 index 0000000..4c4f0c6 --- /dev/null +++ b/executorch-custom/tts_pipeline_jni.cpp @@ -0,0 +1,367 @@ +/** + * Native TTS pipeline: talker + CP autoregressive loop in C++. + * One JNI call runs the entire generation → returns all codebook codes. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define TAG "TtsPipeline" +#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) + +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::Program; +using executorch::runtime::Span; + +static const int DIM=1024, VOCAB=3072, CB_SIZE=2048, NUM_CB=16; +static const int T_L=28, T_KV=8, T_HD=128, T_KV_LEN=100; +static const int C_L=5, C_KV=8, C_HD=128, C_KV_LEN=16; +static const int CODEC_EOS=2150; + +static inline float dot_neon(const float* a, const float* b, int n) { + float32x4_t s0=vdupq_n_f32(0),s1=vdupq_n_f32(0),s2=vdupq_n_f32(0),s3=vdupq_n_f32(0); + int i=0; + for(;i+15bv){bv=d;best=j;}} + return best; +} + +// Top-k sampling with temperature +static int sample_topk(const float* logits, int vocab, float temp, int k) { + // Find top-k + struct IV { int i; float v; }; + std::vector topk(k, {0, -FLT_MAX}); + for (int i = 0; i < vocab; i++) { + if (logits[i] > topk[k-1].v) { + topk[k-1] = {i, logits[i]}; + // Bubble up + for (int j = k-2; j >= 0; j--) { + if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]); + else break; + } + } + } + // Softmax with temperature + float maxv = topk[0].v; + float sum = 0; + for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; } + // Sample + float r = (float)rand() / RAND_MAX * sum; + float acc = 0; + for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; } + return topk[0].i; +} + +struct PipelineState { + std::unique_ptr tLoader, cLoader; + std::unique_ptr tProg, cProg; + std::unique_ptr tMM, cMM; + Method* talker = nullptr; + Method* cp = nullptr; + std::vector> tBufs, cBufs; + bool loaded = false; +}; +static PipelineState* gState = nullptr; +static uint8_t tMethodPool[8*1024*1024], tTempPool[2*1024*1024]; +static uint8_t cMethodPool[4*1024*1024], cTempPool[1*1024*1024]; + +static Method* loadModel(const char* path, + std::unique_ptr& loader, + std::unique_ptr& program, + std::unique_ptr& mm, + std::vector>& bufs, + uint8_t* mp, size_t mps, uint8_t* tp, size_t tps) +{ + auto ld = executorch::extension::FileDataLoader::from(path); + if(!ld.ok()) return nullptr; + loader=std::make_unique(std::move(ld.get())); + auto prog=Program::load(&*loader); if(!prog.ok()) return nullptr; + program=std::make_unique(std::move(prog.get())); + auto meta=program->method_meta("forward"); if(!meta.ok()) return nullptr; + std::vector> spans; + for(size_t i=0;inum_memory_planned_buffers();i++){ + size_t sz=(size_t)meta->memory_planned_buffer_size(i).get(); + bufs.push_back(std::make_unique(sz)); + spans.push_back({bufs.back().get(),sz}); + } + auto*ma=new MemoryAllocator(mps,mp); + auto*ta=new MemoryAllocator(tps,tp); + auto*ha=new HierarchicalAllocator({spans.data(),spans.size()}); + mm=std::unique_ptr(new MemoryManager(ma,ha,ta)); + auto method=program->load_method("forward",mm.get()); + if(!method.ok()) return nullptr; + return new Method(std::move(method.get())); +} + +extern "C" { + +JNIEXPORT jboolean JNICALL +Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring jCP){ + executorch::runtime::runtime_init(); + if(gState&&gState->loaded) return JNI_TRUE; + const char*tp=env->GetStringUTFChars(jTP,nullptr); + const char*cp=env->GetStringUTFChars(jCP,nullptr); + gState=new PipelineState(); + LOGI("Loading talker+CP..."); + auto t0=std::chrono::high_resolution_clock::now(); + gState->talker=loadModel(tp,gState->tLoader,gState->tProg,gState->tMM,gState->tBufs,tMethodPool,sizeof(tMethodPool),tTempPool,sizeof(tTempPool)); + gState->cp=loadModel(cp,gState->cLoader,gState->cProg,gState->cMM,gState->cBufs,cMethodPool,sizeof(cMethodPool),cTempPool,sizeof(cTempPool)); + env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp); + if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;} + gState->loaded=true; + // Warmup both + {auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();} + {auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();} + auto t1=std::chrono::high_resolution_clock::now(); + LOGI("Loaded+warmup: %.0fms",std::chrono::duration(t1-t0).count()); + return JNI_TRUE; +} + +JNIEXPORT void JNICALL +Java_com_kazeia_tts_TtsPipeline_nativeDestroy(JNIEnv*,jclass){ + if(gState){delete gState->talker;delete gState->cp;delete gState;gState=nullptr;} +} + +// Helper: run one talker step +static void talkerStep(Method&m, const float*emb, float*mask, int pos, + const float*tCos, const float*tSin, float*tK, float*tV, + float*outHidden, float*outLogits) +{ + int kvElem = T_KV * T_KV_LEN * T_HD; + auto prep = executorch::extension::prepare_input_tensors(m); + memcpy(m.mutable_input(0).toTensor().mutable_data_ptr(), emb, DIM*4); + memcpy(m.mutable_input(1).toTensor().mutable_data_ptr(), mask, T_KV_LEN*4); + int pi = std::min(pos, 249); + memcpy(m.mutable_input(2).toTensor().mutable_data_ptr(), tCos+pi*T_HD, T_HD*4); + memcpy(m.mutable_input(3).toTensor().mutable_data_ptr(), tSin+pi*T_HD, T_HD*4); + for(int i=0;i(), tK+i*kvElem, kvElem*4); + memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr(), tV+i*kvElem, kvElem*4); + } + m.execute(); + memcpy(outHidden, m.get_output(0).toTensor().const_data_ptr(), DIM*4); + memcpy(outLogits, m.get_output(1).toTensor().const_data_ptr(), VOCAB*4); + for(int i=0;i(), kvElem*4); + memcpy(tV+i*kvElem, m.get_output(3+i*2).toTensor().const_data_ptr(), kvElem*4); + } +} + +// Helper: run full CP (17 steps) → 15 codes +static void cpForward(Method&m, const float*hidden, int cb0, + const float*codecEmb, const float*cpEmbs, const float*cpHeads, + const float*cCos, const float*cSin, int*codes) +{ + int kvElem = C_KV * C_KV_LEN * C_HD; + std::vector kv(C_L*2*kvElem, 0.0f); + + for(int step=0;step<17;step++){ + const float*emb; + if(step==0) emb=hidden; + else if(step==1) emb=codecEmb + std::min(std::max(cb0,0),VOCAB-1)*DIM; + else emb=cpEmbs + ((step-2)*CB_SIZE + std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM; + + auto prep=executorch::extension::prepare_input_tensors(m); + memcpy(m.mutable_input(0).toTensor().mutable_data_ptr(), emb, DIM*4); + // Mask + float*mp=m.mutable_input(1).toTensor().mutable_data_ptr(); + for(int p=0;p=C_KV_LEN-1-step)?0.0f:-1e9f; + memcpy(m.mutable_input(2).toTensor().mutable_data_ptr(), cCos+step*C_HD, C_HD*4); + memcpy(m.mutable_input(3).toTensor().mutable_data_ptr(), cSin+step*C_HD, C_HD*4); + for(int i=0;i(), kv.data()+(i*2)*kvElem, kvElem*4); + memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr(), kv.data()+(i*2+1)*kvElem, kvElem*4); + } + m.execute(); + const float*h=m.get_output(0).toTensor().const_data_ptr(); + if(step>=1&&step-1<15){ + codes[step-1]=argmax_head(h, cpHeads+(step-1)*CB_SIZE*DIM, CB_SIZE, DIM); + } + for(int i=0;i(), kvElem*4); + memcpy(kv.data()+(i*2+1)*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr(), kvElem*4); + } + } +} + +JNIEXPORT jintArray JNICALL +Java_com_kazeia_tts_TtsPipeline_nativeRun( + JNIEnv*env,jclass, + jfloatArray jPrefill,jint nPrefill, + jfloatArray jTrailing,jint nTrailing, + jfloatArray jCodecEmb, jfloatArray jCpEmbs, jfloatArray jCpHeads, + jfloatArray jTCos,jfloatArray jTSin, jfloatArray jCCos,jfloatArray jCSin, + jint maxTokens) +{ + if(!gState||!gState->loaded) return nullptr; + auto T0=std::chrono::high_resolution_clock::now(); + + // Copy all data from JNI (then release immediately) + int prefillSize = env->GetArrayLength(jPrefill); + std::vector prefill(prefillSize); + env->GetFloatArrayRegion(jPrefill, 0, prefillSize, prefill.data()); + + std::vector trailingData; + if(nTrailing>0){trailingData.resize(nTrailing*DIM);env->GetFloatArrayRegion(jTrailing,0,nTrailing*DIM,trailingData.data());} + + int codecSize=env->GetArrayLength(jCodecEmb); + std::vector codecEmb(codecSize); + env->GetFloatArrayRegion(jCodecEmb,0,codecSize,codecEmb.data()); + + int cpEmbsSize=env->GetArrayLength(jCpEmbs); + std::vector cpEmbs(cpEmbsSize); + env->GetFloatArrayRegion(jCpEmbs,0,cpEmbsSize,cpEmbs.data()); + + int headsSize=env->GetArrayLength(jCpHeads); + std::vector cpHeads(headsSize); + env->GetFloatArrayRegion(jCpHeads,0,headsSize,cpHeads.data()); + + int tcSize=env->GetArrayLength(jTCos); + std::vector tCos(tcSize),tSin(tcSize); + env->GetFloatArrayRegion(jTCos,0,tcSize,tCos.data()); + env->GetFloatArrayRegion(jTSin,0,tcSize,tSin.data()); + + int ccSize=env->GetArrayLength(jCCos); + std::vector cCos(ccSize),cSin(ccSize); + env->GetFloatArrayRegion(jCCos,0,ccSize,cCos.data()); + env->GetFloatArrayRegion(jCSin,0,ccSize,cSin.data()); + + // Pipeline state + int tkvElem=T_KV*T_KV_LEN*T_HD; + std::vector tK(T_L*tkvElem,0), tV(T_L*tkvElem,0); + float mask[T_KV_LEN]; for(int i=0;i allCodes; // flat: numTokens × 16 + std::vector cb0History; + int pos=0, currentCb0=-1; + + // ===== PREFILL ===== + auto tP0=std::chrono::high_resolution_clock::now(); + for(int step=0;step=0) mask[mi]=0.0f; + talkerStep(*gState->talker, prefill.data()+step*DIM, mask, pos, tCos.data(),tSin.data(), + tK.data(), tV.data(), hidden, logits); + pos++; + if(step==nPrefill-1){ + for(int j=CB_SIZE;j(tP1-tP0).count(), nPrefill, currentCb0); + + if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);} + + // ===== GENERATION ===== + float totalTalkerMs=0, totalCpMs=0; + int trailingIdx=0; + // Pad embedding (zeros + pad token is not available here, use zeros) + float padEmb[DIM]={}; // In practice should be tts_pad_embed, passed as param + + for(int gen=0;gencp, hidden, currentCb0, codecEmb.data(), cpEmbs.data(), cpHeads.data(), + cCos.data(), cSin.data(), cpCodes); + auto tc1=std::chrono::high_resolution_clock::now(); + totalCpMs+=std::chrono::duration(tc1-tc0).count(); + for(int i=0;i<15;i++) codes[i+1]=cpCodes[i]; + + for(int i=0;i=0) mask[mi]=0.0f; + auto tt0=std::chrono::high_resolution_clock::now(); + talkerStep(*gState->talker, nextEmb, mask, pos, tCos.data(),tSin.data(), + tK.data(), tV.data(), hidden, logits); + auto tt1=std::chrono::high_resolution_clock::now(); + totalTalkerMs+=std::chrono::duration(tt1-tt0).count(); + pos++; + + // Sample next cb0 (suppress non-codec, repetition penalty) + for(int j=CB_SIZE;j seen(cb0History.begin(),cb0History.end()); + for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f; + int nextCb0=sample_topk(logits,VOCAB,0.9f,50); + + if(nextCb0==CODEC_EOS){LOGI("EOS at step %d",gen+2);break;} + // Degeneration check + int histSz=(int)cb0History.size(); + if(histSz>=9){ + bool degen=true; + for(int i=histSz-9;i(T1-T0).count()); + + jintArray result=env->NewIntArray((int)allCodes.size()); + env->SetIntArrayRegion(result,0,(int)allCodes.size(),allCodes.data()); + return result; +} + +} // extern "C" diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index 1e17c20..3d7d866 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -130,6 +130,7 @@ class Qwen3TtsEngine( private var cpAllHeads: FloatArray? = null // all 15 heads concatenated [15*2048*1024] for batch NEON private var cpPteModule: org.pytorch.executorch.Module? = null // ExecuTorch CP on NPU (JNI) private var talkerPteModule: org.pytorch.executorch.Module? = null // ExecuTorch talker on NPU (JNI) + private var nativePipelineReady: Boolean = false // C++ native pipeline available private var talkerPteRotaryCos: FloatArray? = null private var talkerPteRotarySin: FloatArray? = null private var useEtCp: Boolean = false // CP via ExecuTorch runner process (root) @@ -307,6 +308,19 @@ class Qwen3TtsEngine( cpPteModule!!.forward(*cIns.toTypedArray()) nlog("CP warmup: ${System.currentTimeMillis() - cw}ms") } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") } + + // Init native C++ pipeline (loads models with own ExecuTorch runtime) + try { + val tn = System.currentTimeMillis() + nativePipelineReady = TtsPipeline.nativeInit( + "/data/local/tmp/kazeia/models/talker_transformer_fp16.pte", + "/data/local/tmp/kazeia/models/cp_transformer_fp16.pte" + ) + nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)") + } catch (e: Exception) { + nlog("Native pipeline init failed: ${e.message}") + nativePipelineReady = false + } } } catch (e: Exception) { nlog("Talker .pte JNI failed: ${e.message}") @@ -2272,23 +2286,47 @@ class Qwen3TtsEngine( val embeds = Array(nTotal) { FloatArray(TALKER_DIM).also { arr -> for (j in 0 until TALKER_DIM) arr[j] = bb.float } } nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)") - // Build textEmbedsList: first nPrefill are "prefill", rest are trailing - val textEmbedsList = embeds.toList() - val allCodesArray = runInterleavedPte(textEmbedsList.subList(0, 1), maxGenTokens = nTotal - nPrefill) - // Note: runInterleavedPte handles prefill internally via buildPrefillEmbeddings + val allCodes: Array + if (nativePipelineReady) { + // Native C++ pipeline — zero Java overhead + val prefillFlat = FloatArray(nPrefill * TALKER_DIM) + for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM) + val nTrailing = nTotal - nPrefill + val trailingFlat = if (nTrailing > 0) FloatArray(nTrailing * TALKER_DIM).also { arr -> + for (i in 0 until nTrailing) System.arraycopy(embeds[nPrefill + i], 0, arr, i * TALKER_DIM, TALKER_DIM) + } else null - // Actually, generateFromEmbeds uses pre-computed embeds directly. - // Let's use runInterleavedPte properly by passing all embeds as textEmbedsList - // runInterleavedPte's buildPrefillEmbeddings will create the prefill from first embed - // But we need a different approach: pass all embeds directly. + // Load CP heads if not already + if (cpAllHeads == null) { + val headsFile = java.io.File("/data/local/tmp/kazeia/models/cp_heads.bin") + if (headsFile.exists()) { + val hb = headsFile.readBytes() + cpAllHeads = FloatArray(hb.size / 4) + ByteBuffer.wrap(hb).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(cpAllHeads!!) + } + } - // Simpler: use the run_pipeline code path which goes through generateSpeech → runInterleavedGeneration - // For now, let's just call runInterleavedPte with the right embeds structure - val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList() - val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList() - - // Call the PTE pipeline directly - val allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill) + nlog("Running native C++ pipeline...") + val flat = TtsPipeline.nativeRun( + prefillFlat, nPrefill, + trailingFlat, nTrailing, + codecEmbedding ?: FloatArray(0), + cpEmbeddings ?: FloatArray(0), + cpAllHeads ?: FloatArray(0), + talkerPteRotaryCos ?: FloatArray(0), talkerPteRotarySin ?: FloatArray(0), + cpRotaryCos ?: FloatArray(0), cpRotarySin ?: FloatArray(0), + nTotal - nPrefill + ) + if (flat == null || flat.isEmpty()) return ShortArray(0) + val nTokens = flat.size / NUM_CODEBOOKS + allCodes = Array(nTokens) { t -> IntArray(NUM_CODEBOOKS) { cb -> flat[t * NUM_CODEBOOKS + cb] } } + nlog("Native pipeline: $nTokens tokens") + } else { + // Fallback: Java pipeline + val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList() + val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList() + allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill) + } if (allCodes.isEmpty()) return ShortArray(0) val numRealTokens = allCodes.size diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/TtsPipeline.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/TtsPipeline.kt new file mode 100644 index 0000000..c12035e --- /dev/null +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/TtsPipeline.kt @@ -0,0 +1,27 @@ +package com.kazeia.tts + +/** Native C++ TTS pipeline: talker + CP loop with zero Java overhead. */ +object TtsPipeline { + init { System.loadLibrary("tts_pipeline") } + + /** Load and warmup both .pte models. Returns true on success. */ + external fun nativeInit(talkerPath: String, cpPath: String): Boolean + + /** Release native resources. */ + external fun nativeDestroy() + + /** + * Run full pipeline. Returns flat int array: [numTokens × 16] codebook codes. + * Each group of 16 ints = [CB0, CB1, ..., CB15] for one time step. + */ + external fun nativeRun( + prefillEmbeds: FloatArray, nPrefill: Int, + trailingEmbeds: FloatArray?, nTrailing: Int, + codecEmbedding: FloatArray, + cpEmbeddings: FloatArray, + cpHeads: FloatArray, + talkerCos: FloatArray, talkerSin: FloatArray, + cpCos: FloatArray, cpSin: FloatArray, + maxTokens: Int + ): IntArray? +} diff --git a/kazeia-android/app/src/main/jni/CMakeLists.txt b/kazeia-android/app/src/main/jni/CMakeLists.txt index 7016dcc..3ad0378 100644 --- a/kazeia-android/app/src/main/jni/CMakeLists.txt +++ b/kazeia-android/app/src/main/jni/CMakeLists.txt @@ -37,6 +37,10 @@ target_include_directories(whisper_jni PRIVATE target_link_libraries(whisper_jni whisper ggml ggml-base ggml-cpu android log) target_compile_options(whisper_jni PRIVATE -std=c++17 -O2) +# --- TTS Pipeline: built externally via ExecuTorch cmake, copied to jniLibs --- +# Build with: cd /opt/Kazeia/executorch/build-android && cmake --build . --target tts_pipeline_jni -j$(nproc) +# Then copy to jniLibs/arm64-v8a/libtts_pipeline.so + # --- NEON optimized ops for TTS heads --- add_library(neon_ops SHARED neon_ops.cpp) target_link_libraries(neon_ops log)