// NOTE(review): this copy of the file appears to be extraction-damaged.
//   * Line breaks are collapsed, so "//" comments now run into the code that
//     originally followed them on the next line.
//   * Text that sat between angle brackets seems to be missing: the #include
//     targets, template arguments (on std::vector, std::make_unique,
//     mutable_data_ptr, const_data_ptr, std::chrono::duration, ...), and a whole
//     span running from inside read_exact()'s loop to the
//     method_meta("forward") call in main().
//   * write_exact() and load_npy() are called below but their definitions are
//     not visible here; neither are method_allocator, temp_allocator or program.
// Restore the file from version control before building. The notes below
// describe only what the visible text shows.
//
// Flags (gflags-style DEFINE_string): model_path (.pte file, default empty),
// sock_path (Unix socket path), heads_path, embs_path, cos_path, sin_path.
//
// Constants: N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16,
// N_L=5.
//
// Visible setup in main():
//   * One heap buffer is allocated per memory-planned buffer reported by the
//     "forward" method meta; the buffers back a HierarchicalAllocator, which is
//     combined with method_allocator and temp_allocator into a MemoryManager.
//   * program->load_method("forward", ...) is checked with ET_CHECK_MSG, then
//     the input/output counts are logged to stderr.
//   * heads and embs_data are two malloc'd buffers of N_CB*N_VOCAB*N_EMBD*4
//     bytes each (125,829,120 bytes), filled with fread from heads_path and
//     embs_path when the file opens.
//   * rcos / rsin come from load_npy(path, 17*HD) and are indexed as
//     step*HD for step in [0, 17).
//   * A SOCK_STREAM AF_UNIX socket is bound at sock_path (any stale path is
//     unlinked first), chmod'ed to 0666, and put in listen() with backlog 1.
//
// Request loop (one client at a time; a client may send many requests):
//   * Request  = 2*N_EMBD floats: hidden_in followed by cb0_emb.
//   * Per request a zero-filled KV buffer of N_L*2*(N_KV*KV_LEN*HD) floats and
//     a zeroed codes[N_CB] array are created, then 17 steps run:
//       - step 0 feeds hidden_in, step 1 feeds cb0_emb, step >= 2 feeds the
//         embs_data row for codebook (step-2) at index codes[step-2];
//       - input 1 (mask) gets 0.0f at positions p >= KV_LEN-1-step and -1e9f
//         elsewhere;
//       - inputs 2 and 3 get HD floats of rcos / rsin at offset step*HD;
//       - inputs 4..13 are the per-layer K and V caches copied from the KV
//         buffer, and outputs 1..10 are copied back into it after execute();
//       - for steps 1..15, codes[step-1] is the argmax over N_VOCAB of the dot
//         product between output 0 and the matching row of heads.
//   * Response = codes (N_CB ints) followed by one float named ms holding the
//     elapsed time (presumably milliseconds; the duration's template arguments
//     are missing from this copy, so confirm the unit).
//   * The outer loop ends only when accept() fails; the buffers are then freed
//     and the socket is closed and unlinked.
//
// NOTE(review): the results of malloc, fopen, fread, socket, bind, chmod and
// listen are not checked. If heads_path or embs_path cannot be opened the
// buffers stay uninitialized and are still used. rcos / rsin are used without
// a null check (load_npy's failure behavior is not visible here).
// NOTE(review): when prepare_input_tensors or execute() fails, the step loop
// breaks but the partially filled codes and the timing are still written to
// the client, with no error indication in the reply.
// NOTE(review): step 16 runs the model, but no head consumes its hidden output
// (heads exist only for steps 1..15); confirm whether that last step is needed.
/** * TTS Code Predictor Runner — ExecuTorch .pte on NPU HTP. * Based on executor_runner.cpp but with socket IPC for the app. * Same protocol as the GGUF CP runner. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include DEFINE_string(model_path, "", "Path to .pte file"); DEFINE_string(sock_path, "/data/local/tmp/kazeia/cp_et.sock", "Socket path"); DEFINE_string(heads_path, "/data/local/tmp/kazeia/models/cp_heads.bin", "Heads file"); DEFINE_string(embs_path, "/data/local/tmp/kazeia/models/cp_codec_embs.bin", "Codec embs file"); DEFINE_string(cos_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_cos.npy", "Cos file"); DEFINE_string(sin_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_sin.npy", "Sin file"); using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::HierarchicalAllocator; using executorch::runtime::MemoryAllocator; using executorch::runtime::MemoryManager; using executorch::runtime::Method; using executorch::runtime::Program; using executorch::runtime::Result; using executorch::runtime::Span; static const int N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16, N_L=5; static bool read_exact(int fd,void*buf,size_t n){ size_t d=0;while(dmethod_meta("forward"); ET_CHECK_MSG(method_meta.ok(), "Failed to get method meta"); std::vector> planned_bufs; std::vector> planned_spans; size_t n_planned = method_meta->num_memory_planned_buffers(); for (size_t id = 0; id < n_planned; id++) { size_t sz = (size_t)method_meta->memory_planned_buffer_size(id).get(); planned_bufs.push_back(std::make_unique(sz)); planned_spans.push_back({planned_bufs.back().get(), sz}); } HierarchicalAllocator planned_memory({planned_spans.data(), planned_spans.size()}); MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator); // Load method auto method = 
program->load_method("forward", &memory_manager); ET_CHECK_MSG(method.ok(), "Failed to load method: 0x%x", (int)method.error()); auto meta = method->method_meta(); fprintf(stderr, "CP_ET: %zu inputs, %zu outputs\n", meta.num_inputs(), meta.num_outputs()); // Load heads, embeddings, rotary float* heads = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4); float* embs_data = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4); FILE* fh = fopen(FLAGS_heads_path.c_str(), "rb"); if (fh) { fread(heads, 4, N_CB*N_VOCAB*N_EMBD, fh); fclose(fh); } FILE* fe = fopen(FLAGS_embs_path.c_str(), "rb"); if (fe) { fread(embs_data, 4, N_CB*N_VOCAB*N_EMBD, fe); fclose(fe); } float* rcos = load_npy(FLAGS_cos_path.c_str(), 17*HD); float* rsin = load_npy(FLAGS_sin_path.c_str(), 17*HD); // Socket setup unlink(FLAGS_sock_path.c_str()); int srv = socket(AF_UNIX, SOCK_STREAM, 0); struct sockaddr_un addr = {}; addr.sun_family = AF_UNIX; strncpy(addr.sun_path, FLAGS_sock_path.c_str(), sizeof(addr.sun_path)-1); bind(srv, (struct sockaddr*)&addr, sizeof(addr)); chmod(FLAGS_sock_path.c_str(), 0666); listen(srv, 1); fprintf(stderr, "CP_ET READY on %s\n", FLAGS_sock_path.c_str()); while (true) { int cli = accept(srv, nullptr, nullptr); if (cli < 0) break; float input[2 * N_EMBD]; while (read_exact(cli, input, sizeof(input))) { auto t0 = std::chrono::high_resolution_clock::now(); float* hidden_in = input; float* cb0_emb = input + N_EMBD; int kv_elem = N_KV * KV_LEN * HD; std::vector kv(N_L * 2 * kv_elem, 0.0f); int codes[N_CB] = {}; float* emb = hidden_in; for (int step = 0; step < 17; step++) { if (step == 1) emb = cb0_emb; else if (step >= 2) emb = embs_data + ((step-2)*N_VOCAB + codes[step-2]) * N_EMBD; // Prepare input tensors (allocates buffers matching the method's expectations) auto prep = executorch::extension::prepare_input_tensors(method.get()); if (!prep.ok()) { fprintf(stderr, "prep fail %d\n", step); break; } // Copy our data into the prepared tensors // Input 0: emb [1,1,1024] 
memcpy(method->mutable_input(0).toTensor().mutable_data_ptr(), emb, N_EMBD*4); // Input 1: mask [1,1,1,16] float* mp = method->mutable_input(1).toTensor().mutable_data_ptr(); for (int p = 0; p < KV_LEN; p++) mp[p] = (p >= KV_LEN-1-step) ? 0.0f : -1e9f; // Input 2: cos [1,1,128] memcpy(method->mutable_input(2).toTensor().mutable_data_ptr(), rcos+step*HD, HD*4); // Input 3: sin [1,1,128] memcpy(method->mutable_input(3).toTensor().mutable_data_ptr(), rsin+step*HD, HD*4); // Inputs 4-13: KV caches [1,8,16,128] for (int l = 0; l < N_L; l++) { memcpy(method->mutable_input(4+l*2).toTensor().mutable_data_ptr(), kv.data()+(l*2)*kv_elem, kv_elem*4); memcpy(method->mutable_input(5+l*2).toTensor().mutable_data_ptr(), kv.data()+(l*2+1)*kv_elem, kv_elem*4); } auto status = method->execute(); if (status != Error::Ok) { fprintf(stderr, "exec fail step %d: %d\n", step, (int)status); break; } // Get hidden output const float* h = method->get_output(0).toTensor().const_data_ptr(); // Head argmax on CPU if (step >= 1 && step-1 < N_CB) { int cb = step - 1; const float* W = heads + cb * N_VOCAB * N_EMBD; int best = 0; float bv = -1e30f; for (int j = 0; j < N_VOCAB; j++) { float dot = 0; for (int k = 0; k < N_EMBD; k++) dot += h[k] * W[j*N_EMBD+k]; if (dot > bv) { bv = dot; best = j; } } codes[cb] = best; } // Update KV caches from outputs for (int l = 0; l < N_L; l++) { const float* ko = method->get_output(1+l*2).toTensor().const_data_ptr(); const float* vo = method->get_output(2+l*2).toTensor().const_data_ptr(); memcpy(kv.data()+(l*2)*kv_elem, ko, kv_elem*4); memcpy(kv.data()+(l*2+1)*kv_elem, vo, kv_elem*4); } } auto t1 = std::chrono::high_resolution_clock::now(); float ms = std::chrono::duration(t1-t0).count(); write_exact(cli, codes, sizeof(codes)); write_exact(cli, &ms, sizeof(ms)); } close(cli); } free(heads); free(embs_data); free(rcos); free(rsin); close(srv); unlink(FLAGS_sock_path.c_str()); return 0; }