Native C++ pipeline: RTF 1.4 (was 3.6 in Java)

Full talker+CP autoregressive loop in C++ via JNI. Talker 20ms/step, CP 44ms/step, total 6.6s for 4.64s audio. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 10:09:32 +02:00 · 2026-04-09 10:09:32 +02:00 · 393ce79eb5
parent fb6045a635
commit 393ce79eb5
6 changed files with 795 additions and 15 deletions
--- a/executorch-custom/cp_et_runner.cpp
+++ b/executorch-custom/cp_et_runner.cpp
@ -0,0 +1,222 @@
+/**
+ * TTS Code Predictor Runner — ExecuTorch .pte on NPU HTP.
+ * Based on executor_runner.cpp but with socket IPC for the app.
+ * Same protocol as the GGUF CP runner.
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <chrono>
+#include <memory>
+#include <vector>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+
+DEFINE_string(model_path, "", "Path to .pte file");
+DEFINE_string(sock_path, "/data/local/tmp/kazeia/cp_et.sock", "Socket path");
+DEFINE_int32(tcp_port, 8790, "TCP port (0=disabled, use unix socket)");
+DEFINE_string(heads_path, "/data/local/tmp/kazeia/models/cp_heads.bin", "Heads file");
+DEFINE_string(embs_path, "/data/local/tmp/kazeia/models/cp_codec_embs.bin", "Codec embs file");
+DEFINE_string(cos_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_cos.npy", "Cos file");
+DEFINE_string(sin_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_sin.npy", "Sin file");
+
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::Method;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+
+static const int N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16, N_L=5;
+
+static bool read_exact(int fd,void*buf,size_t n){
+    size_t d=0;while(d<n){ssize_t r=read(fd,(char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
+}
+static bool write_exact(int fd,const void*buf,size_t n){
+    size_t d=0;while(d<n){ssize_t r=write(fd,(const char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
+}
+
+static float* load_npy(const char*p,int n){
+    FILE*f=fopen(p,"rb");if(!f)return nullptr;
+    unsigned char h[10];fread(h,1,10,f);
+    int hl=h[8]|(h[9]<<8);fseek(f,10+hl,SEEK_SET);
+    float*d=(float*)malloc(n*4);fread(d,4,n,f);fclose(f);return d;
+}
+
+static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB
+static uint8_t temp_allocator_pool[1024U * 1024U]; // 1MB
+
+int main(int argc, char** argv) {
+    executorch::runtime::runtime_init();
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+    if (FLAGS_model_path.empty()) {
+        fprintf(stderr, "Usage: cp_et_runner --model_path=model.pte\n");
+        return 1;
+    }
+
+    // Load program
+    auto loader = executorch::extension::FileDataLoader::from(FLAGS_model_path.c_str());
+    ET_CHECK_MSG(loader.ok(), "Failed to load %s", FLAGS_model_path.c_str());
+
+    auto program = Program::load(&loader.get());
+    ET_CHECK_MSG(program.ok(), "Failed to parse program");
+
+    // Setup memory — allocate planned buffers from program metadata
+    MemoryAllocator method_allocator(sizeof(method_allocator_pool), method_allocator_pool);
+    auto temp_allocator = MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool);
+
+    auto method_meta = program->method_meta("forward");
+    ET_CHECK_MSG(method_meta.ok(), "Failed to get method meta");
+
+    std::vector<std::unique_ptr<uint8_t[]>> planned_bufs;
+    std::vector<Span<uint8_t>> planned_spans;
+    size_t n_planned = method_meta->num_memory_planned_buffers();
+    for (size_t id = 0; id < n_planned; id++) {
+        size_t sz = (size_t)method_meta->memory_planned_buffer_size(id).get();
+        planned_bufs.push_back(std::make_unique<uint8_t[]>(sz));
+        planned_spans.push_back({planned_bufs.back().get(), sz});
+    }
+    HierarchicalAllocator planned_memory({planned_spans.data(), planned_spans.size()});
+    MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
+
+    // Load method
+    auto method = program->load_method("forward", &memory_manager);
+    ET_CHECK_MSG(method.ok(), "Failed to load method: 0x%x", (int)method.error());
+
+    auto meta = method->method_meta();
+    fprintf(stderr, "CP_ET: %zu inputs, %zu outputs\n", meta.num_inputs(), meta.num_outputs());
+
+    // Load heads, embeddings, rotary
+    float* heads = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
+    float* embs_data = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
+    FILE* fh = fopen(FLAGS_heads_path.c_str(), "rb");
+    if (fh) { fread(heads, 4, N_CB*N_VOCAB*N_EMBD, fh); fclose(fh); }
+    FILE* fe = fopen(FLAGS_embs_path.c_str(), "rb");
+    if (fe) { fread(embs_data, 4, N_CB*N_VOCAB*N_EMBD, fe); fclose(fe); }
+    float* rcos = load_npy(FLAGS_cos_path.c_str(), 17*HD);
+    float* rsin = load_npy(FLAGS_sin_path.c_str(), 17*HD);
+
+    // Socket setup — TCP if tcp_port > 0, else Unix domain socket
+    int srv;
+    if (FLAGS_tcp_port > 0) {
+        srv = socket(AF_INET, SOCK_STREAM, 0);
+        int opt = 1; setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+        struct sockaddr_in taddr = {}; taddr.sin_family = AF_INET;
+        taddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+        taddr.sin_port = htons(FLAGS_tcp_port);
+        bind(srv, (struct sockaddr*)&taddr, sizeof(taddr));
+        listen(srv, 2);
+        fprintf(stderr, "CP_ET READY on tcp://127.0.0.1:%d\n", FLAGS_tcp_port);
+    } else {
+        unlink(FLAGS_sock_path.c_str());
+        srv = socket(AF_UNIX, SOCK_STREAM, 0);
+        struct sockaddr_un addr = {}; addr.sun_family = AF_UNIX;
+        strncpy(addr.sun_path, FLAGS_sock_path.c_str(), sizeof(addr.sun_path)-1);
+        bind(srv, (struct sockaddr*)&addr, sizeof(addr));
+        chmod(FLAGS_sock_path.c_str(), 0666);
+        listen(srv, 1);
+        fprintf(stderr, "CP_ET READY on %s\n", FLAGS_sock_path.c_str());
+    }
+
+    while (true) {
+        int cli = accept(srv, nullptr, nullptr);
+        if (cli < 0) break;
+
+        float input[2 * N_EMBD];
+        while (read_exact(cli, input, sizeof(input))) {
+            auto t0 = std::chrono::high_resolution_clock::now();
+            float* hidden_in = input;
+            float* cb0_emb = input + N_EMBD;
+            int kv_elem = N_KV * KV_LEN * HD;
+            std::vector<float> kv(N_L * 2 * kv_elem, 0.0f);
+            int codes[N_CB] = {};
+            float* emb = hidden_in;
+
+            for (int step = 0; step < 17; step++) {
+                if (step == 1) emb = cb0_emb;
+                else if (step >= 2) emb = embs_data + ((step-2)*N_VOCAB + codes[step-2]) * N_EMBD;
+
+                // Prepare input tensors (allocates buffers matching the method's expectations)
+                auto prep = executorch::extension::prepare_input_tensors(method.get());
+                if (!prep.ok()) { fprintf(stderr, "prep fail %d\n", step); break; }
+
+                // Copy our data into the prepared tensors
+                // Input 0: emb [1,1,1024]
+                memcpy(method->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, N_EMBD*4);
+                // Input 1: mask [1,1,1,16]
+                float* mp = method->mutable_input(1).toTensor().mutable_data_ptr<float>();
+                for (int p = 0; p < KV_LEN; p++) mp[p] = (p >= KV_LEN-1-step) ? 0.0f : -1e9f;
+                // Input 2: cos [1,1,128]
+                memcpy(method->mutable_input(2).toTensor().mutable_data_ptr<float>(), rcos+step*HD, HD*4);
+                // Input 3: sin [1,1,128]
+                memcpy(method->mutable_input(3).toTensor().mutable_data_ptr<float>(), rsin+step*HD, HD*4);
+                // Inputs 4-13: KV caches [1,8,16,128]
+                for (int l = 0; l < N_L; l++) {
+                    memcpy(method->mutable_input(4+l*2).toTensor().mutable_data_ptr<float>(),
+                           kv.data()+(l*2)*kv_elem, kv_elem*4);
+                    memcpy(method->mutable_input(5+l*2).toTensor().mutable_data_ptr<float>(),
+                           kv.data()+(l*2+1)*kv_elem, kv_elem*4);
+                }
+
+                auto status = method->execute();
+                if (status != Error::Ok) {
+                    fprintf(stderr, "exec fail step %d: %d\n", step, (int)status);
+                    break;
+                }
+
+                // Get hidden output
+                const float* h = method->get_output(0).toTensor().const_data_ptr<float>();
+
+                // Head argmax on CPU
+                if (step >= 1 && step-1 < N_CB) {
+                    int cb = step - 1;
+                    const float* W = heads + cb * N_VOCAB * N_EMBD;
+                    int best = 0; float bv = -1e30f;
+                    for (int j = 0; j < N_VOCAB; j++) {
+                        float dot = 0;
+                        for (int k = 0; k < N_EMBD; k++) dot += h[k] * W[j*N_EMBD+k];
+                        if (dot > bv) { bv = dot; best = j; }
+                    }
+                    codes[cb] = best;
+                }
+
+                // Update KV caches from outputs
+                for (int l = 0; l < N_L; l++) {
+                    const float* ko = method->get_output(1+l*2).toTensor().const_data_ptr<float>();
+                    const float* vo = method->get_output(2+l*2).toTensor().const_data_ptr<float>();
+                    memcpy(kv.data()+(l*2)*kv_elem, ko, kv_elem*4);
+                    memcpy(kv.data()+(l*2+1)*kv_elem, vo, kv_elem*4);
+                }
+            }
+
+            auto t1 = std::chrono::high_resolution_clock::now();
+            float ms = std::chrono::duration<float, std::milli>(t1-t0).count();
+            write_exact(cli, codes, sizeof(codes));
+            write_exact(cli, &ms, sizeof(ms));
+        }
+        close(cli);
+    }
+
+    free(heads); free(embs_data); free(rcos); free(rsin);
+    close(srv); unlink(FLAGS_sock_path.c_str());
+    return 0;
+}
--- a/executorch-custom/cp_et_test_client.cpp
+++ b/executorch-custom/cp_et_test_client.cpp
@ -0,0 +1,122 @@
+/**
+ * CP ET Test Client — reads batch input file, sends to cp_et_runner socket,
+ * collects output codes. Runs ON DEVICE as root to avoid adb forward issues.
+ *
+ * Usage: cp_et_test_client --input=/path/input.bin --output=/path/output.bin
+ *        --sock_path=/data/local/tmp/kazeia/cp_et.sock
+ *
+ * Input format:  int32 n_frames, then per frame: float32[1024] hidden + float32[1024] cb0_emb
+ * Output format: int32 n_frames, then per frame: int32[15] codes + float32 timing_ms
+ */
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+static bool read_exact(int fd, void* buf, size_t n) {
+    size_t d = 0;
+    while (d < n) {
+        ssize_t r = read(fd, (char*)buf + d, n - d);
+        if (r <= 0) return false;
+        d += r;
+    }
+    return true;
+}
+
+static bool write_exact(int fd, const void* buf, size_t n) {
+    size_t d = 0;
+    while (d < n) {
+        ssize_t r = write(fd, (const char*)buf + d, n - d);
+        if (r <= 0) return false;
+        d += r;
+    }
+    return true;
+}
+
+int main(int argc, char** argv) {
+    const char* input_path = nullptr;
+    const char* output_path = nullptr;
+    const char* sock_path = "/data/local/tmp/kazeia/cp_et.sock";
+
+    for (int i = 1; i < argc; i++) {
+        if (strncmp(argv[i], "--input=", 8) == 0) input_path = argv[i] + 8;
+        else if (strncmp(argv[i], "--output=", 9) == 0) output_path = argv[i] + 9;
+        else if (strncmp(argv[i], "--sock_path=", 12) == 0) sock_path = argv[i] + 12;
+    }
+
+    if (!input_path || !output_path) {
+        fprintf(stderr, "Usage: %s --input=in.bin --output=out.bin [--sock_path=...]\n", argv[0]);
+        return 1;
+    }
+
+    // Read input file
+    FILE* fin = fopen(input_path, "rb");
+    if (!fin) { fprintf(stderr, "Cannot open %s\n", input_path); return 1; }
+
+    int32_t n_frames;
+    fread(&n_frames, 4, 1, fin);
+    fprintf(stderr, "Frames: %d\n", n_frames);
+
+    const int N_EMBD = 1024;
+    float* inputs = (float*)malloc(n_frames * 2 * N_EMBD * sizeof(float));
+    fread(inputs, sizeof(float), n_frames * 2 * N_EMBD, fin);
+    fclose(fin);
+
+    // Connect to socket
+    int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sock < 0) { perror("socket"); return 1; }
+
+    struct sockaddr_un addr = {};
+    addr.sun_family = AF_UNIX;
+    strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1);
+
+    if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+        perror("connect");
+        return 1;
+    }
+    fprintf(stderr, "Connected to %s\n", sock_path);
+
+    // Process frames
+    FILE* fout = fopen(output_path, "wb");
+    fwrite(&n_frames, 4, 1, fout);
+
+    float total_ms = 0;
+    for (int i = 0; i < n_frames; i++) {
+        float* frame = inputs + i * 2 * N_EMBD;
+
+        // Send 8192 bytes
+        if (!write_exact(sock, frame, 2 * N_EMBD * sizeof(float))) {
+            fprintf(stderr, "Write failed at frame %d\n", i);
+            break;
+        }
+
+        // Read 64 bytes: 15 ints + 1 float
+        int32_t codes[15];
+        float timing;
+        if (!read_exact(sock, codes, sizeof(codes))) {
+            fprintf(stderr, "Read codes failed at frame %d\n", i);
+            break;
+        }
+        if (!read_exact(sock, &timing, sizeof(timing))) {
+            fprintf(stderr, "Read timing failed at frame %d\n", i);
+            break;
+        }
+
+        fwrite(codes, sizeof(int32_t), 15, fout);
+        fwrite(&timing, sizeof(float), 1, fout);
+        total_ms += timing;
+
+        fprintf(stderr, "  Frame %d: %.1fms  codes=[%d,%d,%d,...]\n",
+                i, timing, codes[0], codes[1], codes[2]);
+    }
+
+    fclose(fout);
+    close(sock);
+    free(inputs);
+
+    fprintf(stderr, "Done! Total: %.0fms (%.1fms/frame)\n", total_ms, total_ms / n_frames);
+    return 0;
+}
--- a/executorch-custom/tts_pipeline_jni.cpp
+++ b/executorch-custom/tts_pipeline_jni.cpp
@ -0,0 +1,367 @@
+/**
+ * Native TTS pipeline: talker + CP autoregressive loop in C++.
+ * One JNI call runs the entire generation → returns all codebook codes.
+ */
+#include <jni.h>
+#include <arm_neon.h>
+#include <android/log.h>
+#include <cstring>
+#include <cstdlib>
+#include <cfloat>
+#include <cmath>
+#include <chrono>
+#include <memory>
+#include <vector>
+#include <unordered_set>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#define TAG "TtsPipeline"
+#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
+
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::Method;
+using executorch::runtime::Program;
+using executorch::runtime::Span;
+
+static const int DIM=1024, VOCAB=3072, CB_SIZE=2048, NUM_CB=16;
+static const int T_L=28, T_KV=8, T_HD=128, T_KV_LEN=100;
+static const int C_L=5, C_KV=8, C_HD=128, C_KV_LEN=16;
+static const int CODEC_EOS=2150;
+
+static inline float dot_neon(const float* a, const float* b, int n) {
+    float32x4_t s0=vdupq_n_f32(0),s1=vdupq_n_f32(0),s2=vdupq_n_f32(0),s3=vdupq_n_f32(0);
+    int i=0;
+    for(;i+15<n;i+=16){
+        s0=vfmaq_f32(s0,vld1q_f32(a+i),vld1q_f32(b+i));
+        s1=vfmaq_f32(s1,vld1q_f32(a+i+4),vld1q_f32(b+i+4));
+        s2=vfmaq_f32(s2,vld1q_f32(a+i+8),vld1q_f32(b+i+8));
+        s3=vfmaq_f32(s3,vld1q_f32(a+i+12),vld1q_f32(b+i+12));
+    }
+    float r=vaddvq_f32(vaddq_f32(vaddq_f32(s0,s1),vaddq_f32(s2,s3)));
+    for(;i<n;i++) r+=a[i]*b[i];
+    return r;
+}
+
+static int argmax_head(const float*h,const float*W,int vocab,int dim){
+    int best=0;float bv=-FLT_MAX;
+    for(int j=0;j<vocab;j++){float d=dot_neon(h,W+j*dim,dim);if(d>bv){bv=d;best=j;}}
+    return best;
+}
+
+// Top-k sampling with temperature
+static int sample_topk(const float* logits, int vocab, float temp, int k) {
+    // Find top-k
+    struct IV { int i; float v; };
+    std::vector<IV> topk(k, {0, -FLT_MAX});
+    for (int i = 0; i < vocab; i++) {
+        if (logits[i] > topk[k-1].v) {
+            topk[k-1] = {i, logits[i]};
+            // Bubble up
+            for (int j = k-2; j >= 0; j--) {
+                if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
+                else break;
+            }
+        }
+    }
+    // Softmax with temperature
+    float maxv = topk[0].v;
+    float sum = 0;
+    for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
+    // Sample
+    float r = (float)rand() / RAND_MAX * sum;
+    float acc = 0;
+    for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
+    return topk[0].i;
+}
+
+struct PipelineState {
+    std::unique_ptr<executorch::extension::FileDataLoader> tLoader, cLoader;
+    std::unique_ptr<Program> tProg, cProg;
+    std::unique_ptr<MemoryManager> tMM, cMM;
+    Method* talker = nullptr;
+    Method* cp = nullptr;
+    std::vector<std::unique_ptr<uint8_t[]>> tBufs, cBufs;
+    bool loaded = false;
+};
+static PipelineState* gState = nullptr;
+static uint8_t tMethodPool[8*1024*1024], tTempPool[2*1024*1024];
+static uint8_t cMethodPool[4*1024*1024], cTempPool[1*1024*1024];
+
+static Method* loadModel(const char* path,
+    std::unique_ptr<executorch::extension::FileDataLoader>& loader,
+    std::unique_ptr<Program>& program,
+    std::unique_ptr<MemoryManager>& mm,
+    std::vector<std::unique_ptr<uint8_t[]>>& bufs,
+    uint8_t* mp, size_t mps, uint8_t* tp, size_t tps)
+{
+    auto ld = executorch::extension::FileDataLoader::from(path);
+    if(!ld.ok()) return nullptr;
+    loader=std::make_unique<executorch::extension::FileDataLoader>(std::move(ld.get()));
+    auto prog=Program::load(&*loader); if(!prog.ok()) return nullptr;
+    program=std::make_unique<Program>(std::move(prog.get()));
+    auto meta=program->method_meta("forward"); if(!meta.ok()) return nullptr;
+    std::vector<Span<uint8_t>> spans;
+    for(size_t i=0;i<meta->num_memory_planned_buffers();i++){
+        size_t sz=(size_t)meta->memory_planned_buffer_size(i).get();
+        bufs.push_back(std::make_unique<uint8_t[]>(sz));
+        spans.push_back({bufs.back().get(),sz});
+    }
+    auto*ma=new MemoryAllocator(mps,mp);
+    auto*ta=new MemoryAllocator(tps,tp);
+    auto*ha=new HierarchicalAllocator({spans.data(),spans.size()});
+    mm=std::unique_ptr<MemoryManager>(new MemoryManager(ma,ha,ta));
+    auto method=program->load_method("forward",mm.get());
+    if(!method.ok()) return nullptr;
+    return new Method(std::move(method.get()));
+}
+
+extern "C" {
+
+JNIEXPORT jboolean JNICALL
+Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring jCP){
+    executorch::runtime::runtime_init();
+    if(gState&&gState->loaded) return JNI_TRUE;
+    const char*tp=env->GetStringUTFChars(jTP,nullptr);
+    const char*cp=env->GetStringUTFChars(jCP,nullptr);
+    gState=new PipelineState();
+    LOGI("Loading talker+CP...");
+    auto t0=std::chrono::high_resolution_clock::now();
+    gState->talker=loadModel(tp,gState->tLoader,gState->tProg,gState->tMM,gState->tBufs,tMethodPool,sizeof(tMethodPool),tTempPool,sizeof(tTempPool));
+    gState->cp=loadModel(cp,gState->cLoader,gState->cProg,gState->cMM,gState->cBufs,cMethodPool,sizeof(cMethodPool),cTempPool,sizeof(cTempPool));
+    env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
+    if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
+    gState->loaded=true;
+    // Warmup both
+    {auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
+    {auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
+    auto t1=std::chrono::high_resolution_clock::now();
+    LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
+    return JNI_TRUE;
+}
+
+JNIEXPORT void JNICALL
+Java_com_kazeia_tts_TtsPipeline_nativeDestroy(JNIEnv*,jclass){
+    if(gState){delete gState->talker;delete gState->cp;delete gState;gState=nullptr;}
+}
+
+// Helper: run one talker step
+static void talkerStep(Method&m, const float*emb, float*mask, int pos,
+    const float*tCos, const float*tSin, float*tK, float*tV,
+    float*outHidden, float*outLogits)
+{
+    int kvElem = T_KV * T_KV_LEN * T_HD;
+    auto prep = executorch::extension::prepare_input_tensors(m);
+    memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
+    memcpy(m.mutable_input(1).toTensor().mutable_data_ptr<float>(), mask, T_KV_LEN*4);
+    int pi = std::min(pos, 249);
+    memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), tCos+pi*T_HD, T_HD*4);
+    memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), tSin+pi*T_HD, T_HD*4);
+    for(int i=0;i<T_L;i++){
+        memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), tK+i*kvElem, kvElem*4);
+        memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), tV+i*kvElem, kvElem*4);
+    }
+    m.execute();
+    memcpy(outHidden, m.get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
+    memcpy(outLogits, m.get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
+    for(int i=0;i<T_L;i++){
+        memcpy(tK+i*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
+        memcpy(tV+i*kvElem, m.get_output(3+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
+    }
+}
+
+// Helper: run full CP (17 steps) → 15 codes
+static void cpForward(Method&m, const float*hidden, int cb0,
+    const float*codecEmb, const float*cpEmbs, const float*cpHeads,
+    const float*cCos, const float*cSin, int*codes)
+{
+    int kvElem = C_KV * C_KV_LEN * C_HD;
+    std::vector<float> kv(C_L*2*kvElem, 0.0f);
+
+    for(int step=0;step<17;step++){
+        const float*emb;
+        if(step==0) emb=hidden;
+        else if(step==1) emb=codecEmb + std::min(std::max(cb0,0),VOCAB-1)*DIM;
+        else emb=cpEmbs + ((step-2)*CB_SIZE + std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM;
+
+        auto prep=executorch::extension::prepare_input_tensors(m);
+        memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
+        // Mask
+        float*mp=m.mutable_input(1).toTensor().mutable_data_ptr<float>();
+        for(int p=0;p<C_KV_LEN;p++) mp[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
+        memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), cCos+step*C_HD, C_HD*4);
+        memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), cSin+step*C_HD, C_HD*4);
+        for(int i=0;i<C_L;i++){
+            memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2)*kvElem, kvElem*4);
+            memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2+1)*kvElem, kvElem*4);
+        }
+        m.execute();
+        const float*h=m.get_output(0).toTensor().const_data_ptr<float>();
+        if(step>=1&&step-1<15){
+            codes[step-1]=argmax_head(h, cpHeads+(step-1)*CB_SIZE*DIM, CB_SIZE, DIM);
+        }
+        for(int i=0;i<C_L;i++){
+            memcpy(kv.data()+(i*2)*kvElem, m.get_output(1+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
+            memcpy(kv.data()+(i*2+1)*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
+        }
+    }
+}
+
+JNIEXPORT jintArray JNICALL
+Java_com_kazeia_tts_TtsPipeline_nativeRun(
+    JNIEnv*env,jclass,
+    jfloatArray jPrefill,jint nPrefill,
+    jfloatArray jTrailing,jint nTrailing,
+    jfloatArray jCodecEmb, jfloatArray jCpEmbs, jfloatArray jCpHeads,
+    jfloatArray jTCos,jfloatArray jTSin, jfloatArray jCCos,jfloatArray jCSin,
+    jint maxTokens)
+{
+    if(!gState||!gState->loaded) return nullptr;
+    auto T0=std::chrono::high_resolution_clock::now();
+
+    // Copy all data from JNI (then release immediately)
+    int prefillSize = env->GetArrayLength(jPrefill);
+    std::vector<float> prefill(prefillSize);
+    env->GetFloatArrayRegion(jPrefill, 0, prefillSize, prefill.data());
+
+    std::vector<float> trailingData;
+    if(nTrailing>0){trailingData.resize(nTrailing*DIM);env->GetFloatArrayRegion(jTrailing,0,nTrailing*DIM,trailingData.data());}
+
+    int codecSize=env->GetArrayLength(jCodecEmb);
+    std::vector<float> codecEmb(codecSize);
+    env->GetFloatArrayRegion(jCodecEmb,0,codecSize,codecEmb.data());
+
+    int cpEmbsSize=env->GetArrayLength(jCpEmbs);
+    std::vector<float> cpEmbs(cpEmbsSize);
+    env->GetFloatArrayRegion(jCpEmbs,0,cpEmbsSize,cpEmbs.data());
+
+    int headsSize=env->GetArrayLength(jCpHeads);
+    std::vector<float> cpHeads(headsSize);
+    env->GetFloatArrayRegion(jCpHeads,0,headsSize,cpHeads.data());
+
+    int tcSize=env->GetArrayLength(jTCos);
+    std::vector<float> tCos(tcSize),tSin(tcSize);
+    env->GetFloatArrayRegion(jTCos,0,tcSize,tCos.data());
+    env->GetFloatArrayRegion(jTSin,0,tcSize,tSin.data());
+
+    int ccSize=env->GetArrayLength(jCCos);
+    std::vector<float> cCos(ccSize),cSin(ccSize);
+    env->GetFloatArrayRegion(jCCos,0,ccSize,cCos.data());
+    env->GetFloatArrayRegion(jCSin,0,ccSize,cSin.data());
+
+    // Pipeline state
+    int tkvElem=T_KV*T_KV_LEN*T_HD;
+    std::vector<float> tK(T_L*tkvElem,0), tV(T_L*tkvElem,0);
+    float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
+    float hidden[DIM]={}, logits[VOCAB]={};
+
+    std::vector<int> allCodes; // flat: numTokens × 16
+    std::vector<int> cb0History;
+    int pos=0, currentCb0=-1;
+
+    // ===== PREFILL =====
+    auto tP0=std::chrono::high_resolution_clock::now();
+    for(int step=0;step<nPrefill;step++){
+        int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
+        if(mi>=0) mask[mi]=0.0f;
+        talkerStep(*gState->talker, prefill.data()+step*DIM, mask, pos, tCos.data(),tSin.data(),
+                   tK.data(), tV.data(), hidden, logits);
+        pos++;
+        if(step==nPrefill-1){
+            for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
+            currentCb0=sample_topk(logits,VOCAB,0.9f,50);
+        }
+    }
+    auto tP1=std::chrono::high_resolution_clock::now();
+    LOGI("Prefill: %.0fms, %d steps, cb0=%d",
+         std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
+
+    if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
+
+    // ===== GENERATION =====
+    float totalTalkerMs=0, totalCpMs=0;
+    int trailingIdx=0;
+    // Pad embedding (zeros + pad token is not available here, use zeros)
+    float padEmb[DIM]={}; // In practice should be tts_pad_embed, passed as param
+
+    for(int gen=0;gen<maxTokens;gen++){
+        int codes[NUM_CB]={}; codes[0]=currentCb0;
+
+        // CP
+        auto tc0=std::chrono::high_resolution_clock::now();
+        int cpCodes[15]={};
+        cpForward(*gState->cp, hidden, currentCb0, codecEmb.data(), cpEmbs.data(), cpHeads.data(),
+                  cCos.data(), cSin.data(), cpCodes);
+        auto tc1=std::chrono::high_resolution_clock::now();
+        totalCpMs+=std::chrono::duration<float,std::milli>(tc1-tc0).count();
+        for(int i=0;i<15;i++) codes[i+1]=cpCodes[i];
+
+        for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
+        cb0History.push_back(currentCb0);
+
+        // Build next talker input: sum codec embeddings
+        float nextEmb[DIM]={};
+        // cb0 embedding
+        const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
+        for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
+        // cb1-15 embeddings
+        for(int cb=0;cb<15;cb++){
+            const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
+            for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
+        }
+        // Add trailing text or pad
+        if(trailingIdx<nTrailing){
+            const float*te=trailingData.data()+trailingIdx*DIM;
+            for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
+            trailingIdx++;
+        }
+        // (pad embedding = zeros, already added implicitly)
+
+        // Talker step
+        int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
+        if(mi>=0) mask[mi]=0.0f;
+        auto tt0=std::chrono::high_resolution_clock::now();
+        talkerStep(*gState->talker, nextEmb, mask, pos, tCos.data(),tSin.data(),
+                   tK.data(), tV.data(), hidden, logits);
+        auto tt1=std::chrono::high_resolution_clock::now();
+        totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
+        pos++;
+
+        // Sample next cb0 (suppress non-codec, repetition penalty)
+        for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
+        std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
+        for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
+        int nextCb0=sample_topk(logits,VOCAB,0.9f,50);
+
+        if(nextCb0==CODEC_EOS){LOGI("EOS at step %d",gen+2);break;}
+        // Degeneration check
+        int histSz=(int)cb0History.size();
+        if(histSz>=9){
+            bool degen=true;
+            for(int i=histSz-9;i<histSz;i++) if(cb0History[i]!=nextCb0){degen=false;break;}
+            if(degen){LOGI("Degeneration at step %d",gen+2);break;}
+        }
+        currentCb0=nextCb0;
+    }
+
+    int nTokens=(int)allCodes.size()/NUM_CB;
+    auto T1=std::chrono::high_resolution_clock::now();
+    LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
+         nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),
+         totalCpMs, totalCpMs/std::max(nTokens,1),
+         std::chrono::duration<float,std::milli>(T1-T0).count());
+
+    jintArray result=env->NewIntArray((int)allCodes.size());
+    env->SetIntArrayRegion(result,0,(int)allCodes.size(),allCodes.data());
+    return result;
+}
+
+} // extern "C"
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@ -130,6 +130,7 @@ class Qwen3TtsEngine(
    private var cpAllHeads: FloatArray? = null        // all 15 heads concatenated [15*2048*1024] for batch NEON
    private var cpPteModule: org.pytorch.executorch.Module? = null  // ExecuTorch CP on NPU (JNI)
    private var talkerPteModule: org.pytorch.executorch.Module? = null  // ExecuTorch talker on NPU (JNI)
+    private var nativePipelineReady: Boolean = false  // C++ native pipeline available
    private var talkerPteRotaryCos: FloatArray? = null
    private var talkerPteRotarySin: FloatArray? = null
    private var useEtCp: Boolean = false  // CP via ExecuTorch runner process (root)
@ -307,6 +308,19 @@ class Qwen3TtsEngine(
                                    cpPteModule!!.forward(*cIns.toTypedArray())
                                    nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
                                } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
+
+                                // Init native C++ pipeline (loads models with own ExecuTorch runtime)
+                                try {
+                                    val tn = System.currentTimeMillis()
+                                    nativePipelineReady = TtsPipeline.nativeInit(
+                                        "/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
+                                        "/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
+                                    )
+                                    nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
+                                } catch (e: Exception) {
+                                    nlog("Native pipeline init failed: ${e.message}")
+                                    nativePipelineReady = false
+                                }
                            }
                        } catch (e: Exception) {
                            nlog("Talker .pte JNI failed: ${e.message}")
@ -2272,23 +2286,47 @@ class Qwen3TtsEngine(
        val embeds = Array(nTotal) { FloatArray(TALKER_DIM).also { arr -> for (j in 0 until TALKER_DIM) arr[j] = bb.float } }
        nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")

-        // Build textEmbedsList: first nPrefill are "prefill", rest are trailing
-        val textEmbedsList = embeds.toList()
-        val allCodesArray = runInterleavedPte(textEmbedsList.subList(0, 1), maxGenTokens = nTotal - nPrefill)
-        // Note: runInterleavedPte handles prefill internally via buildPrefillEmbeddings
+        val allCodes: Array<IntArray>
+        if (nativePipelineReady) {
+            // Native C++ pipeline — zero Java overhead
+            val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
+            for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
+            val nTrailing = nTotal - nPrefill
+            val trailingFlat = if (nTrailing > 0) FloatArray(nTrailing * TALKER_DIM).also { arr ->
+                for (i in 0 until nTrailing) System.arraycopy(embeds[nPrefill + i], 0, arr, i * TALKER_DIM, TALKER_DIM)
+            } else null

-        // Actually, generateFromEmbeds uses pre-computed embeds directly.
-        // Let's use runInterleavedPte properly by passing all embeds as textEmbedsList
-        // runInterleavedPte's buildPrefillEmbeddings will create the prefill from first embed
-        // But we need a different approach: pass all embeds directly.
+            // Load CP heads if not already
+            if (cpAllHeads == null) {
+                val headsFile = java.io.File("/data/local/tmp/kazeia/models/cp_heads.bin")
+                if (headsFile.exists()) {
+                    val hb = headsFile.readBytes()
+                    cpAllHeads = FloatArray(hb.size / 4)
+                    ByteBuffer.wrap(hb).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(cpAllHeads!!)
+                }
+            }

-        // Simpler: use the run_pipeline code path which goes through generateSpeech → runInterleavedGeneration
-        // For now, let's just call runInterleavedPte with the right embeds structure
-        val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList()
-        val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList()
-
-        // Call the PTE pipeline directly
-        val allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
+            nlog("Running native C++ pipeline...")
+            val flat = TtsPipeline.nativeRun(
+                prefillFlat, nPrefill,
+                trailingFlat, nTrailing,
+                codecEmbedding ?: FloatArray(0),
+                cpEmbeddings ?: FloatArray(0),
+                cpAllHeads ?: FloatArray(0),
+                talkerPteRotaryCos ?: FloatArray(0), talkerPteRotarySin ?: FloatArray(0),
+                cpRotaryCos ?: FloatArray(0), cpRotarySin ?: FloatArray(0),
+                nTotal - nPrefill
+            )
+            if (flat == null || flat.isEmpty()) return ShortArray(0)
+            val nTokens = flat.size / NUM_CODEBOOKS
+            allCodes = Array(nTokens) { t -> IntArray(NUM_CODEBOOKS) { cb -> flat[t * NUM_CODEBOOKS + cb] } }
+            nlog("Native pipeline: $nTokens tokens")
+        } else {
+            // Fallback: Java pipeline
+            val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList()
+            val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList()
+            allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
+        }

        if (allCodes.isEmpty()) return ShortArray(0)
        val numRealTokens = allCodes.size
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/TtsPipeline.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/TtsPipeline.kt
@ -0,0 +1,27 @@
+package com.kazeia.tts
+
+/** Native C++ TTS pipeline: talker + CP loop with zero Java overhead. */
+object TtsPipeline {
+    init { System.loadLibrary("tts_pipeline") }
+
+    /** Load and warmup both .pte models. Returns true on success. */
+    external fun nativeInit(talkerPath: String, cpPath: String): Boolean
+
+    /** Release native resources. */
+    external fun nativeDestroy()
+
+    /**
+     * Run full pipeline. Returns flat int array: [numTokens × 16] codebook codes.
+     * Each group of 16 ints = [CB0, CB1, ..., CB15] for one time step.
+     */
+    external fun nativeRun(
+        prefillEmbeds: FloatArray, nPrefill: Int,
+        trailingEmbeds: FloatArray?, nTrailing: Int,
+        codecEmbedding: FloatArray,
+        cpEmbeddings: FloatArray,
+        cpHeads: FloatArray,
+        talkerCos: FloatArray, talkerSin: FloatArray,
+        cpCos: FloatArray, cpSin: FloatArray,
+        maxTokens: Int
+    ): IntArray?
+}
--- a/kazeia-android/app/src/main/jni/CMakeLists.txt
+++ b/kazeia-android/app/src/main/jni/CMakeLists.txt
@ -37,6 +37,10 @@ target_include_directories(whisper_jni PRIVATE
 target_link_libraries(whisper_jni whisper ggml ggml-base ggml-cpu android log)
 target_compile_options(whisper_jni PRIVATE -std=c++17 -O2)

+# --- TTS Pipeline: built externally via ExecuTorch cmake, copied to jniLibs ---
+# Build with: cd /opt/Kazeia/executorch/build-android && cmake --build . --target tts_pipeline_jni -j$(nproc)
+# Then copy to jniLibs/arm64-v8a/libtts_pipeline.so
+
 # --- NEON optimized ops for TTS heads ---
 add_library(neon_ops SHARED neon_ops.cpp)
 target_link_libraries(neon_ops log)