Native C++ pipeline: RTF 1.4 (was 3.6 in Java)

Full talker+CP autoregressive loop in C++ via JNI.
Talker 20ms/step, CP 44ms/step, total 6.6s for 4.64s audio.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-09 10:09:32 +02:00
parent fb6045a635
commit 393ce79eb5
6 changed files with 795 additions and 15 deletions

View File

@ -0,0 +1,222 @@
/**
* TTS Code Predictor Runner ExecuTorch .pte on NPU HTP.
* Based on executor_runner.cpp but with socket IPC for the app.
* Same protocol as the GGUF CP runner.
*/
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <chrono>
#include <memory>
#include <vector>
#include <gflags/gflags.h>
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/runner_util/inputs.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/runtime.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
DEFINE_string(model_path, "", "Path to .pte file");
DEFINE_string(sock_path, "/data/local/tmp/kazeia/cp_et.sock", "Socket path");
DEFINE_int32(tcp_port, 8790, "TCP port (0=disabled, use unix socket)");
DEFINE_string(heads_path, "/data/local/tmp/kazeia/models/cp_heads.bin", "Heads file");
DEFINE_string(embs_path, "/data/local/tmp/kazeia/models/cp_codec_embs.bin", "Codec embs file");
DEFINE_string(cos_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_cos.npy", "Cos file");
DEFINE_string(sin_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_sin.npy", "Sin file");
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
static const int N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16, N_L=5;
static bool read_exact(int fd,void*buf,size_t n){
size_t d=0;while(d<n){ssize_t r=read(fd,(char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
}
static bool write_exact(int fd,const void*buf,size_t n){
size_t d=0;while(d<n){ssize_t r=write(fd,(const char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
}
static float* load_npy(const char*p,int n){
FILE*f=fopen(p,"rb");if(!f)return nullptr;
unsigned char h[10];fread(h,1,10,f);
int hl=h[8]|(h[9]<<8);fseek(f,10+hl,SEEK_SET);
float*d=(float*)malloc(n*4);fread(d,4,n,f);fclose(f);return d;
}
static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB
static uint8_t temp_allocator_pool[1024U * 1024U]; // 1MB
int main(int argc, char** argv) {
executorch::runtime::runtime_init();
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_path.empty()) {
fprintf(stderr, "Usage: cp_et_runner --model_path=model.pte\n");
return 1;
}
// Load program
auto loader = executorch::extension::FileDataLoader::from(FLAGS_model_path.c_str());
ET_CHECK_MSG(loader.ok(), "Failed to load %s", FLAGS_model_path.c_str());
auto program = Program::load(&loader.get());
ET_CHECK_MSG(program.ok(), "Failed to parse program");
// Setup memory — allocate planned buffers from program metadata
MemoryAllocator method_allocator(sizeof(method_allocator_pool), method_allocator_pool);
auto temp_allocator = MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool);
auto method_meta = program->method_meta("forward");
ET_CHECK_MSG(method_meta.ok(), "Failed to get method meta");
std::vector<std::unique_ptr<uint8_t[]>> planned_bufs;
std::vector<Span<uint8_t>> planned_spans;
size_t n_planned = method_meta->num_memory_planned_buffers();
for (size_t id = 0; id < n_planned; id++) {
size_t sz = (size_t)method_meta->memory_planned_buffer_size(id).get();
planned_bufs.push_back(std::make_unique<uint8_t[]>(sz));
planned_spans.push_back({planned_bufs.back().get(), sz});
}
HierarchicalAllocator planned_memory({planned_spans.data(), planned_spans.size()});
MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
// Load method
auto method = program->load_method("forward", &memory_manager);
ET_CHECK_MSG(method.ok(), "Failed to load method: 0x%x", (int)method.error());
auto meta = method->method_meta();
fprintf(stderr, "CP_ET: %zu inputs, %zu outputs\n", meta.num_inputs(), meta.num_outputs());
// Load heads, embeddings, rotary
float* heads = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
float* embs_data = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
FILE* fh = fopen(FLAGS_heads_path.c_str(), "rb");
if (fh) { fread(heads, 4, N_CB*N_VOCAB*N_EMBD, fh); fclose(fh); }
FILE* fe = fopen(FLAGS_embs_path.c_str(), "rb");
if (fe) { fread(embs_data, 4, N_CB*N_VOCAB*N_EMBD, fe); fclose(fe); }
float* rcos = load_npy(FLAGS_cos_path.c_str(), 17*HD);
float* rsin = load_npy(FLAGS_sin_path.c_str(), 17*HD);
// Socket setup — TCP if tcp_port > 0, else Unix domain socket
int srv;
if (FLAGS_tcp_port > 0) {
srv = socket(AF_INET, SOCK_STREAM, 0);
int opt = 1; setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
struct sockaddr_in taddr = {}; taddr.sin_family = AF_INET;
taddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
taddr.sin_port = htons(FLAGS_tcp_port);
bind(srv, (struct sockaddr*)&taddr, sizeof(taddr));
listen(srv, 2);
fprintf(stderr, "CP_ET READY on tcp://127.0.0.1:%d\n", FLAGS_tcp_port);
} else {
unlink(FLAGS_sock_path.c_str());
srv = socket(AF_UNIX, SOCK_STREAM, 0);
struct sockaddr_un addr = {}; addr.sun_family = AF_UNIX;
strncpy(addr.sun_path, FLAGS_sock_path.c_str(), sizeof(addr.sun_path)-1);
bind(srv, (struct sockaddr*)&addr, sizeof(addr));
chmod(FLAGS_sock_path.c_str(), 0666);
listen(srv, 1);
fprintf(stderr, "CP_ET READY on %s\n", FLAGS_sock_path.c_str());
}
while (true) {
int cli = accept(srv, nullptr, nullptr);
if (cli < 0) break;
float input[2 * N_EMBD];
while (read_exact(cli, input, sizeof(input))) {
auto t0 = std::chrono::high_resolution_clock::now();
float* hidden_in = input;
float* cb0_emb = input + N_EMBD;
int kv_elem = N_KV * KV_LEN * HD;
std::vector<float> kv(N_L * 2 * kv_elem, 0.0f);
int codes[N_CB] = {};
float* emb = hidden_in;
for (int step = 0; step < 17; step++) {
if (step == 1) emb = cb0_emb;
else if (step >= 2) emb = embs_data + ((step-2)*N_VOCAB + codes[step-2]) * N_EMBD;
// Prepare input tensors (allocates buffers matching the method's expectations)
auto prep = executorch::extension::prepare_input_tensors(method.get());
if (!prep.ok()) { fprintf(stderr, "prep fail %d\n", step); break; }
// Copy our data into the prepared tensors
// Input 0: emb [1,1,1024]
memcpy(method->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, N_EMBD*4);
// Input 1: mask [1,1,1,16]
float* mp = method->mutable_input(1).toTensor().mutable_data_ptr<float>();
for (int p = 0; p < KV_LEN; p++) mp[p] = (p >= KV_LEN-1-step) ? 0.0f : -1e9f;
// Input 2: cos [1,1,128]
memcpy(method->mutable_input(2).toTensor().mutable_data_ptr<float>(), rcos+step*HD, HD*4);
// Input 3: sin [1,1,128]
memcpy(method->mutable_input(3).toTensor().mutable_data_ptr<float>(), rsin+step*HD, HD*4);
// Inputs 4-13: KV caches [1,8,16,128]
for (int l = 0; l < N_L; l++) {
memcpy(method->mutable_input(4+l*2).toTensor().mutable_data_ptr<float>(),
kv.data()+(l*2)*kv_elem, kv_elem*4);
memcpy(method->mutable_input(5+l*2).toTensor().mutable_data_ptr<float>(),
kv.data()+(l*2+1)*kv_elem, kv_elem*4);
}
auto status = method->execute();
if (status != Error::Ok) {
fprintf(stderr, "exec fail step %d: %d\n", step, (int)status);
break;
}
// Get hidden output
const float* h = method->get_output(0).toTensor().const_data_ptr<float>();
// Head argmax on CPU
if (step >= 1 && step-1 < N_CB) {
int cb = step - 1;
const float* W = heads + cb * N_VOCAB * N_EMBD;
int best = 0; float bv = -1e30f;
for (int j = 0; j < N_VOCAB; j++) {
float dot = 0;
for (int k = 0; k < N_EMBD; k++) dot += h[k] * W[j*N_EMBD+k];
if (dot > bv) { bv = dot; best = j; }
}
codes[cb] = best;
}
// Update KV caches from outputs
for (int l = 0; l < N_L; l++) {
const float* ko = method->get_output(1+l*2).toTensor().const_data_ptr<float>();
const float* vo = method->get_output(2+l*2).toTensor().const_data_ptr<float>();
memcpy(kv.data()+(l*2)*kv_elem, ko, kv_elem*4);
memcpy(kv.data()+(l*2+1)*kv_elem, vo, kv_elem*4);
}
}
auto t1 = std::chrono::high_resolution_clock::now();
float ms = std::chrono::duration<float, std::milli>(t1-t0).count();
write_exact(cli, codes, sizeof(codes));
write_exact(cli, &ms, sizeof(ms));
}
close(cli);
}
free(heads); free(embs_data); free(rcos); free(rsin);
close(srv); unlink(FLAGS_sock_path.c_str());
return 0;
}

View File

@ -0,0 +1,122 @@
/**
* CP ET Test Client reads batch input file, sends to cp_et_runner socket,
* collects output codes. Runs ON DEVICE as root to avoid adb forward issues.
*
* Usage: cp_et_test_client --input=/path/input.bin --output=/path/output.bin
* --sock_path=/data/local/tmp/kazeia/cp_et.sock
*
* Input format: int32 n_frames, then per frame: float32[1024] hidden + float32[1024] cb0_emb
* Output format: int32 n_frames, then per frame: int32[15] codes + float32 timing_ms
*/
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
static bool read_exact(int fd, void* buf, size_t n) {
size_t d = 0;
while (d < n) {
ssize_t r = read(fd, (char*)buf + d, n - d);
if (r <= 0) return false;
d += r;
}
return true;
}
static bool write_exact(int fd, const void* buf, size_t n) {
size_t d = 0;
while (d < n) {
ssize_t r = write(fd, (const char*)buf + d, n - d);
if (r <= 0) return false;
d += r;
}
return true;
}
int main(int argc, char** argv) {
const char* input_path = nullptr;
const char* output_path = nullptr;
const char* sock_path = "/data/local/tmp/kazeia/cp_et.sock";
for (int i = 1; i < argc; i++) {
if (strncmp(argv[i], "--input=", 8) == 0) input_path = argv[i] + 8;
else if (strncmp(argv[i], "--output=", 9) == 0) output_path = argv[i] + 9;
else if (strncmp(argv[i], "--sock_path=", 12) == 0) sock_path = argv[i] + 12;
}
if (!input_path || !output_path) {
fprintf(stderr, "Usage: %s --input=in.bin --output=out.bin [--sock_path=...]\n", argv[0]);
return 1;
}
// Read input file
FILE* fin = fopen(input_path, "rb");
if (!fin) { fprintf(stderr, "Cannot open %s\n", input_path); return 1; }
int32_t n_frames;
fread(&n_frames, 4, 1, fin);
fprintf(stderr, "Frames: %d\n", n_frames);
const int N_EMBD = 1024;
float* inputs = (float*)malloc(n_frames * 2 * N_EMBD * sizeof(float));
fread(inputs, sizeof(float), n_frames * 2 * N_EMBD, fin);
fclose(fin);
// Connect to socket
int sock = socket(AF_UNIX, SOCK_STREAM, 0);
if (sock < 0) { perror("socket"); return 1; }
struct sockaddr_un addr = {};
addr.sun_family = AF_UNIX;
strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1);
if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
perror("connect");
return 1;
}
fprintf(stderr, "Connected to %s\n", sock_path);
// Process frames
FILE* fout = fopen(output_path, "wb");
fwrite(&n_frames, 4, 1, fout);
float total_ms = 0;
for (int i = 0; i < n_frames; i++) {
float* frame = inputs + i * 2 * N_EMBD;
// Send 8192 bytes
if (!write_exact(sock, frame, 2 * N_EMBD * sizeof(float))) {
fprintf(stderr, "Write failed at frame %d\n", i);
break;
}
// Read 64 bytes: 15 ints + 1 float
int32_t codes[15];
float timing;
if (!read_exact(sock, codes, sizeof(codes))) {
fprintf(stderr, "Read codes failed at frame %d\n", i);
break;
}
if (!read_exact(sock, &timing, sizeof(timing))) {
fprintf(stderr, "Read timing failed at frame %d\n", i);
break;
}
fwrite(codes, sizeof(int32_t), 15, fout);
fwrite(&timing, sizeof(float), 1, fout);
total_ms += timing;
fprintf(stderr, " Frame %d: %.1fms codes=[%d,%d,%d,...]\n",
i, timing, codes[0], codes[1], codes[2]);
}
fclose(fout);
close(sock);
free(inputs);
fprintf(stderr, "Done! Total: %.0fms (%.1fms/frame)\n", total_ms, total_ms / n_frames);
return 0;
}

View File

@ -0,0 +1,367 @@
/**
* Native TTS pipeline: talker + CP autoregressive loop in C++.
* One JNI call runs the entire generation returns all codebook codes.
*/
#include <jni.h>
#include <arm_neon.h>
#include <android/log.h>
#include <cstring>
#include <cstdlib>
#include <cfloat>
#include <cmath>
#include <chrono>
#include <memory>
#include <vector>
#include <unordered_set>
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/runner_util/inputs.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/runtime.h>
#define TAG "TtsPipeline"
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::Program;
using executorch::runtime::Span;
static const int DIM=1024, VOCAB=3072, CB_SIZE=2048, NUM_CB=16;
static const int T_L=28, T_KV=8, T_HD=128, T_KV_LEN=100;
static const int C_L=5, C_KV=8, C_HD=128, C_KV_LEN=16;
static const int CODEC_EOS=2150;
static inline float dot_neon(const float* a, const float* b, int n) {
float32x4_t s0=vdupq_n_f32(0),s1=vdupq_n_f32(0),s2=vdupq_n_f32(0),s3=vdupq_n_f32(0);
int i=0;
for(;i+15<n;i+=16){
s0=vfmaq_f32(s0,vld1q_f32(a+i),vld1q_f32(b+i));
s1=vfmaq_f32(s1,vld1q_f32(a+i+4),vld1q_f32(b+i+4));
s2=vfmaq_f32(s2,vld1q_f32(a+i+8),vld1q_f32(b+i+8));
s3=vfmaq_f32(s3,vld1q_f32(a+i+12),vld1q_f32(b+i+12));
}
float r=vaddvq_f32(vaddq_f32(vaddq_f32(s0,s1),vaddq_f32(s2,s3)));
for(;i<n;i++) r+=a[i]*b[i];
return r;
}
static int argmax_head(const float*h,const float*W,int vocab,int dim){
int best=0;float bv=-FLT_MAX;
for(int j=0;j<vocab;j++){float d=dot_neon(h,W+j*dim,dim);if(d>bv){bv=d;best=j;}}
return best;
}
// Top-k sampling with temperature
static int sample_topk(const float* logits, int vocab, float temp, int k) {
// Find top-k
struct IV { int i; float v; };
std::vector<IV> topk(k, {0, -FLT_MAX});
for (int i = 0; i < vocab; i++) {
if (logits[i] > topk[k-1].v) {
topk[k-1] = {i, logits[i]};
// Bubble up
for (int j = k-2; j >= 0; j--) {
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
else break;
}
}
}
// Softmax with temperature
float maxv = topk[0].v;
float sum = 0;
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
// Sample
float r = (float)rand() / RAND_MAX * sum;
float acc = 0;
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
return topk[0].i;
}
struct PipelineState {
std::unique_ptr<executorch::extension::FileDataLoader> tLoader, cLoader;
std::unique_ptr<Program> tProg, cProg;
std::unique_ptr<MemoryManager> tMM, cMM;
Method* talker = nullptr;
Method* cp = nullptr;
std::vector<std::unique_ptr<uint8_t[]>> tBufs, cBufs;
bool loaded = false;
};
static PipelineState* gState = nullptr;
static uint8_t tMethodPool[8*1024*1024], tTempPool[2*1024*1024];
static uint8_t cMethodPool[4*1024*1024], cTempPool[1*1024*1024];
static Method* loadModel(const char* path,
std::unique_ptr<executorch::extension::FileDataLoader>& loader,
std::unique_ptr<Program>& program,
std::unique_ptr<MemoryManager>& mm,
std::vector<std::unique_ptr<uint8_t[]>>& bufs,
uint8_t* mp, size_t mps, uint8_t* tp, size_t tps)
{
auto ld = executorch::extension::FileDataLoader::from(path);
if(!ld.ok()) return nullptr;
loader=std::make_unique<executorch::extension::FileDataLoader>(std::move(ld.get()));
auto prog=Program::load(&*loader); if(!prog.ok()) return nullptr;
program=std::make_unique<Program>(std::move(prog.get()));
auto meta=program->method_meta("forward"); if(!meta.ok()) return nullptr;
std::vector<Span<uint8_t>> spans;
for(size_t i=0;i<meta->num_memory_planned_buffers();i++){
size_t sz=(size_t)meta->memory_planned_buffer_size(i).get();
bufs.push_back(std::make_unique<uint8_t[]>(sz));
spans.push_back({bufs.back().get(),sz});
}
auto*ma=new MemoryAllocator(mps,mp);
auto*ta=new MemoryAllocator(tps,tp);
auto*ha=new HierarchicalAllocator({spans.data(),spans.size()});
mm=std::unique_ptr<MemoryManager>(new MemoryManager(ma,ha,ta));
auto method=program->load_method("forward",mm.get());
if(!method.ok()) return nullptr;
return new Method(std::move(method.get()));
}
extern "C" {
JNIEXPORT jboolean JNICALL
Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring jCP){
executorch::runtime::runtime_init();
if(gState&&gState->loaded) return JNI_TRUE;
const char*tp=env->GetStringUTFChars(jTP,nullptr);
const char*cp=env->GetStringUTFChars(jCP,nullptr);
gState=new PipelineState();
LOGI("Loading talker+CP...");
auto t0=std::chrono::high_resolution_clock::now();
gState->talker=loadModel(tp,gState->tLoader,gState->tProg,gState->tMM,gState->tBufs,tMethodPool,sizeof(tMethodPool),tTempPool,sizeof(tTempPool));
gState->cp=loadModel(cp,gState->cLoader,gState->cProg,gState->cMM,gState->cBufs,cMethodPool,sizeof(cMethodPool),cTempPool,sizeof(cTempPool));
env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
gState->loaded=true;
// Warmup both
{auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
{auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
auto t1=std::chrono::high_resolution_clock::now();
LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
return JNI_TRUE;
}
JNIEXPORT void JNICALL
Java_com_kazeia_tts_TtsPipeline_nativeDestroy(JNIEnv*,jclass){
if(gState){delete gState->talker;delete gState->cp;delete gState;gState=nullptr;}
}
// Helper: run one talker step
static void talkerStep(Method&m, const float*emb, float*mask, int pos,
const float*tCos, const float*tSin, float*tK, float*tV,
float*outHidden, float*outLogits)
{
int kvElem = T_KV * T_KV_LEN * T_HD;
auto prep = executorch::extension::prepare_input_tensors(m);
memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
memcpy(m.mutable_input(1).toTensor().mutable_data_ptr<float>(), mask, T_KV_LEN*4);
int pi = std::min(pos, 249);
memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), tCos+pi*T_HD, T_HD*4);
memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), tSin+pi*T_HD, T_HD*4);
for(int i=0;i<T_L;i++){
memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), tK+i*kvElem, kvElem*4);
memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), tV+i*kvElem, kvElem*4);
}
m.execute();
memcpy(outHidden, m.get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
memcpy(outLogits, m.get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
for(int i=0;i<T_L;i++){
memcpy(tK+i*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
memcpy(tV+i*kvElem, m.get_output(3+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
}
}
// Helper: run full CP (17 steps) → 15 codes
static void cpForward(Method&m, const float*hidden, int cb0,
const float*codecEmb, const float*cpEmbs, const float*cpHeads,
const float*cCos, const float*cSin, int*codes)
{
int kvElem = C_KV * C_KV_LEN * C_HD;
std::vector<float> kv(C_L*2*kvElem, 0.0f);
for(int step=0;step<17;step++){
const float*emb;
if(step==0) emb=hidden;
else if(step==1) emb=codecEmb + std::min(std::max(cb0,0),VOCAB-1)*DIM;
else emb=cpEmbs + ((step-2)*CB_SIZE + std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM;
auto prep=executorch::extension::prepare_input_tensors(m);
memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
// Mask
float*mp=m.mutable_input(1).toTensor().mutable_data_ptr<float>();
for(int p=0;p<C_KV_LEN;p++) mp[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), cCos+step*C_HD, C_HD*4);
memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), cSin+step*C_HD, C_HD*4);
for(int i=0;i<C_L;i++){
memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2)*kvElem, kvElem*4);
memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2+1)*kvElem, kvElem*4);
}
m.execute();
const float*h=m.get_output(0).toTensor().const_data_ptr<float>();
if(step>=1&&step-1<15){
codes[step-1]=argmax_head(h, cpHeads+(step-1)*CB_SIZE*DIM, CB_SIZE, DIM);
}
for(int i=0;i<C_L;i++){
memcpy(kv.data()+(i*2)*kvElem, m.get_output(1+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
memcpy(kv.data()+(i*2+1)*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
}
}
}
JNIEXPORT jintArray JNICALL
Java_com_kazeia_tts_TtsPipeline_nativeRun(
JNIEnv*env,jclass,
jfloatArray jPrefill,jint nPrefill,
jfloatArray jTrailing,jint nTrailing,
jfloatArray jCodecEmb, jfloatArray jCpEmbs, jfloatArray jCpHeads,
jfloatArray jTCos,jfloatArray jTSin, jfloatArray jCCos,jfloatArray jCSin,
jint maxTokens)
{
if(!gState||!gState->loaded) return nullptr;
auto T0=std::chrono::high_resolution_clock::now();
// Copy all data from JNI (then release immediately)
int prefillSize = env->GetArrayLength(jPrefill);
std::vector<float> prefill(prefillSize);
env->GetFloatArrayRegion(jPrefill, 0, prefillSize, prefill.data());
std::vector<float> trailingData;
if(nTrailing>0){trailingData.resize(nTrailing*DIM);env->GetFloatArrayRegion(jTrailing,0,nTrailing*DIM,trailingData.data());}
int codecSize=env->GetArrayLength(jCodecEmb);
std::vector<float> codecEmb(codecSize);
env->GetFloatArrayRegion(jCodecEmb,0,codecSize,codecEmb.data());
int cpEmbsSize=env->GetArrayLength(jCpEmbs);
std::vector<float> cpEmbs(cpEmbsSize);
env->GetFloatArrayRegion(jCpEmbs,0,cpEmbsSize,cpEmbs.data());
int headsSize=env->GetArrayLength(jCpHeads);
std::vector<float> cpHeads(headsSize);
env->GetFloatArrayRegion(jCpHeads,0,headsSize,cpHeads.data());
int tcSize=env->GetArrayLength(jTCos);
std::vector<float> tCos(tcSize),tSin(tcSize);
env->GetFloatArrayRegion(jTCos,0,tcSize,tCos.data());
env->GetFloatArrayRegion(jTSin,0,tcSize,tSin.data());
int ccSize=env->GetArrayLength(jCCos);
std::vector<float> cCos(ccSize),cSin(ccSize);
env->GetFloatArrayRegion(jCCos,0,ccSize,cCos.data());
env->GetFloatArrayRegion(jCSin,0,ccSize,cSin.data());
// Pipeline state
int tkvElem=T_KV*T_KV_LEN*T_HD;
std::vector<float> tK(T_L*tkvElem,0), tV(T_L*tkvElem,0);
float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
float hidden[DIM]={}, logits[VOCAB]={};
std::vector<int> allCodes; // flat: numTokens × 16
std::vector<int> cb0History;
int pos=0, currentCb0=-1;
// ===== PREFILL =====
auto tP0=std::chrono::high_resolution_clock::now();
for(int step=0;step<nPrefill;step++){
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
if(mi>=0) mask[mi]=0.0f;
talkerStep(*gState->talker, prefill.data()+step*DIM, mask, pos, tCos.data(),tSin.data(),
tK.data(), tV.data(), hidden, logits);
pos++;
if(step==nPrefill-1){
for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
currentCb0=sample_topk(logits,VOCAB,0.9f,50);
}
}
auto tP1=std::chrono::high_resolution_clock::now();
LOGI("Prefill: %.0fms, %d steps, cb0=%d",
std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
// ===== GENERATION =====
float totalTalkerMs=0, totalCpMs=0;
int trailingIdx=0;
// Pad embedding (zeros + pad token is not available here, use zeros)
float padEmb[DIM]={}; // In practice should be tts_pad_embed, passed as param
for(int gen=0;gen<maxTokens;gen++){
int codes[NUM_CB]={}; codes[0]=currentCb0;
// CP
auto tc0=std::chrono::high_resolution_clock::now();
int cpCodes[15]={};
cpForward(*gState->cp, hidden, currentCb0, codecEmb.data(), cpEmbs.data(), cpHeads.data(),
cCos.data(), cSin.data(), cpCodes);
auto tc1=std::chrono::high_resolution_clock::now();
totalCpMs+=std::chrono::duration<float,std::milli>(tc1-tc0).count();
for(int i=0;i<15;i++) codes[i+1]=cpCodes[i];
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
cb0History.push_back(currentCb0);
// Build next talker input: sum codec embeddings
float nextEmb[DIM]={};
// cb0 embedding
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
// cb1-15 embeddings
for(int cb=0;cb<15;cb++){
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
}
// Add trailing text or pad
if(trailingIdx<nTrailing){
const float*te=trailingData.data()+trailingIdx*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
trailingIdx++;
}
// (pad embedding = zeros, already added implicitly)
// Talker step
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
if(mi>=0) mask[mi]=0.0f;
auto tt0=std::chrono::high_resolution_clock::now();
talkerStep(*gState->talker, nextEmb, mask, pos, tCos.data(),tSin.data(),
tK.data(), tV.data(), hidden, logits);
auto tt1=std::chrono::high_resolution_clock::now();
totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
pos++;
// Sample next cb0 (suppress non-codec, repetition penalty)
for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
int nextCb0=sample_topk(logits,VOCAB,0.9f,50);
if(nextCb0==CODEC_EOS){LOGI("EOS at step %d",gen+2);break;}
// Degeneration check
int histSz=(int)cb0History.size();
if(histSz>=9){
bool degen=true;
for(int i=histSz-9;i<histSz;i++) if(cb0History[i]!=nextCb0){degen=false;break;}
if(degen){LOGI("Degeneration at step %d",gen+2);break;}
}
currentCb0=nextCb0;
}
int nTokens=(int)allCodes.size()/NUM_CB;
auto T1=std::chrono::high_resolution_clock::now();
LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),
totalCpMs, totalCpMs/std::max(nTokens,1),
std::chrono::duration<float,std::milli>(T1-T0).count());
jintArray result=env->NewIntArray((int)allCodes.size());
env->SetIntArrayRegion(result,0,(int)allCodes.size(),allCodes.data());
return result;
}
} // extern "C"

View File

@ -130,6 +130,7 @@ class Qwen3TtsEngine(
private var cpAllHeads: FloatArray? = null // all 15 heads concatenated [15*2048*1024] for batch NEON
private var cpPteModule: org.pytorch.executorch.Module? = null // ExecuTorch CP on NPU (JNI)
private var talkerPteModule: org.pytorch.executorch.Module? = null // ExecuTorch talker on NPU (JNI)
private var nativePipelineReady: Boolean = false // C++ native pipeline available
private var talkerPteRotaryCos: FloatArray? = null
private var talkerPteRotarySin: FloatArray? = null
private var useEtCp: Boolean = false // CP via ExecuTorch runner process (root)
@ -307,6 +308,19 @@ class Qwen3TtsEngine(
cpPteModule!!.forward(*cIns.toTypedArray())
nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
} catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
// Init native C++ pipeline (loads models with own ExecuTorch runtime)
try {
val tn = System.currentTimeMillis()
nativePipelineReady = TtsPipeline.nativeInit(
"/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
"/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
)
nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
} catch (e: Exception) {
nlog("Native pipeline init failed: ${e.message}")
nativePipelineReady = false
}
}
} catch (e: Exception) {
nlog("Talker .pte JNI failed: ${e.message}")
@ -2272,23 +2286,47 @@ class Qwen3TtsEngine(
val embeds = Array(nTotal) { FloatArray(TALKER_DIM).also { arr -> for (j in 0 until TALKER_DIM) arr[j] = bb.float } }
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
// Build textEmbedsList: first nPrefill are "prefill", rest are trailing
val textEmbedsList = embeds.toList()
val allCodesArray = runInterleavedPte(textEmbedsList.subList(0, 1), maxGenTokens = nTotal - nPrefill)
// Note: runInterleavedPte handles prefill internally via buildPrefillEmbeddings
val allCodes: Array<IntArray>
if (nativePipelineReady) {
// Native C++ pipeline — zero Java overhead
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
val nTrailing = nTotal - nPrefill
val trailingFlat = if (nTrailing > 0) FloatArray(nTrailing * TALKER_DIM).also { arr ->
for (i in 0 until nTrailing) System.arraycopy(embeds[nPrefill + i], 0, arr, i * TALKER_DIM, TALKER_DIM)
} else null
// Actually, generateFromEmbeds uses pre-computed embeds directly.
// Let's use runInterleavedPte properly by passing all embeds as textEmbedsList
// runInterleavedPte's buildPrefillEmbeddings will create the prefill from first embed
// But we need a different approach: pass all embeds directly.
// Load CP heads if not already
if (cpAllHeads == null) {
val headsFile = java.io.File("/data/local/tmp/kazeia/models/cp_heads.bin")
if (headsFile.exists()) {
val hb = headsFile.readBytes()
cpAllHeads = FloatArray(hb.size / 4)
ByteBuffer.wrap(hb).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(cpAllHeads!!)
}
}
// Simpler: use the run_pipeline code path which goes through generateSpeech → runInterleavedGeneration
// For now, let's just call runInterleavedPte with the right embeds structure
nlog("Running native C++ pipeline...")
val flat = TtsPipeline.nativeRun(
prefillFlat, nPrefill,
trailingFlat, nTrailing,
codecEmbedding ?: FloatArray(0),
cpEmbeddings ?: FloatArray(0),
cpAllHeads ?: FloatArray(0),
talkerPteRotaryCos ?: FloatArray(0), talkerPteRotarySin ?: FloatArray(0),
cpRotaryCos ?: FloatArray(0), cpRotarySin ?: FloatArray(0),
nTotal - nPrefill
)
if (flat == null || flat.isEmpty()) return ShortArray(0)
val nTokens = flat.size / NUM_CODEBOOKS
allCodes = Array(nTokens) { t -> IntArray(NUM_CODEBOOKS) { cb -> flat[t * NUM_CODEBOOKS + cb] } }
nlog("Native pipeline: $nTokens tokens")
} else {
// Fallback: Java pipeline
val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList()
val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList()
// Call the PTE pipeline directly
val allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
}
if (allCodes.isEmpty()) return ShortArray(0)
val numRealTokens = allCodes.size

View File

@ -0,0 +1,27 @@
package com.kazeia.tts
/** Native C++ TTS pipeline: talker + CP loop with zero Java overhead. */
object TtsPipeline {
init { System.loadLibrary("tts_pipeline") }
/** Load and warmup both .pte models. Returns true on success. */
external fun nativeInit(talkerPath: String, cpPath: String): Boolean
/** Release native resources. */
external fun nativeDestroy()
/**
* Run full pipeline. Returns flat int array: [numTokens × 16] codebook codes.
* Each group of 16 ints = [CB0, CB1, ..., CB15] for one time step.
*/
external fun nativeRun(
prefillEmbeds: FloatArray, nPrefill: Int,
trailingEmbeds: FloatArray?, nTrailing: Int,
codecEmbedding: FloatArray,
cpEmbeddings: FloatArray,
cpHeads: FloatArray,
talkerCos: FloatArray, talkerSin: FloatArray,
cpCos: FloatArray, cpSin: FloatArray,
maxTokens: Int
): IntArray?
}

View File

@ -37,6 +37,10 @@ target_include_directories(whisper_jni PRIVATE
target_link_libraries(whisper_jni whisper ggml ggml-base ggml-cpu android log)
target_compile_options(whisper_jni PRIVATE -std=c++17 -O2)
# --- TTS Pipeline: built externally via ExecuTorch cmake, copied to jniLibs ---
# Build with: cd /opt/Kazeia/executorch/build-android && cmake --build . --target tts_pipeline_jni -j$(nproc)
# Then copy to jniLibs/arm64-v8a/libtts_pipeline.so
# --- NEON optimized ops for TTS heads ---
add_library(neon_ops SHARED neon_ops.cpp)
target_link_libraries(neon_ops log)