Native C++ pipeline: RTF 1.4 (was 3.6 in Java)
Full talker+CP autoregressive loop in C++ via JNI. Talker 20ms/step, CP 44ms/step, total 6.6s for 4.64s audio. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fb6045a635
commit
393ce79eb5
|
|
@ -0,0 +1,222 @@
|
|||
/**
|
||||
* TTS Code Predictor Runner — ExecuTorch .pte on NPU HTP.
|
||||
* Based on executor_runner.cpp but with socket IPC for the app.
|
||||
* Same protocol as the GGUF CP runner.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <gflags/gflags.h>
|
||||
|
||||
#include <executorch/extension/data_loader/file_data_loader.h>
|
||||
#include <executorch/extension/runner_util/inputs.h>
|
||||
#include <executorch/runtime/executor/method.h>
|
||||
#include <executorch/runtime/executor/program.h>
|
||||
#include <executorch/runtime/platform/runtime.h>
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
|
||||
DEFINE_string(model_path, "", "Path to .pte file");
|
||||
DEFINE_string(sock_path, "/data/local/tmp/kazeia/cp_et.sock", "Socket path");
|
||||
DEFINE_int32(tcp_port, 8790, "TCP port (0=disabled, use unix socket)");
|
||||
DEFINE_string(heads_path, "/data/local/tmp/kazeia/models/cp_heads.bin", "Heads file");
|
||||
DEFINE_string(embs_path, "/data/local/tmp/kazeia/models/cp_codec_embs.bin", "Codec embs file");
|
||||
DEFINE_string(cos_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_cos.npy", "Cos file");
|
||||
DEFINE_string(sin_path, "/data/local/tmp/kazeia/models/qwen3-tts-npu/cp_kv_v2/cp_rotary_sin.npy", "Sin file");
|
||||
|
||||
using executorch::runtime::Error;
|
||||
using executorch::runtime::EValue;
|
||||
using executorch::runtime::HierarchicalAllocator;
|
||||
using executorch::runtime::MemoryAllocator;
|
||||
using executorch::runtime::MemoryManager;
|
||||
using executorch::runtime::Method;
|
||||
using executorch::runtime::Program;
|
||||
using executorch::runtime::Result;
|
||||
using executorch::runtime::Span;
|
||||
|
||||
static const int N_EMBD=1024, N_VOCAB=2048, N_CB=15, N_KV=8, HD=128, KV_LEN=16, N_L=5;
|
||||
|
||||
static bool read_exact(int fd,void*buf,size_t n){
|
||||
size_t d=0;while(d<n){ssize_t r=read(fd,(char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
|
||||
}
|
||||
static bool write_exact(int fd,const void*buf,size_t n){
|
||||
size_t d=0;while(d<n){ssize_t r=write(fd,(const char*)buf+d,n-d);if(r<=0)return false;d+=r;}return true;
|
||||
}
|
||||
|
||||
static float* load_npy(const char*p,int n){
|
||||
FILE*f=fopen(p,"rb");if(!f)return nullptr;
|
||||
unsigned char h[10];fread(h,1,10,f);
|
||||
int hl=h[8]|(h[9]<<8);fseek(f,10+hl,SEEK_SET);
|
||||
float*d=(float*)malloc(n*4);fread(d,4,n,f);fclose(f);return d;
|
||||
}
|
||||
|
||||
static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB
|
||||
static uint8_t temp_allocator_pool[1024U * 1024U]; // 1MB
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
executorch::runtime::runtime_init();
|
||||
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
if (FLAGS_model_path.empty()) {
|
||||
fprintf(stderr, "Usage: cp_et_runner --model_path=model.pte\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load program
|
||||
auto loader = executorch::extension::FileDataLoader::from(FLAGS_model_path.c_str());
|
||||
ET_CHECK_MSG(loader.ok(), "Failed to load %s", FLAGS_model_path.c_str());
|
||||
|
||||
auto program = Program::load(&loader.get());
|
||||
ET_CHECK_MSG(program.ok(), "Failed to parse program");
|
||||
|
||||
// Setup memory — allocate planned buffers from program metadata
|
||||
MemoryAllocator method_allocator(sizeof(method_allocator_pool), method_allocator_pool);
|
||||
auto temp_allocator = MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool);
|
||||
|
||||
auto method_meta = program->method_meta("forward");
|
||||
ET_CHECK_MSG(method_meta.ok(), "Failed to get method meta");
|
||||
|
||||
std::vector<std::unique_ptr<uint8_t[]>> planned_bufs;
|
||||
std::vector<Span<uint8_t>> planned_spans;
|
||||
size_t n_planned = method_meta->num_memory_planned_buffers();
|
||||
for (size_t id = 0; id < n_planned; id++) {
|
||||
size_t sz = (size_t)method_meta->memory_planned_buffer_size(id).get();
|
||||
planned_bufs.push_back(std::make_unique<uint8_t[]>(sz));
|
||||
planned_spans.push_back({planned_bufs.back().get(), sz});
|
||||
}
|
||||
HierarchicalAllocator planned_memory({planned_spans.data(), planned_spans.size()});
|
||||
MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
|
||||
|
||||
// Load method
|
||||
auto method = program->load_method("forward", &memory_manager);
|
||||
ET_CHECK_MSG(method.ok(), "Failed to load method: 0x%x", (int)method.error());
|
||||
|
||||
auto meta = method->method_meta();
|
||||
fprintf(stderr, "CP_ET: %zu inputs, %zu outputs\n", meta.num_inputs(), meta.num_outputs());
|
||||
|
||||
// Load heads, embeddings, rotary
|
||||
float* heads = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
|
||||
float* embs_data = (float*)malloc(N_CB * N_VOCAB * N_EMBD * 4);
|
||||
FILE* fh = fopen(FLAGS_heads_path.c_str(), "rb");
|
||||
if (fh) { fread(heads, 4, N_CB*N_VOCAB*N_EMBD, fh); fclose(fh); }
|
||||
FILE* fe = fopen(FLAGS_embs_path.c_str(), "rb");
|
||||
if (fe) { fread(embs_data, 4, N_CB*N_VOCAB*N_EMBD, fe); fclose(fe); }
|
||||
float* rcos = load_npy(FLAGS_cos_path.c_str(), 17*HD);
|
||||
float* rsin = load_npy(FLAGS_sin_path.c_str(), 17*HD);
|
||||
|
||||
// Socket setup — TCP if tcp_port > 0, else Unix domain socket
|
||||
int srv;
|
||||
if (FLAGS_tcp_port > 0) {
|
||||
srv = socket(AF_INET, SOCK_STREAM, 0);
|
||||
int opt = 1; setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
|
||||
struct sockaddr_in taddr = {}; taddr.sin_family = AF_INET;
|
||||
taddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
|
||||
taddr.sin_port = htons(FLAGS_tcp_port);
|
||||
bind(srv, (struct sockaddr*)&taddr, sizeof(taddr));
|
||||
listen(srv, 2);
|
||||
fprintf(stderr, "CP_ET READY on tcp://127.0.0.1:%d\n", FLAGS_tcp_port);
|
||||
} else {
|
||||
unlink(FLAGS_sock_path.c_str());
|
||||
srv = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
struct sockaddr_un addr = {}; addr.sun_family = AF_UNIX;
|
||||
strncpy(addr.sun_path, FLAGS_sock_path.c_str(), sizeof(addr.sun_path)-1);
|
||||
bind(srv, (struct sockaddr*)&addr, sizeof(addr));
|
||||
chmod(FLAGS_sock_path.c_str(), 0666);
|
||||
listen(srv, 1);
|
||||
fprintf(stderr, "CP_ET READY on %s\n", FLAGS_sock_path.c_str());
|
||||
}
|
||||
|
||||
while (true) {
|
||||
int cli = accept(srv, nullptr, nullptr);
|
||||
if (cli < 0) break;
|
||||
|
||||
float input[2 * N_EMBD];
|
||||
while (read_exact(cli, input, sizeof(input))) {
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
float* hidden_in = input;
|
||||
float* cb0_emb = input + N_EMBD;
|
||||
int kv_elem = N_KV * KV_LEN * HD;
|
||||
std::vector<float> kv(N_L * 2 * kv_elem, 0.0f);
|
||||
int codes[N_CB] = {};
|
||||
float* emb = hidden_in;
|
||||
|
||||
for (int step = 0; step < 17; step++) {
|
||||
if (step == 1) emb = cb0_emb;
|
||||
else if (step >= 2) emb = embs_data + ((step-2)*N_VOCAB + codes[step-2]) * N_EMBD;
|
||||
|
||||
// Prepare input tensors (allocates buffers matching the method's expectations)
|
||||
auto prep = executorch::extension::prepare_input_tensors(method.get());
|
||||
if (!prep.ok()) { fprintf(stderr, "prep fail %d\n", step); break; }
|
||||
|
||||
// Copy our data into the prepared tensors
|
||||
// Input 0: emb [1,1,1024]
|
||||
memcpy(method->mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, N_EMBD*4);
|
||||
// Input 1: mask [1,1,1,16]
|
||||
float* mp = method->mutable_input(1).toTensor().mutable_data_ptr<float>();
|
||||
for (int p = 0; p < KV_LEN; p++) mp[p] = (p >= KV_LEN-1-step) ? 0.0f : -1e9f;
|
||||
// Input 2: cos [1,1,128]
|
||||
memcpy(method->mutable_input(2).toTensor().mutable_data_ptr<float>(), rcos+step*HD, HD*4);
|
||||
// Input 3: sin [1,1,128]
|
||||
memcpy(method->mutable_input(3).toTensor().mutable_data_ptr<float>(), rsin+step*HD, HD*4);
|
||||
// Inputs 4-13: KV caches [1,8,16,128]
|
||||
for (int l = 0; l < N_L; l++) {
|
||||
memcpy(method->mutable_input(4+l*2).toTensor().mutable_data_ptr<float>(),
|
||||
kv.data()+(l*2)*kv_elem, kv_elem*4);
|
||||
memcpy(method->mutable_input(5+l*2).toTensor().mutable_data_ptr<float>(),
|
||||
kv.data()+(l*2+1)*kv_elem, kv_elem*4);
|
||||
}
|
||||
|
||||
auto status = method->execute();
|
||||
if (status != Error::Ok) {
|
||||
fprintf(stderr, "exec fail step %d: %d\n", step, (int)status);
|
||||
break;
|
||||
}
|
||||
|
||||
// Get hidden output
|
||||
const float* h = method->get_output(0).toTensor().const_data_ptr<float>();
|
||||
|
||||
// Head argmax on CPU
|
||||
if (step >= 1 && step-1 < N_CB) {
|
||||
int cb = step - 1;
|
||||
const float* W = heads + cb * N_VOCAB * N_EMBD;
|
||||
int best = 0; float bv = -1e30f;
|
||||
for (int j = 0; j < N_VOCAB; j++) {
|
||||
float dot = 0;
|
||||
for (int k = 0; k < N_EMBD; k++) dot += h[k] * W[j*N_EMBD+k];
|
||||
if (dot > bv) { bv = dot; best = j; }
|
||||
}
|
||||
codes[cb] = best;
|
||||
}
|
||||
|
||||
// Update KV caches from outputs
|
||||
for (int l = 0; l < N_L; l++) {
|
||||
const float* ko = method->get_output(1+l*2).toTensor().const_data_ptr<float>();
|
||||
const float* vo = method->get_output(2+l*2).toTensor().const_data_ptr<float>();
|
||||
memcpy(kv.data()+(l*2)*kv_elem, ko, kv_elem*4);
|
||||
memcpy(kv.data()+(l*2+1)*kv_elem, vo, kv_elem*4);
|
||||
}
|
||||
}
|
||||
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
float ms = std::chrono::duration<float, std::milli>(t1-t0).count();
|
||||
write_exact(cli, codes, sizeof(codes));
|
||||
write_exact(cli, &ms, sizeof(ms));
|
||||
}
|
||||
close(cli);
|
||||
}
|
||||
|
||||
free(heads); free(embs_data); free(rcos); free(rsin);
|
||||
close(srv); unlink(FLAGS_sock_path.c_str());
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
/**
|
||||
* CP ET Test Client — reads batch input file, sends to cp_et_runner socket,
|
||||
* collects output codes. Runs ON DEVICE as root to avoid adb forward issues.
|
||||
*
|
||||
* Usage: cp_et_test_client --input=/path/input.bin --output=/path/output.bin
|
||||
* --sock_path=/data/local/tmp/kazeia/cp_et.sock
|
||||
*
|
||||
* Input format: int32 n_frames, then per frame: float32[1024] hidden + float32[1024] cb0_emb
|
||||
* Output format: int32 n_frames, then per frame: int32[15] codes + float32 timing_ms
|
||||
*/
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <unistd.h>
|
||||
|
||||
static bool read_exact(int fd, void* buf, size_t n) {
|
||||
size_t d = 0;
|
||||
while (d < n) {
|
||||
ssize_t r = read(fd, (char*)buf + d, n - d);
|
||||
if (r <= 0) return false;
|
||||
d += r;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool write_exact(int fd, const void* buf, size_t n) {
|
||||
size_t d = 0;
|
||||
while (d < n) {
|
||||
ssize_t r = write(fd, (const char*)buf + d, n - d);
|
||||
if (r <= 0) return false;
|
||||
d += r;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
const char* input_path = nullptr;
|
||||
const char* output_path = nullptr;
|
||||
const char* sock_path = "/data/local/tmp/kazeia/cp_et.sock";
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strncmp(argv[i], "--input=", 8) == 0) input_path = argv[i] + 8;
|
||||
else if (strncmp(argv[i], "--output=", 9) == 0) output_path = argv[i] + 9;
|
||||
else if (strncmp(argv[i], "--sock_path=", 12) == 0) sock_path = argv[i] + 12;
|
||||
}
|
||||
|
||||
if (!input_path || !output_path) {
|
||||
fprintf(stderr, "Usage: %s --input=in.bin --output=out.bin [--sock_path=...]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Read input file
|
||||
FILE* fin = fopen(input_path, "rb");
|
||||
if (!fin) { fprintf(stderr, "Cannot open %s\n", input_path); return 1; }
|
||||
|
||||
int32_t n_frames;
|
||||
fread(&n_frames, 4, 1, fin);
|
||||
fprintf(stderr, "Frames: %d\n", n_frames);
|
||||
|
||||
const int N_EMBD = 1024;
|
||||
float* inputs = (float*)malloc(n_frames * 2 * N_EMBD * sizeof(float));
|
||||
fread(inputs, sizeof(float), n_frames * 2 * N_EMBD, fin);
|
||||
fclose(fin);
|
||||
|
||||
// Connect to socket
|
||||
int sock = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
if (sock < 0) { perror("socket"); return 1; }
|
||||
|
||||
struct sockaddr_un addr = {};
|
||||
addr.sun_family = AF_UNIX;
|
||||
strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1);
|
||||
|
||||
if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
|
||||
perror("connect");
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "Connected to %s\n", sock_path);
|
||||
|
||||
// Process frames
|
||||
FILE* fout = fopen(output_path, "wb");
|
||||
fwrite(&n_frames, 4, 1, fout);
|
||||
|
||||
float total_ms = 0;
|
||||
for (int i = 0; i < n_frames; i++) {
|
||||
float* frame = inputs + i * 2 * N_EMBD;
|
||||
|
||||
// Send 8192 bytes
|
||||
if (!write_exact(sock, frame, 2 * N_EMBD * sizeof(float))) {
|
||||
fprintf(stderr, "Write failed at frame %d\n", i);
|
||||
break;
|
||||
}
|
||||
|
||||
// Read 64 bytes: 15 ints + 1 float
|
||||
int32_t codes[15];
|
||||
float timing;
|
||||
if (!read_exact(sock, codes, sizeof(codes))) {
|
||||
fprintf(stderr, "Read codes failed at frame %d\n", i);
|
||||
break;
|
||||
}
|
||||
if (!read_exact(sock, &timing, sizeof(timing))) {
|
||||
fprintf(stderr, "Read timing failed at frame %d\n", i);
|
||||
break;
|
||||
}
|
||||
|
||||
fwrite(codes, sizeof(int32_t), 15, fout);
|
||||
fwrite(&timing, sizeof(float), 1, fout);
|
||||
total_ms += timing;
|
||||
|
||||
fprintf(stderr, " Frame %d: %.1fms codes=[%d,%d,%d,...]\n",
|
||||
i, timing, codes[0], codes[1], codes[2]);
|
||||
}
|
||||
|
||||
fclose(fout);
|
||||
close(sock);
|
||||
free(inputs);
|
||||
|
||||
fprintf(stderr, "Done! Total: %.0fms (%.1fms/frame)\n", total_ms, total_ms / n_frames);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,367 @@
|
|||
/**
|
||||
* Native TTS pipeline: talker + CP autoregressive loop in C++.
|
||||
* One JNI call runs the entire generation → returns all codebook codes.
|
||||
*/
|
||||
#include <jni.h>
|
||||
#include <arm_neon.h>
|
||||
#include <android/log.h>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
|
||||
#include <executorch/extension/data_loader/file_data_loader.h>
|
||||
#include <executorch/extension/runner_util/inputs.h>
|
||||
#include <executorch/runtime/executor/method.h>
|
||||
#include <executorch/runtime/executor/program.h>
|
||||
#include <executorch/runtime/platform/runtime.h>
|
||||
|
||||
#define TAG "TtsPipeline"
|
||||
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
|
||||
|
||||
using executorch::runtime::Error;
|
||||
using executorch::runtime::EValue;
|
||||
using executorch::runtime::HierarchicalAllocator;
|
||||
using executorch::runtime::MemoryAllocator;
|
||||
using executorch::runtime::MemoryManager;
|
||||
using executorch::runtime::Method;
|
||||
using executorch::runtime::Program;
|
||||
using executorch::runtime::Span;
|
||||
|
||||
static const int DIM=1024, VOCAB=3072, CB_SIZE=2048, NUM_CB=16;
|
||||
static const int T_L=28, T_KV=8, T_HD=128, T_KV_LEN=100;
|
||||
static const int C_L=5, C_KV=8, C_HD=128, C_KV_LEN=16;
|
||||
static const int CODEC_EOS=2150;
|
||||
|
||||
static inline float dot_neon(const float* a, const float* b, int n) {
|
||||
float32x4_t s0=vdupq_n_f32(0),s1=vdupq_n_f32(0),s2=vdupq_n_f32(0),s3=vdupq_n_f32(0);
|
||||
int i=0;
|
||||
for(;i+15<n;i+=16){
|
||||
s0=vfmaq_f32(s0,vld1q_f32(a+i),vld1q_f32(b+i));
|
||||
s1=vfmaq_f32(s1,vld1q_f32(a+i+4),vld1q_f32(b+i+4));
|
||||
s2=vfmaq_f32(s2,vld1q_f32(a+i+8),vld1q_f32(b+i+8));
|
||||
s3=vfmaq_f32(s3,vld1q_f32(a+i+12),vld1q_f32(b+i+12));
|
||||
}
|
||||
float r=vaddvq_f32(vaddq_f32(vaddq_f32(s0,s1),vaddq_f32(s2,s3)));
|
||||
for(;i<n;i++) r+=a[i]*b[i];
|
||||
return r;
|
||||
}
|
||||
|
||||
static int argmax_head(const float*h,const float*W,int vocab,int dim){
|
||||
int best=0;float bv=-FLT_MAX;
|
||||
for(int j=0;j<vocab;j++){float d=dot_neon(h,W+j*dim,dim);if(d>bv){bv=d;best=j;}}
|
||||
return best;
|
||||
}
|
||||
|
||||
// Top-k sampling with temperature
|
||||
static int sample_topk(const float* logits, int vocab, float temp, int k) {
|
||||
// Find top-k
|
||||
struct IV { int i; float v; };
|
||||
std::vector<IV> topk(k, {0, -FLT_MAX});
|
||||
for (int i = 0; i < vocab; i++) {
|
||||
if (logits[i] > topk[k-1].v) {
|
||||
topk[k-1] = {i, logits[i]};
|
||||
// Bubble up
|
||||
for (int j = k-2; j >= 0; j--) {
|
||||
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Softmax with temperature
|
||||
float maxv = topk[0].v;
|
||||
float sum = 0;
|
||||
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
|
||||
// Sample
|
||||
float r = (float)rand() / RAND_MAX * sum;
|
||||
float acc = 0;
|
||||
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
|
||||
return topk[0].i;
|
||||
}
|
||||
|
||||
struct PipelineState {
|
||||
std::unique_ptr<executorch::extension::FileDataLoader> tLoader, cLoader;
|
||||
std::unique_ptr<Program> tProg, cProg;
|
||||
std::unique_ptr<MemoryManager> tMM, cMM;
|
||||
Method* talker = nullptr;
|
||||
Method* cp = nullptr;
|
||||
std::vector<std::unique_ptr<uint8_t[]>> tBufs, cBufs;
|
||||
bool loaded = false;
|
||||
};
|
||||
static PipelineState* gState = nullptr;
|
||||
static uint8_t tMethodPool[8*1024*1024], tTempPool[2*1024*1024];
|
||||
static uint8_t cMethodPool[4*1024*1024], cTempPool[1*1024*1024];
|
||||
|
||||
static Method* loadModel(const char* path,
|
||||
std::unique_ptr<executorch::extension::FileDataLoader>& loader,
|
||||
std::unique_ptr<Program>& program,
|
||||
std::unique_ptr<MemoryManager>& mm,
|
||||
std::vector<std::unique_ptr<uint8_t[]>>& bufs,
|
||||
uint8_t* mp, size_t mps, uint8_t* tp, size_t tps)
|
||||
{
|
||||
auto ld = executorch::extension::FileDataLoader::from(path);
|
||||
if(!ld.ok()) return nullptr;
|
||||
loader=std::make_unique<executorch::extension::FileDataLoader>(std::move(ld.get()));
|
||||
auto prog=Program::load(&*loader); if(!prog.ok()) return nullptr;
|
||||
program=std::make_unique<Program>(std::move(prog.get()));
|
||||
auto meta=program->method_meta("forward"); if(!meta.ok()) return nullptr;
|
||||
std::vector<Span<uint8_t>> spans;
|
||||
for(size_t i=0;i<meta->num_memory_planned_buffers();i++){
|
||||
size_t sz=(size_t)meta->memory_planned_buffer_size(i).get();
|
||||
bufs.push_back(std::make_unique<uint8_t[]>(sz));
|
||||
spans.push_back({bufs.back().get(),sz});
|
||||
}
|
||||
auto*ma=new MemoryAllocator(mps,mp);
|
||||
auto*ta=new MemoryAllocator(tps,tp);
|
||||
auto*ha=new HierarchicalAllocator({spans.data(),spans.size()});
|
||||
mm=std::unique_ptr<MemoryManager>(new MemoryManager(ma,ha,ta));
|
||||
auto method=program->load_method("forward",mm.get());
|
||||
if(!method.ok()) return nullptr;
|
||||
return new Method(std::move(method.get()));
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
JNIEXPORT jboolean JNICALL
|
||||
Java_com_kazeia_tts_TtsPipeline_nativeInit(JNIEnv*env,jclass,jstring jTP,jstring jCP){
|
||||
executorch::runtime::runtime_init();
|
||||
if(gState&&gState->loaded) return JNI_TRUE;
|
||||
const char*tp=env->GetStringUTFChars(jTP,nullptr);
|
||||
const char*cp=env->GetStringUTFChars(jCP,nullptr);
|
||||
gState=new PipelineState();
|
||||
LOGI("Loading talker+CP...");
|
||||
auto t0=std::chrono::high_resolution_clock::now();
|
||||
gState->talker=loadModel(tp,gState->tLoader,gState->tProg,gState->tMM,gState->tBufs,tMethodPool,sizeof(tMethodPool),tTempPool,sizeof(tTempPool));
|
||||
gState->cp=loadModel(cp,gState->cLoader,gState->cProg,gState->cMM,gState->cBufs,cMethodPool,sizeof(cMethodPool),cTempPool,sizeof(cTempPool));
|
||||
env->ReleaseStringUTFChars(jTP,tp); env->ReleaseStringUTFChars(jCP,cp);
|
||||
if(!gState->talker||!gState->cp){LOGI("Load failed");delete gState;gState=nullptr;return JNI_FALSE;}
|
||||
gState->loaded=true;
|
||||
// Warmup both
|
||||
{auto p=executorch::extension::prepare_input_tensors(*gState->talker);if(p.ok())gState->talker->execute();}
|
||||
{auto p=executorch::extension::prepare_input_tensors(*gState->cp);if(p.ok())gState->cp->execute();}
|
||||
auto t1=std::chrono::high_resolution_clock::now();
|
||||
LOGI("Loaded+warmup: %.0fms",std::chrono::duration<float,std::milli>(t1-t0).count());
|
||||
return JNI_TRUE;
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_kazeia_tts_TtsPipeline_nativeDestroy(JNIEnv*,jclass){
|
||||
if(gState){delete gState->talker;delete gState->cp;delete gState;gState=nullptr;}
|
||||
}
|
||||
|
||||
// Helper: run one talker step
|
||||
static void talkerStep(Method&m, const float*emb, float*mask, int pos,
|
||||
const float*tCos, const float*tSin, float*tK, float*tV,
|
||||
float*outHidden, float*outLogits)
|
||||
{
|
||||
int kvElem = T_KV * T_KV_LEN * T_HD;
|
||||
auto prep = executorch::extension::prepare_input_tensors(m);
|
||||
memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
|
||||
memcpy(m.mutable_input(1).toTensor().mutable_data_ptr<float>(), mask, T_KV_LEN*4);
|
||||
int pi = std::min(pos, 249);
|
||||
memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), tCos+pi*T_HD, T_HD*4);
|
||||
memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), tSin+pi*T_HD, T_HD*4);
|
||||
for(int i=0;i<T_L;i++){
|
||||
memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), tK+i*kvElem, kvElem*4);
|
||||
memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), tV+i*kvElem, kvElem*4);
|
||||
}
|
||||
m.execute();
|
||||
memcpy(outHidden, m.get_output(0).toTensor().const_data_ptr<float>(), DIM*4);
|
||||
memcpy(outLogits, m.get_output(1).toTensor().const_data_ptr<float>(), VOCAB*4);
|
||||
for(int i=0;i<T_L;i++){
|
||||
memcpy(tK+i*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
|
||||
memcpy(tV+i*kvElem, m.get_output(3+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper: run full CP (17 steps) → 15 codes
|
||||
static void cpForward(Method&m, const float*hidden, int cb0,
|
||||
const float*codecEmb, const float*cpEmbs, const float*cpHeads,
|
||||
const float*cCos, const float*cSin, int*codes)
|
||||
{
|
||||
int kvElem = C_KV * C_KV_LEN * C_HD;
|
||||
std::vector<float> kv(C_L*2*kvElem, 0.0f);
|
||||
|
||||
for(int step=0;step<17;step++){
|
||||
const float*emb;
|
||||
if(step==0) emb=hidden;
|
||||
else if(step==1) emb=codecEmb + std::min(std::max(cb0,0),VOCAB-1)*DIM;
|
||||
else emb=cpEmbs + ((step-2)*CB_SIZE + std::min(std::max(codes[step-2],0),CB_SIZE-1))*DIM;
|
||||
|
||||
auto prep=executorch::extension::prepare_input_tensors(m);
|
||||
memcpy(m.mutable_input(0).toTensor().mutable_data_ptr<float>(), emb, DIM*4);
|
||||
// Mask
|
||||
float*mp=m.mutable_input(1).toTensor().mutable_data_ptr<float>();
|
||||
for(int p=0;p<C_KV_LEN;p++) mp[p]=(p>=C_KV_LEN-1-step)?0.0f:-1e9f;
|
||||
memcpy(m.mutable_input(2).toTensor().mutable_data_ptr<float>(), cCos+step*C_HD, C_HD*4);
|
||||
memcpy(m.mutable_input(3).toTensor().mutable_data_ptr<float>(), cSin+step*C_HD, C_HD*4);
|
||||
for(int i=0;i<C_L;i++){
|
||||
memcpy(m.mutable_input(4+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2)*kvElem, kvElem*4);
|
||||
memcpy(m.mutable_input(5+i*2).toTensor().mutable_data_ptr<float>(), kv.data()+(i*2+1)*kvElem, kvElem*4);
|
||||
}
|
||||
m.execute();
|
||||
const float*h=m.get_output(0).toTensor().const_data_ptr<float>();
|
||||
if(step>=1&&step-1<15){
|
||||
codes[step-1]=argmax_head(h, cpHeads+(step-1)*CB_SIZE*DIM, CB_SIZE, DIM);
|
||||
}
|
||||
for(int i=0;i<C_L;i++){
|
||||
memcpy(kv.data()+(i*2)*kvElem, m.get_output(1+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
|
||||
memcpy(kv.data()+(i*2+1)*kvElem, m.get_output(2+i*2).toTensor().const_data_ptr<float>(), kvElem*4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
JNIEXPORT jintArray JNICALL
|
||||
Java_com_kazeia_tts_TtsPipeline_nativeRun(
|
||||
JNIEnv*env,jclass,
|
||||
jfloatArray jPrefill,jint nPrefill,
|
||||
jfloatArray jTrailing,jint nTrailing,
|
||||
jfloatArray jCodecEmb, jfloatArray jCpEmbs, jfloatArray jCpHeads,
|
||||
jfloatArray jTCos,jfloatArray jTSin, jfloatArray jCCos,jfloatArray jCSin,
|
||||
jint maxTokens)
|
||||
{
|
||||
if(!gState||!gState->loaded) return nullptr;
|
||||
auto T0=std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Copy all data from JNI (then release immediately)
|
||||
int prefillSize = env->GetArrayLength(jPrefill);
|
||||
std::vector<float> prefill(prefillSize);
|
||||
env->GetFloatArrayRegion(jPrefill, 0, prefillSize, prefill.data());
|
||||
|
||||
std::vector<float> trailingData;
|
||||
if(nTrailing>0){trailingData.resize(nTrailing*DIM);env->GetFloatArrayRegion(jTrailing,0,nTrailing*DIM,trailingData.data());}
|
||||
|
||||
int codecSize=env->GetArrayLength(jCodecEmb);
|
||||
std::vector<float> codecEmb(codecSize);
|
||||
env->GetFloatArrayRegion(jCodecEmb,0,codecSize,codecEmb.data());
|
||||
|
||||
int cpEmbsSize=env->GetArrayLength(jCpEmbs);
|
||||
std::vector<float> cpEmbs(cpEmbsSize);
|
||||
env->GetFloatArrayRegion(jCpEmbs,0,cpEmbsSize,cpEmbs.data());
|
||||
|
||||
int headsSize=env->GetArrayLength(jCpHeads);
|
||||
std::vector<float> cpHeads(headsSize);
|
||||
env->GetFloatArrayRegion(jCpHeads,0,headsSize,cpHeads.data());
|
||||
|
||||
int tcSize=env->GetArrayLength(jTCos);
|
||||
std::vector<float> tCos(tcSize),tSin(tcSize);
|
||||
env->GetFloatArrayRegion(jTCos,0,tcSize,tCos.data());
|
||||
env->GetFloatArrayRegion(jTSin,0,tcSize,tSin.data());
|
||||
|
||||
int ccSize=env->GetArrayLength(jCCos);
|
||||
std::vector<float> cCos(ccSize),cSin(ccSize);
|
||||
env->GetFloatArrayRegion(jCCos,0,ccSize,cCos.data());
|
||||
env->GetFloatArrayRegion(jCSin,0,ccSize,cSin.data());
|
||||
|
||||
// Pipeline state
|
||||
int tkvElem=T_KV*T_KV_LEN*T_HD;
|
||||
std::vector<float> tK(T_L*tkvElem,0), tV(T_L*tkvElem,0);
|
||||
float mask[T_KV_LEN]; for(int i=0;i<T_KV_LEN;i++) mask[i]=-1e9f;
|
||||
float hidden[DIM]={}, logits[VOCAB]={};
|
||||
|
||||
std::vector<int> allCodes; // flat: numTokens × 16
|
||||
std::vector<int> cb0History;
|
||||
int pos=0, currentCb0=-1;
|
||||
|
||||
// ===== PREFILL =====
|
||||
auto tP0=std::chrono::high_resolution_clock::now();
|
||||
for(int step=0;step<nPrefill;step++){
|
||||
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
||||
if(mi>=0) mask[mi]=0.0f;
|
||||
talkerStep(*gState->talker, prefill.data()+step*DIM, mask, pos, tCos.data(),tSin.data(),
|
||||
tK.data(), tV.data(), hidden, logits);
|
||||
pos++;
|
||||
if(step==nPrefill-1){
|
||||
for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
|
||||
currentCb0=sample_topk(logits,VOCAB,0.9f,50);
|
||||
}
|
||||
}
|
||||
auto tP1=std::chrono::high_resolution_clock::now();
|
||||
LOGI("Prefill: %.0fms, %d steps, cb0=%d",
|
||||
std::chrono::duration<float,std::milli>(tP1-tP0).count(), nPrefill, currentCb0);
|
||||
|
||||
if(currentCb0<0||currentCb0==CODEC_EOS){return env->NewIntArray(0);}
|
||||
|
||||
// ===== GENERATION =====
|
||||
float totalTalkerMs=0, totalCpMs=0;
|
||||
int trailingIdx=0;
|
||||
// Pad embedding (zeros + pad token is not available here, use zeros)
|
||||
float padEmb[DIM]={}; // In practice should be tts_pad_embed, passed as param
|
||||
|
||||
for(int gen=0;gen<maxTokens;gen++){
|
||||
int codes[NUM_CB]={}; codes[0]=currentCb0;
|
||||
|
||||
// CP
|
||||
auto tc0=std::chrono::high_resolution_clock::now();
|
||||
int cpCodes[15]={};
|
||||
cpForward(*gState->cp, hidden, currentCb0, codecEmb.data(), cpEmbs.data(), cpHeads.data(),
|
||||
cCos.data(), cSin.data(), cpCodes);
|
||||
auto tc1=std::chrono::high_resolution_clock::now();
|
||||
totalCpMs+=std::chrono::duration<float,std::milli>(tc1-tc0).count();
|
||||
for(int i=0;i<15;i++) codes[i+1]=cpCodes[i];
|
||||
|
||||
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
|
||||
cb0History.push_back(currentCb0);
|
||||
|
||||
// Build next talker input: sum codec embeddings
|
||||
float nextEmb[DIM]={};
|
||||
// cb0 embedding
|
||||
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
|
||||
// cb1-15 embeddings
|
||||
for(int cb=0;cb<15;cb++){
|
||||
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
|
||||
}
|
||||
// Add trailing text or pad
|
||||
if(trailingIdx<nTrailing){
|
||||
const float*te=trailingData.data()+trailingIdx*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
|
||||
trailingIdx++;
|
||||
}
|
||||
// (pad embedding = zeros, already added implicitly)
|
||||
|
||||
// Talker step
|
||||
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
||||
if(mi>=0) mask[mi]=0.0f;
|
||||
auto tt0=std::chrono::high_resolution_clock::now();
|
||||
talkerStep(*gState->talker, nextEmb, mask, pos, tCos.data(),tSin.data(),
|
||||
tK.data(), tV.data(), hidden, logits);
|
||||
auto tt1=std::chrono::high_resolution_clock::now();
|
||||
totalTalkerMs+=std::chrono::duration<float,std::milli>(tt1-tt0).count();
|
||||
pos++;
|
||||
|
||||
// Sample next cb0 (suppress non-codec, repetition penalty)
|
||||
for(int j=CB_SIZE;j<VOCAB;j++) if(j!=CODEC_EOS) logits[j]=-FLT_MAX;
|
||||
std::unordered_set<int> seen(cb0History.begin(),cb0History.end());
|
||||
for(int tok:seen) logits[tok]=(logits[tok]>0)?logits[tok]/1.05f:logits[tok]*1.05f;
|
||||
int nextCb0=sample_topk(logits,VOCAB,0.9f,50);
|
||||
|
||||
if(nextCb0==CODEC_EOS){LOGI("EOS at step %d",gen+2);break;}
|
||||
// Degeneration check
|
||||
int histSz=(int)cb0History.size();
|
||||
if(histSz>=9){
|
||||
bool degen=true;
|
||||
for(int i=histSz-9;i<histSz;i++) if(cb0History[i]!=nextCb0){degen=false;break;}
|
||||
if(degen){LOGI("Degeneration at step %d",gen+2);break;}
|
||||
}
|
||||
currentCb0=nextCb0;
|
||||
}
|
||||
|
||||
int nTokens=(int)allCodes.size()/NUM_CB;
|
||||
auto T1=std::chrono::high_resolution_clock::now();
|
||||
LOGI("Generated %d tokens | Talker: %.0fms (%.0fms/step) | CP: %.0fms (%.0fms/step) | Total: %.0fms",
|
||||
nTokens, totalTalkerMs, totalTalkerMs/std::max(nTokens,1),
|
||||
totalCpMs, totalCpMs/std::max(nTokens,1),
|
||||
std::chrono::duration<float,std::milli>(T1-T0).count());
|
||||
|
||||
jintArray result=env->NewIntArray((int)allCodes.size());
|
||||
env->SetIntArrayRegion(result,0,(int)allCodes.size(),allCodes.data());
|
||||
return result;
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
|
@ -130,6 +130,7 @@ class Qwen3TtsEngine(
|
|||
private var cpAllHeads: FloatArray? = null // all 15 heads concatenated [15*2048*1024] for batch NEON
|
||||
private var cpPteModule: org.pytorch.executorch.Module? = null // ExecuTorch CP on NPU (JNI)
|
||||
private var talkerPteModule: org.pytorch.executorch.Module? = null // ExecuTorch talker on NPU (JNI)
|
||||
private var nativePipelineReady: Boolean = false // C++ native pipeline available
|
||||
private var talkerPteRotaryCos: FloatArray? = null
|
||||
private var talkerPteRotarySin: FloatArray? = null
|
||||
private var useEtCp: Boolean = false // CP via ExecuTorch runner process (root)
|
||||
|
|
@ -307,6 +308,19 @@ class Qwen3TtsEngine(
|
|||
cpPteModule!!.forward(*cIns.toTypedArray())
|
||||
nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
|
||||
} catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
|
||||
|
||||
// Init native C++ pipeline (loads models with own ExecuTorch runtime)
|
||||
try {
|
||||
val tn = System.currentTimeMillis()
|
||||
nativePipelineReady = TtsPipeline.nativeInit(
|
||||
"/data/local/tmp/kazeia/models/talker_transformer_fp16.pte",
|
||||
"/data/local/tmp/kazeia/models/cp_transformer_fp16.pte"
|
||||
)
|
||||
nlog("Native pipeline: ${if (nativePipelineReady) "OK" else "FAILED"} (${System.currentTimeMillis() - tn}ms)")
|
||||
} catch (e: Exception) {
|
||||
nlog("Native pipeline init failed: ${e.message}")
|
||||
nativePipelineReady = false
|
||||
}
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
nlog("Talker .pte JNI failed: ${e.message}")
|
||||
|
|
@ -2272,23 +2286,47 @@ class Qwen3TtsEngine(
|
|||
val embeds = Array(nTotal) { FloatArray(TALKER_DIM).also { arr -> for (j in 0 until TALKER_DIM) arr[j] = bb.float } }
|
||||
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
|
||||
|
||||
// Build textEmbedsList: first nPrefill are "prefill", rest are trailing
|
||||
val textEmbedsList = embeds.toList()
|
||||
val allCodesArray = runInterleavedPte(textEmbedsList.subList(0, 1), maxGenTokens = nTotal - nPrefill)
|
||||
// Note: runInterleavedPte handles prefill internally via buildPrefillEmbeddings
|
||||
val allCodes: Array<IntArray>
|
||||
if (nativePipelineReady) {
|
||||
// Native C++ pipeline — zero Java overhead
|
||||
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
|
||||
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
|
||||
val nTrailing = nTotal - nPrefill
|
||||
val trailingFlat = if (nTrailing > 0) FloatArray(nTrailing * TALKER_DIM).also { arr ->
|
||||
for (i in 0 until nTrailing) System.arraycopy(embeds[nPrefill + i], 0, arr, i * TALKER_DIM, TALKER_DIM)
|
||||
} else null
|
||||
|
||||
// Actually, generateFromEmbeds uses pre-computed embeds directly.
|
||||
// Let's use runInterleavedPte properly by passing all embeds as textEmbedsList
|
||||
// runInterleavedPte's buildPrefillEmbeddings will create the prefill from first embed
|
||||
// But we need a different approach: pass all embeds directly.
|
||||
// Load CP heads if not already
|
||||
if (cpAllHeads == null) {
|
||||
val headsFile = java.io.File("/data/local/tmp/kazeia/models/cp_heads.bin")
|
||||
if (headsFile.exists()) {
|
||||
val hb = headsFile.readBytes()
|
||||
cpAllHeads = FloatArray(hb.size / 4)
|
||||
ByteBuffer.wrap(hb).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(cpAllHeads!!)
|
||||
}
|
||||
}
|
||||
|
||||
// Simpler: use the run_pipeline code path which goes through generateSpeech → runInterleavedGeneration
|
||||
// For now, let's just call runInterleavedPte with the right embeds structure
|
||||
val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList()
|
||||
val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList()
|
||||
|
||||
// Call the PTE pipeline directly
|
||||
val allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
|
||||
nlog("Running native C++ pipeline...")
|
||||
val flat = TtsPipeline.nativeRun(
|
||||
prefillFlat, nPrefill,
|
||||
trailingFlat, nTrailing,
|
||||
codecEmbedding ?: FloatArray(0),
|
||||
cpEmbeddings ?: FloatArray(0),
|
||||
cpAllHeads ?: FloatArray(0),
|
||||
talkerPteRotaryCos ?: FloatArray(0), talkerPteRotarySin ?: FloatArray(0),
|
||||
cpRotaryCos ?: FloatArray(0), cpRotarySin ?: FloatArray(0),
|
||||
nTotal - nPrefill
|
||||
)
|
||||
if (flat == null || flat.isEmpty()) return ShortArray(0)
|
||||
val nTokens = flat.size / NUM_CODEBOOKS
|
||||
allCodes = Array(nTokens) { t -> IntArray(NUM_CODEBOOKS) { cb -> flat[t * NUM_CODEBOOKS + cb] } }
|
||||
nlog("Native pipeline: $nTokens tokens")
|
||||
} else {
|
||||
// Fallback: Java pipeline
|
||||
val prefillEmbeds = embeds.sliceArray(0 until nPrefill).toList()
|
||||
val trailingEmbeds = if (nPrefill < nTotal) embeds.sliceArray(nPrefill until nTotal).toList() else emptyList()
|
||||
allCodes = runInterleavedPteFromEmbeds(prefillEmbeds, trailingEmbeds, nTotal - nPrefill)
|
||||
}
|
||||
|
||||
if (allCodes.isEmpty()) return ShortArray(0)
|
||||
val numRealTokens = allCodes.size
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
package com.kazeia.tts
|
||||
|
||||
/** Native C++ TTS pipeline: talker + CP loop with zero Java overhead. */
|
||||
object TtsPipeline {
|
||||
init { System.loadLibrary("tts_pipeline") }
|
||||
|
||||
/** Load and warmup both .pte models. Returns true on success. */
|
||||
external fun nativeInit(talkerPath: String, cpPath: String): Boolean
|
||||
|
||||
/** Release native resources. */
|
||||
external fun nativeDestroy()
|
||||
|
||||
/**
|
||||
* Run full pipeline. Returns flat int array: [numTokens × 16] codebook codes.
|
||||
* Each group of 16 ints = [CB0, CB1, ..., CB15] for one time step.
|
||||
*/
|
||||
external fun nativeRun(
|
||||
prefillEmbeds: FloatArray, nPrefill: Int,
|
||||
trailingEmbeds: FloatArray?, nTrailing: Int,
|
||||
codecEmbedding: FloatArray,
|
||||
cpEmbeddings: FloatArray,
|
||||
cpHeads: FloatArray,
|
||||
talkerCos: FloatArray, talkerSin: FloatArray,
|
||||
cpCos: FloatArray, cpSin: FloatArray,
|
||||
maxTokens: Int
|
||||
): IntArray?
|
||||
}
|
||||
|
|
@ -37,6 +37,10 @@ target_include_directories(whisper_jni PRIVATE
|
|||
target_link_libraries(whisper_jni whisper ggml ggml-base ggml-cpu android log)
|
||||
target_compile_options(whisper_jni PRIVATE -std=c++17 -O2)
|
||||
|
||||
# --- TTS Pipeline: built externally via ExecuTorch cmake, copied to jniLibs ---
|
||||
# Build with: cd /opt/Kazeia/executorch/build-android && cmake --build . --target tts_pipeline_jni -j$(nproc)
|
||||
# Then copy to jniLibs/arm64-v8a/libtts_pipeline.so
|
||||
|
||||
# --- NEON optimized ops for TTS heads ---
|
||||
add_library(neon_ops SHARED neon_ops.cpp)
|
||||
target_link_libraries(neon_ops log)
|
||||
|
|
|
|||
Loading…
Reference in New Issue