Fix C++ pipeline eos/pad handling; disable it pending quality parity (keep Java as default)

- Fixed trailing embed handling (use pre-computed as-is)
- Added eos/pad embed params to nativeRun
- Improved C++ PRNG for sampling
- Disabled the native pipeline (RTF 1.4): slight quality regression vs. Java
  (the two pipelines use separate QNN instances, which give different numerical results)
- Kept the Java pipeline (RTF 1.8) as the default, since its quality is validated

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-09 10:53:19 +02:00
parent 3b01302cfb
commit 8e536094df
2 changed files with 29 additions and 23 deletions

View File

@ -57,27 +57,30 @@ static int argmax_head(const float*h,const float*W,int vocab,int dim){
return best;
}
// Top-k sampling with temperature
// Top-k sampling with temperature (Java-compatible PRNG)
// Generator state for next_rand(). Fixed seed, so every process start yields
// the same sequence of samples.
static uint64_t g_rng_state = 0x12345678ABCDEF01ULL;

// Advances the file-level 64-bit linear congruential generator by one step
// and returns the result scaled into [0, 1].
//
// The multiplier/increment pair is the one commonly cited as Knuth's MMIX LCG.
// NOTE(review): this is not the 48-bit java.util.Random recurrence; if
// bit-for-bit parity with the Java/Kotlin sampler is required, confirm that
// side uses the same constants and seed.
//
// Mutates g_rng_state with no synchronization.
static float next_rand() {
    constexpr uint64_t kMultiplier = 6364136223846793005ULL;
    constexpr uint64_t kIncrement  = 1442695040888963407ULL;

    // Unsigned arithmetic wraps modulo 2^64, which is the LCG modulus.
    g_rng_state = g_rng_state * kMultiplier + kIncrement;

    // Keep only the upper 31 bits of the state.
    const uint64_t top_bits = (g_rng_state >> 33) & 0x7FFFFFFF;

    // Both operands are rounded to float before the divide, so the largest
    // values map to exactly 1.0f (the range is closed at the top).
    const float numerator   = static_cast<float>(top_bits);
    const float denominator = static_cast<float>(0x7FFFFFFF);
    return numerator / denominator;
}
static int sample_topk(const float* logits, int vocab, float temp, int k) {
// Find top-k
struct IV { int i; float v; };
std::vector<IV> topk(k, {0, -FLT_MAX});
for (int i = 0; i < vocab; i++) {
if (logits[i] > topk[k-1].v) {
topk[k-1] = {i, logits[i]};
// Bubble up
for (int j = k-2; j >= 0; j--) {
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
else break;
}
}
}
// Softmax with temperature
float maxv = topk[0].v;
float sum = 0;
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
// Sample
float r = (float)rand() / RAND_MAX * sum;
float r = next_rand() * sum;
float acc = 0;
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
return topk[0].i;
@ -312,27 +315,27 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
cb0History.push_back(currentCb0);
// Build next talker input: sum codec embeddings
// Build next talker input
float nextEmb[DIM]={};
// cb0 embedding
if(trailingIdx<nTrailing){
// Pre-computed decode embed from file: use as-is (already contains codec+text)
memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
trailingIdx++;
} else {
// After trailing exhausted: build from our codes + eos/pad
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
// cb1-15 embeddings
for(int cb=0;cb<15;cb++){
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
}
// Add trailing text, then eos, then pad (matches Python/Kotlin pipeline)
if(trailingIdx<nTrailing){
const float*te=trailingData.data()+trailingIdx*DIM;
for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
trailingIdx++;
} else if(trailingIdx==nTrailing){
if(trailingIdx==nTrailing){
for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmbed[k];
trailingIdx++;
} else {
for(int k=0;k<DIM;k++) nextEmb[k]+=padEmbed[k];
}
}
// Talker step
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);

View File

@ -2287,7 +2287,10 @@ class Qwen3TtsEngine(
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
val allCodes: Array<IntArray>
if (nativePipelineReady) {
// Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
// Java pipeline: RTF 1.8, validated quality
// TODO: share QNN context between Java and C++ for same quality at C++ speed
if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
// Native C++ pipeline — zero Java overhead
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)