Fix C++ pipeline eos/pad + disable for quality (keep Java default)
- Fixed trailing embed handling (use pre-computed embeds as-is)
- Added eos/pad embed params to nativeRun
- Improved C++ PRNG for sampling
- Disabled native pipeline: slight quality regression vs Java (two separate QNN instances give different numerical results)
- Java pipeline (RTF 1.8) kept as default for validated quality

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3b01302cfb
commit
8e536094df
|
|
@ -57,27 +57,30 @@ static int argmax_head(const float*h,const float*W,int vocab,int dim){
|
|||
return best;
|
||||
}
|
||||
|
||||
// Top-k sampling with temperature
|
||||
// Top-k sampling with temperature (deterministic fixed-seed PRNG; not java.util.Random-compatible)
|
||||
// State of the sampling PRNG used by next_rand(). The seed is a fixed
// constant, so the generated sequence is identical on every process start.
// This is a plain (non-atomic) global: concurrent callers would race on it.
static uint64_t g_rng_state = 0x12345678ABCDEF01ULL;

// Returns the next pseudo-random float, uniformly distributed in [0, 1).
//
// The generator is a 64-bit linear congruential generator using Knuth's MMIX
// multiplier and increment. It is NOT java.util.Random (a 48-bit LCG with
// multiplier 0x5DEECE66D), so it does not reproduce a Java random stream; it
// is only deterministic with respect to itself.
static float next_rand() {
    g_rng_state = g_rng_state * 6364136223846793005ULL + 1442695040888963407ULL;
    // Use the top 24 bits: the high bits of an LCG have the longest period,
    // and a 24-bit integer converts to float exactly, so the result can never
    // round up to 1.0f (which a 31-bit value divided by 2^31 could).
    return static_cast<float>(g_rng_state >> 40) * (1.0f / 16777216.0f);
}
|
||||
|
||||
static int sample_topk(const float* logits, int vocab, float temp, int k) {
|
||||
// Find top-k
|
||||
struct IV { int i; float v; };
|
||||
std::vector<IV> topk(k, {0, -FLT_MAX});
|
||||
for (int i = 0; i < vocab; i++) {
|
||||
if (logits[i] > topk[k-1].v) {
|
||||
topk[k-1] = {i, logits[i]};
|
||||
// Bubble up
|
||||
for (int j = k-2; j >= 0; j--) {
|
||||
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Softmax with temperature
|
||||
float maxv = topk[0].v;
|
||||
float sum = 0;
|
||||
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
|
||||
// Sample
|
||||
float r = (float)rand() / RAND_MAX * sum;
|
||||
float r = next_rand() * sum;
|
||||
float acc = 0;
|
||||
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
|
||||
return topk[0].i;
|
||||
|
|
@ -312,27 +315,27 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
|
|||
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
|
||||
cb0History.push_back(currentCb0);
|
||||
|
||||
// Build next talker input: sum codec embeddings
|
||||
// Build next talker input
|
||||
float nextEmb[DIM]={};
|
||||
// cb0 embedding
|
||||
if(trailingIdx<nTrailing){
|
||||
// Pre-computed decode embed from file: use as-is (already contains codec+text)
|
||||
memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
|
||||
trailingIdx++;
|
||||
} else {
|
||||
// After trailing exhausted: build from our codes + eos/pad
|
||||
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
|
||||
// cb1-15 embeddings
|
||||
for(int cb=0;cb<15;cb++){
|
||||
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
|
||||
}
|
||||
// Add trailing text, then eos, then pad (matches Python/Kotlin pipeline)
|
||||
if(trailingIdx<nTrailing){
|
||||
const float*te=trailingData.data()+trailingIdx*DIM;
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
|
||||
trailingIdx++;
|
||||
} else if(trailingIdx==nTrailing){
|
||||
if(trailingIdx==nTrailing){
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmbed[k];
|
||||
trailingIdx++;
|
||||
} else {
|
||||
for(int k=0;k<DIM;k++) nextEmb[k]+=padEmbed[k];
|
||||
}
|
||||
}
|
||||
|
||||
// Talker step
|
||||
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
||||
|
|
|
|||
|
|
@ -2287,7 +2287,10 @@ class Qwen3TtsEngine(
|
|||
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
|
||||
|
||||
val allCodes: Array<IntArray>
|
||||
if (nativePipelineReady) {
|
||||
// Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
|
||||
// Java pipeline: RTF 1.8, validated quality
|
||||
// TODO: share QNN context between Java and C++ for same quality at C++ speed
|
||||
if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
|
||||
// Native C++ pipeline — zero Java overhead
|
||||
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
|
||||
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
|
||||
|
|
|
|||
Loading…
Reference in New Issue