Fix C++ pipeline eos/pad handling; disable it for quality (keep Java as default)
- Fixed trailing embed handling (use the pre-computed embeds as-is)
- Added eos/pad embed params to nativeRun
- Improved C++ PRNG for sampling
- Disabled native pipeline: slight quality regression vs. Java (two separate QNN instances give different numerical results)
- Java pipeline (RTF 1.8) kept as default for validated quality

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3b01302cfb
commit
8e536094df
|
|
@ -57,27 +57,30 @@ static int argmax_head(const float*h,const float*W,int vocab,int dim){
|
||||||
return best;
|
return best;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Top-k sampling with temperature
|
// Top-k sampling with temperature (Java-compatible PRNG)
|
||||||
|
// State of the sampling PRNG. It is a file-local static with a fixed seed, so
// the sequence produced by next_rand() is the same on every process start.
// NOTE(review): no synchronization is visible here — presumably next_rand() is
// only called from a single thread; confirm against the callers.
static uint64_t g_rng_state = 0x12345678ABCDEF01ULL;
|
||||||
|
// Advances g_rng_state by one LCG step and returns a pseudo-random float.
// The result is in [0, 1]: the 31-bit integer and the divisor are both rounded
// to float (24-bit mantissa), so the largest draws can yield exactly 1.0f.
static float next_rand() {
|
||||||
|
// 64-bit linear congruential step (state * a + c, wrapping modulo 2^64),
// used so the sampled sequence is reproducible from the fixed seed.
// NOTE(review): the multiplier/increment are the constants of Knuth's MMIX
// LCG, not those of java.util.Random (48-bit, 0x5DEECE66D / 0xB) — confirm
// the Java pipeline uses this same recurrence before relying on
// "Java-compatible" output.
g_rng_state = g_rng_state * 6364136223846793005ULL + 1442695040888963407ULL;
|
||||||
|
// Keep the upper 31 bits of the state (the low bits of an LCG are the weakest);
// after the >> 33 the & 0x7FFFFFFF mask no longer changes the value.
return (float)((g_rng_state >> 33) & 0x7FFFFFFF) / (float)0x7FFFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
static int sample_topk(const float* logits, int vocab, float temp, int k) {
|
static int sample_topk(const float* logits, int vocab, float temp, int k) {
|
||||||
// Find top-k
|
|
||||||
struct IV { int i; float v; };
|
struct IV { int i; float v; };
|
||||||
std::vector<IV> topk(k, {0, -FLT_MAX});
|
std::vector<IV> topk(k, {0, -FLT_MAX});
|
||||||
for (int i = 0; i < vocab; i++) {
|
for (int i = 0; i < vocab; i++) {
|
||||||
if (logits[i] > topk[k-1].v) {
|
if (logits[i] > topk[k-1].v) {
|
||||||
topk[k-1] = {i, logits[i]};
|
topk[k-1] = {i, logits[i]};
|
||||||
// Bubble up
|
|
||||||
for (int j = k-2; j >= 0; j--) {
|
for (int j = k-2; j >= 0; j--) {
|
||||||
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
|
if (topk[j+1].v > topk[j].v) std::swap(topk[j], topk[j+1]);
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Softmax with temperature
|
|
||||||
float maxv = topk[0].v;
|
float maxv = topk[0].v;
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
|
for (auto& t : topk) { t.v = expf((t.v - maxv) / temp); sum += t.v; }
|
||||||
// Sample
|
float r = next_rand() * sum;
|
||||||
float r = (float)rand() / RAND_MAX * sum;
|
|
||||||
float acc = 0;
|
float acc = 0;
|
||||||
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
|
for (auto& t : topk) { acc += t.v; if (acc >= r) return t.i; }
|
||||||
return topk[0].i;
|
return topk[0].i;
|
||||||
|
|
@ -312,27 +315,27 @@ Java_com_kazeia_tts_TtsPipeline_nativeRun(
|
||||||
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
|
for(int i=0;i<NUM_CB;i++) allCodes.push_back(codes[i]);
|
||||||
cb0History.push_back(currentCb0);
|
cb0History.push_back(currentCb0);
|
||||||
|
|
||||||
// Build next talker input: sum codec embeddings
|
// Build next talker input
|
||||||
float nextEmb[DIM]={};
|
float nextEmb[DIM]={};
|
||||||
// cb0 embedding
|
if(trailingIdx<nTrailing){
|
||||||
|
// Pre-computed decode embed from file: use as-is (already contains codec+text)
|
||||||
|
memcpy(nextEmb, trailingData.data()+trailingIdx*DIM, DIM*4);
|
||||||
|
trailingIdx++;
|
||||||
|
} else {
|
||||||
|
// After trailing exhausted: build from our codes + eos/pad
|
||||||
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
|
const float*e0=codecEmb.data()+std::min(std::max(codes[0],0),VOCAB-1)*DIM;
|
||||||
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
|
for(int k=0;k<DIM;k++) nextEmb[k]+=e0[k];
|
||||||
// cb1-15 embeddings
|
|
||||||
for(int cb=0;cb<15;cb++){
|
for(int cb=0;cb<15;cb++){
|
||||||
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
|
const float*ec=cpEmbs.data()+((long)cb*CB_SIZE+std::min(std::max(codes[cb+1],0),CB_SIZE-1))*DIM;
|
||||||
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
|
for(int k=0;k<DIM;k++) nextEmb[k]+=ec[k];
|
||||||
}
|
}
|
||||||
// Add trailing text, then eos, then pad (matches Python/Kotlin pipeline)
|
if(trailingIdx==nTrailing){
|
||||||
if(trailingIdx<nTrailing){
|
|
||||||
const float*te=trailingData.data()+trailingIdx*DIM;
|
|
||||||
for(int k=0;k<DIM;k++) nextEmb[k]+=te[k];
|
|
||||||
trailingIdx++;
|
|
||||||
} else if(trailingIdx==nTrailing){
|
|
||||||
for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmbed[k];
|
for(int k=0;k<DIM;k++) nextEmb[k]+=eosEmbed[k];
|
||||||
trailingIdx++;
|
trailingIdx++;
|
||||||
} else {
|
} else {
|
||||||
for(int k=0;k<DIM;k++) nextEmb[k]+=padEmbed[k];
|
for(int k=0;k<DIM;k++) nextEmb[k]+=padEmbed[k];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Talker step
|
// Talker step
|
||||||
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
int mi=T_KV_LEN-1-std::min(pos,T_KV_LEN-1);
|
||||||
|
|
|
||||||
|
|
@ -2287,7 +2287,10 @@ class Qwen3TtsEngine(
|
||||||
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
|
nlog("Loaded $nTotal embeds ($nPrefill prefill + ${nTotal - nPrefill} decode)")
|
||||||
|
|
||||||
val allCodes: Array<IntArray>
|
val allCodes: Array<IntArray>
|
||||||
if (nativePipelineReady) {
|
// Native C++ pipeline: RTF 1.4 but slight quality loss vs Java (different QNN instance)
|
||||||
|
// Java pipeline: RTF 1.8, validated quality
|
||||||
|
// TODO: share QNN context between Java and C++ for same quality at C++ speed
|
||||||
|
if (nativePipelineReady && false) { // Disabled: quality regression, see TODO above
|
||||||
// Native C++ pipeline — zero Java overhead
|
// Native C++ pipeline — zero Java overhead
|
||||||
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
|
val prefillFlat = FloatArray(nPrefill * TALKER_DIM)
|
||||||
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
|
for (i in 0 until nPrefill) System.arraycopy(embeds[i], 0, prefillFlat, i * TALKER_DIM, TALKER_DIM)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue