Full native C++ TTS validated on short + long phrases
Dynamic formula: target_len = max(int(n_tokens × 3.2) + 5, 40) (calibrated).
- Short "Bonjour..." (18 tokens → 62 trailing): OK
- Long "Je suis Kazeia... difficiles" (30 tokens → 101 trailing): OK
RMS trim is disabled: the trailing garbage is loud, so it cannot be distinguished from speech by energy. Output length is controlled purely by maxTokens = trailing count.
Pipeline: prepare_tts_native.py "any text" → adb push → run → audio
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dafbe2a52b
commit
199bc4fbc9
|
|
@ -2606,41 +2606,9 @@ class Qwen3TtsEngine(
|
|||
return result
|
||||
}
|
||||
|
||||
/** Trim trailing garbage from audio by detecting RMS drop.
|
||||
* Scans forward, finds where RMS drops significantly → end of speech. */
|
||||
private fun trimTrailingSilence(audio: ShortArray): ShortArray {
|
||||
val windowSamples = SR / 10 // 100ms windows
|
||||
if (audio.size < windowSamples * 4) return audio
|
||||
|
||||
// Compute RMS per window
|
||||
val nWindows = audio.size / windowSamples
|
||||
val rmsValues = FloatArray(nWindows)
|
||||
for (w in 0 until nWindows) {
|
||||
var sum = 0.0
|
||||
for (i in 0 until windowSamples) {
|
||||
val s = audio[w * windowSamples + i].toFloat()
|
||||
sum += s * s
|
||||
}
|
||||
rmsValues[w] = Math.sqrt(sum / windowSamples).toFloat()
|
||||
}
|
||||
|
||||
// Find peak RMS in first half (speech region)
|
||||
val peakRms = rmsValues.take(nWindows / 2).maxOrNull() ?: return audio
|
||||
|
||||
// Scan from 60% onwards, find first window where RMS drops below 15% of peak
|
||||
// (speech ended, garbage/silence started)
|
||||
val threshold = peakRms * 0.15f
|
||||
var cutWindow = nWindows
|
||||
for (w in (nWindows * 3 / 5) until nWindows) {
|
||||
if (rmsValues[w] < threshold) {
|
||||
cutWindow = w + 1 // keep one more window for tail
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
val trimPoint = minOf(cutWindow * windowSamples, audio.size)
|
||||
return if (trimPoint < audio.size) audio.copyOf(trimPoint) else audio
|
||||
}
|
||||
/** No-op trim — garbage post-text has high energy, can't distinguish from speech.
|
||||
* Length is controlled by maxTokens = trailing count instead. */
|
||||
private fun trimTrailingSilence(audio: ShortArray): ShortArray = audio
|
||||
|
||||
/** Full pipeline using Hexagon talker + Hexagon CP from pre-computed embeddings. */
|
||||
private fun generateFromEmbedsHexagon(embedsPath: String): ShortArray {
|
||||
|
|
|
|||
|
|
@ -54,8 +54,7 @@ eos = sp[1].astype(np.float32)
|
|||
|
||||
# Build trailing: text[1:] + eos padding
|
||||
# Audio is ~3.2× longer than text tokens (calibrated; previously estimated at 3.5×). Pad with eos to ensure full coverage.
|
||||
AUDIO_RATIO = 3.5 # audio_tokens ≈ text_tokens × 3.5 (slightly generous to avoid cut)
|
||||
target_len = max(int(len(tokens) * AUDIO_RATIO), 50) # minimum 50 for short phrases
|
||||
target_len = max(int(len(tokens) * 3.2) + 5, 40) # calibrated: 3.2× + 5 buffer
|
||||
|
||||
trailing = [proj[i] for i in range(1, len(proj))] # text[1:]
|
||||
while len(trailing) < target_len:
|
||||
|
|
|
|||
Loading…
Reference in New Issue