UI+TTS: voice hot-swap + typing dots + emoji stripping
Three tightly-coupled UX fixes the user flagged during live testing.
**Voice hot-swap (Qwen3TtsEngine.setVoice)**: previously a no-op
stub — the spinner callback updated the orb color but the actual
audio kept using Damien's cached prefix/suffix embeddings. Now we
derive the voice id from the WAV basename (elodie.wav → 'elodie'),
look up `<id>_voice_prefix.bin` + `<id>_voice_suffix.bin` in the
model dir, parse their headers, and atomically replace the embedding
arrays so the NEXT synthesized segment uses the new voice. If the
files aren't present we log a clear warning pointing at
prepare_tts_native.py — the hot-swap is wired, but per-voice prefix/
suffix still need to be generated offline and adb-pushed.
KazeiaService.setVoice now forwards to Qwen3TtsEngine in addition to
the Chatterbox branch.
**Emoji stripping**: the model loves closing on "😊" and it was
reaching TTS as a standalone segment that synthesized a fraction of
a second of junk. KazeiaPipeline.speakText now runs each sentence
through stripNonSpeakable before enqueueing — drops Unicode emoji /
dingbat / pictograph / flag blocks plus variation selectors and
zero-width joiners, then trims. Empty-after-strip sentences are
skipped entirely. The chat bubble still shows the original text
(with emojis) — only the audio path drops them.
**Typing dots indicator**: while the LLM is done but TTS synthesis is
still running (~3–5 s for the first segment), the Kazeia bubble now
shows an animated ". / .. / ..." cycle at 400 ms cadence instead of
sitting empty. The moment the first segment actually starts playing,
the cycle cancels, the bubble resets to empty, and the existing
word-by-word reveal takes over. A defensive finally block also
cancels the job when no segment ever fires (e.g. all-emoji reply).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b5b13780f7
commit
c2f7859dfe
|
|
@ -164,7 +164,14 @@ class KazeiaPipeline {
|
|||
if (qwen != null) {
|
||||
qwen.onSegmentPlaying = onSegmentPlaying
|
||||
qwen.startStreamingSession()
|
||||
val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
|
||||
val streamer = com.kazeia.tts.SentenceStreamer { raw ->
|
||||
// Strip emoji / non-speakable pictographs before TTS
|
||||
// so a standalone "😊" doesn't become its own noisy
|
||||
// segment. The chat bubble keeps the original text —
|
||||
// only the audio path sees the cleaned version.
|
||||
val spoken = stripNonSpeakable(raw).trim()
|
||||
if (spoken.isNotEmpty()) qwen.enqueueSentence(spoken)
|
||||
}
|
||||
streamer.append(text)
|
||||
streamer.flush()
|
||||
qwen.endStreamingSession()
|
||||
|
|
@ -183,6 +190,41 @@ class KazeiaPipeline {
|
|||
_messages.value = _messages.value + msg
|
||||
}
|
||||
|
||||
/**
 * Removes emoji and other pictographic code points from [text] so the TTS
 * engine is never asked to synthesize them.
 *
 * The input is walked code point by code point (not `Char` by `Char`) so
 * supplementary-plane emoji encoded as surrogate pairs are dropped as a
 * unit. A code point is dropped when it is:
 *  - in Miscellaneous Technical (U+2300–23FF: watch, hourglass, alarm clock…),
 *  - in Miscellaneous Symbols or Dingbats (U+2600–27BF),
 *  - in Miscellaneous Symbols and Arrows (U+2B00–2BFF: star, heavy circle…),
 *  - anywhere in U+1F000–1FAFF (mahjong/cards, enclosed alphanumerics,
 *    regional-indicator flags, pictographs, emoticons, transport, and the
 *    supplemental / extended-A symbol blocks),
 *  - an emoji-sequence "glue" character: zero-width joiner (U+200D),
 *    variation selector (U+FE00–FE0F), combining enclosing keycap (U+20E3),
 *    or tag character (U+E0020–E007F, used by subdivision flags).
 *
 * Everything else — Basic Latin, Latin-1, Latin Extended, and common French
 * punctuation such as « » … — € — is copied through unchanged. The result is
 * NOT trimmed; callers decide how to treat leftover whitespace.
 *
 * @param text sentence as produced by the model; may be empty.
 * @return [text] with the non-speakable code points removed (possibly empty).
 */
private fun stripNonSpeakable(text: String): String = buildString(text.length) {
    var index = 0
    while (index < text.length) {
        val cp = text.codePointAt(index)
        val nonSpeakable = when (cp) {
            in 0x2300..0x23FF -> true     // misc technical (⌚ ⌛ ⏰ ⏳ …)
            in 0x2600..0x27BF -> true     // misc symbols + dingbats
            in 0x2B00..0x2BFF -> true     // misc symbols and arrows (⭐ ⭕ ⬆ …)
            in 0x1F000..0x1FAFF -> true   // all supplementary emoji / pictograph blocks, incl. flags
            0x200D -> true                // zero-width joiner
            0x20E3 -> true                // combining enclosing keycap (the box in "1️⃣")
            in 0xFE00..0xFE0F -> true     // variation selectors
            in 0xE0020..0xE007F -> true   // tag characters (subdivision flag sequences)
            else -> false
        }
        if (!nonSpeakable) appendCodePoint(cp)
        // Advance by 1 or 2 UTF-16 units depending on the code point's plane.
        index += Character.charCount(cp)
    }
}
|
||||
|
||||
fun log(msg: String) {
|
||||
Log.i(TAG, msg)
|
||||
val time = java.text.SimpleDateFormat("HH:mm:ss.SSS", java.util.Locale.FRANCE)
|
||||
|
|
|
|||
|
|
@ -628,6 +628,16 @@ class KazeiaService : Service() {
|
|||
if (chatterbox != null) {
|
||||
chatterbox.setVoice(voicePath)
|
||||
log("Voice set to: $voicePath")
|
||||
return
|
||||
}
|
||||
val qwen = tts as? com.kazeia.tts.Qwen3TtsEngine
|
||||
if (qwen != null) {
|
||||
// Hot-swap prefix/suffix embeddings — no model reload. Takes
|
||||
// effect from the NEXT synthesized segment (current in-flight
|
||||
// one, if any, finishes with the old voice since the arrays
|
||||
// are already in its closure).
|
||||
qwen.setVoice(voicePath)
|
||||
log("Voice set to: $voicePath")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1241,18 +1251,36 @@ class KazeiaService : Service() {
|
|||
// the continuous-listening mic loop drops frames and we
|
||||
// don't feed our own speaker output back into STT.
|
||||
_pipelineState.value = PipelineState.Speaking
|
||||
// Create an empty KAZEIA bubble up-front so the per-sentence
|
||||
// reveal has somewhere to append to, but the text stays
|
||||
// empty until TTS audio for the first sentence starts —
|
||||
// matching the "conversation" feel the user asked for
|
||||
// (read-as-you-hear, not read-then-hear).
|
||||
val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = "")
|
||||
// Create a KAZEIA bubble up-front. Until the first TTS
|
||||
// segment actually starts playing the bubble shows an
|
||||
// animated "." → ".." → "..." typing indicator so the
|
||||
// user knows Kazeia is thinking/synthesising; once the
|
||||
// first segment plays the dots are cleared and the
|
||||
// per-sentence word reveal takes over.
|
||||
val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = ".")
|
||||
addMessage(bubble)
|
||||
val revealScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default)
|
||||
var revealedSoFar = ""
|
||||
val revealJobs = mutableListOf<kotlinx.coroutines.Job>()
|
||||
val firstSegmentSeen = java.util.concurrent.atomic.AtomicBoolean(false)
|
||||
val typingJob = revealScope.launch {
|
||||
var tick = 0
|
||||
while (!firstSegmentSeen.get()) {
|
||||
val dots = ".".repeat(1 + (tick % 3)) // . → .. → ...
|
||||
updateMessageText(bubble.id, dots)
|
||||
tick++
|
||||
kotlinx.coroutines.delay(400)
|
||||
}
|
||||
}
|
||||
try {
|
||||
pipeline.speakText(responseText) { sentence, durationMs, envelope, spectrogram ->
|
||||
// First segment: stop the typing indicator and
|
||||
// reset the bubble to empty so the word reveal
|
||||
// doesn't collide with the dots.
|
||||
if (firstSegmentSeen.compareAndSet(false, true)) {
|
||||
try { typingJob.cancel() } catch (_: Exception) {}
|
||||
updateMessageText(bubble.id, "")
|
||||
}
|
||||
// Push the envelope + spectrogram to the
|
||||
// visualizer at the same moment the MediaPlayer
|
||||
// starts playing so the orb reacts to this
|
||||
|
|
@ -1293,6 +1321,11 @@ class KazeiaService : Service() {
|
|||
revealJobs.forEach { try { it.join() } catch (_: Exception) {} }
|
||||
updateMessageText(bubble.id, responseText)
|
||||
} finally {
|
||||
// Defensive: cancel the typing dots in case no
|
||||
// segment ever fired (e.g. the response was entirely
|
||||
// emojis and got stripped empty).
|
||||
firstSegmentSeen.set(true)
|
||||
try { typingJob.cancel() } catch (_: Exception) {}
|
||||
_pipelineState.value = if (_isListening.value)
|
||||
PipelineState.Listening else PipelineState.Idle
|
||||
// If we're going back to mic listening, the VAD loop
|
||||
|
|
|
|||
|
|
@ -608,8 +608,53 @@ class Qwen3TtsEngine(
|
|||
|
||||
override fun isLoaded(): Boolean = loaded
|
||||
|
||||
/**
 * Hot-swap the speaker prefix/suffix embeddings used for voice
 * conditioning.
 *
 * [voicePath] is a WAV path like `/…/voix/elodie.wav`. The voice id is the
 * lower-cased basename without extension; the matching
 * `<id>_voice_prefix.bin` and `<id>_voice_suffix.bin` are looked up in the
 * model dir. When both files exist and parse cleanly they replace
 * [damienVoicePrefix] / [damienVoiceSuffix]. If either file is missing or
 * malformed, a message is logged and the current voice is kept — nothing is
 * assigned unless BOTH files were read successfully.
 *
 * Per-voice files are generated offline (the log message points at
 * scripts/prepare_tts_native.py) and pushed into the model dir.
 *
 * Thread-safety: each array reference is replaced with a single
 * assignment. NOTE(review): this presumes the two fields are `@Volatile`
 * (their declarations are outside this block — confirm). The prefix and
 * suffix are written by two separate statements, so a reader running
 * between them could observe the new prefix with the old suffix.
 *
 * @param voicePath path of the voice WAV; only its basename is used.
 */
fun setVoice(voicePath: String) {
    nlog("Voice: $voicePath")
    val modelDir = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
    val id = java.io.File(voicePath).nameWithoutExtension.lowercase()
    val prefixFile = java.io.File("$modelDir/${id}_voice_prefix.bin")
    val suffixFile = java.io.File("$modelDir/${id}_voice_suffix.bin")
    if (!prefixFile.exists() || !suffixFile.exists()) {
        nlog("Voice '$id' not available (missing ${prefixFile.name} or ${suffixFile.name}); keeping current voice. " +
            "Run scripts/prepare_tts_native.py with this WAV to generate the files.")
        return
    }
    try {
        // Parse both files before touching the live fields so a failure on
        // the suffix cannot leave a half-swapped voice behind.
        val newPrefix = readVoiceEmbeds(prefixFile, "prefix")
        val newSuffix = readVoiceEmbeds(suffixFile, "suffix")

        damienVoicePrefix = newPrefix
        damienVoiceSuffix = newSuffix
        nlog("Voice switched to '$id' (${newPrefix.size} prefix + ${newSuffix.size} suffix embeds)")
    } catch (e: Exception) {
        // Some exceptions carry no message; fall back to the class name
        // rather than logging "null".
        nlog("Voice swap failed for '$id': ${e.message ?: e.javaClass.simpleName}")
    }
}

/**
 * Reads one voice-embedding file: a little-endian header of two 32-bit ints
 * (row count, row dimension) followed by `count * dim` 32-bit floats.
 *
 * The header is validated against the actual file size BEFORE any array is
 * allocated, so a corrupt or truncated file produces a catchable exception
 * instead of an oversized allocation.
 *
 * @param file the `.bin` file to read.
 * @param label "prefix" or "suffix"; used only in error messages.
 * @return `count` rows of [TALKER_DIM] floats each, in file order.
 * @throws IllegalStateException if the header is missing, the dimension is
 *   not [TALKER_DIM], the count is negative, or the payload is too short.
 */
private fun readVoiceEmbeds(file: java.io.File, label: String): Array<FloatArray> {
    val buf = java.nio.ByteBuffer.wrap(file.readBytes()).order(java.nio.ByteOrder.LITTLE_ENDIAN)
    check(buf.remaining() >= 2 * Int.SIZE_BYTES) { "$label file too short for header (${buf.remaining()} bytes)" }
    val count = buf.int
    val dim = buf.int
    check(dim == TALKER_DIM) { "$label dim $dim != $TALKER_DIM" }
    check(count >= 0) { "$label count $count is negative" }
    // Long arithmetic: count * dim * 4 can overflow Int for a corrupt header.
    val neededBytes = count.toLong() * TALKER_DIM * Float.SIZE_BYTES
    check(buf.remaining() >= neededBytes) {
        "$label payload truncated: need $neededBytes bytes for $count embeds, have ${buf.remaining()}"
    }
    return Array(count) { FloatArray(TALKER_DIM) { buf.float } }
}
|
||||
|
||||
override suspend fun synthesize(text: String, language: String): TtsResult {
|
||||
|
|
|
|||
Loading…
Reference in New Issue