/**
 * Removes emoji and other pictographic code points from [text] so the TTS
 * engine is never asked to synthesize them.
 *
 * The string is walked code point by code point (not `Char` by `Char`), so
 * supplementary-plane emoji encoded as surrogate pairs are dropped as a
 * whole. A code point is dropped when it is one of:
 *
 *  - U+200D zero-width joiner, U+FE00–FE0F variation selectors and
 *    U+20E3 combining enclosing keycap — the "glue" of emoji sequences,
 *    which would otherwise be left dangling after the pictograph is removed;
 *  - U+2300–23FF Miscellaneous Technical (watch, hourglass, media buttons…);
 *  - U+2600–27BF Miscellaneous Symbols + Dingbats;
 *  - U+2B00–2BFF Miscellaneous Symbols and Arrows (star, circle, arrows…);
 *  - U+1F000–1FAFF — every pictographic block of the supplementary plane,
 *    from Mahjong Tiles through Symbols and Pictographs Extended-A. This
 *    includes enclosed alphanumerics, regional-indicator flag letters,
 *    emoticons, transport and skin-tone modifiers;
 *  - U+E0000–E007F tag characters (used by subdivision flag sequences).
 *
 * Everything else is copied through unchanged, including whitespace, so
 * Latin text, accented letters and ordinary punctuation are untouched.
 * Callers that need a tidy result should trim the return value themselves.
 *
 * @param text the raw sentence; may be empty.
 * @return [text] with the code points listed above removed; empty if
 *   nothing speakable remains.
 */
private fun stripNonSpeakable(text: String): String {
    val sb = StringBuilder(text.length)
    var i = 0
    while (i < text.length) {
        val cp = text.codePointAt(i)
        val skip = when (cp) {
            0x200D -> true                  // zero-width joiner
            0x20E3 -> true                  // combining enclosing keycap
            in 0xFE00..0xFE0F -> true       // variation selectors
            in 0x2300..0x23FF -> true       // miscellaneous technical
            in 0x2600..0x27BF -> true       // misc symbols + dingbats
            in 0x2B00..0x2BFF -> true       // misc symbols and arrows
            in 0x1F000..0x1FAFF -> true     // mahjong … pictographs extended-A
            in 0xE0000..0xE007F -> true     // tag characters (subdivision flags)
            else -> false
        }
        if (!skip) sb.appendCodePoint(cp)
        // Advance by 1 or 2 UTF-16 units depending on the code point's plane.
        i += Character.charCount(cp)
    }
    return sb.toString()
}
+ qwen.setVoice(voicePath) + log("Voice set to: $voicePath") } } @@ -1241,18 +1251,36 @@ class KazeiaService : Service() { // the continuous-listening mic loop drops frames and we // don't feed our own speaker output back into STT. _pipelineState.value = PipelineState.Speaking - // Create an empty KAZEIA bubble up-front so the per-sentence - // reveal has somewhere to append to, but the text stays - // empty until TTS audio for the first sentence starts — - // matching the "conversation" feel the user asked for - // (read-as-you-hear, not read-then-hear). - val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = "") + // Create a KAZEIA bubble up-front. Until the first TTS + // segment actually starts playing the bubble shows an + // animated "." → ".." → "..." typing indicator so the + // user knows Kazeia is thinking/synthesising; once the + // first segment plays the dots are cleared and the + // per-sentence word reveal takes over. + val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = ".") addMessage(bubble) val revealScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default) var revealedSoFar = "" val revealJobs = mutableListOf() + val firstSegmentSeen = java.util.concurrent.atomic.AtomicBoolean(false) + val typingJob = revealScope.launch { + var tick = 0 + while (!firstSegmentSeen.get()) { + val dots = ".".repeat(1 + (tick % 3)) // . → .. → ... + updateMessageText(bubble.id, dots) + tick++ + kotlinx.coroutines.delay(400) + } + } try { pipeline.speakText(responseText) { sentence, durationMs, envelope, spectrogram -> + // First segment: stop the typing indicator and + // reset the bubble to empty so the word reveal + // doesn't collide with the dots. 
/**
 * Hot-swaps the speaker prefix/suffix embeddings used for voice
 * conditioning, without reloading the model.
 *
 * The voice id is the lower-cased basename (extension removed) of
 * [voicePath]; e.g. `/…/voix/elodie.wav` gives `elodie`. The method then
 * looks for `<id>_voice_prefix.bin` and `<id>_voice_suffix.bin` in the
 * model directory. If either file is missing, a message is logged and the
 * current voice is kept. If both exist they are parsed and, only when both
 * parse successfully, assigned to [damienVoicePrefix] and
 * [damienVoiceSuffix]. Any parse failure is logged and leaves the current
 * voice untouched.
 *
 * Per-voice files are generated offline (the log message points at
 * scripts/prepare_tts_native.py) and must be pushed into the model
 * directory beforehand.
 *
 * NOTE(review): the model directory is hard-coded here; if the engine
 * already knows its model directory, that value should probably be used
 * instead — confirm against the rest of the class.
 *
 * NOTE(review) thread-safety: the two fields are assigned one after the
 * other, so each reference swap may be atomic but the *pair* is not. A
 * reader that samples the fields between the two assignments would see
 * the new prefix with the old suffix. Whether that can happen depends on
 * how the synthesis path reads them, which is not visible from this
 * method — confirm, and publish both arrays through a single holder
 * object if it can.
 *
 * @param voicePath path whose basename identifies the voice (typically
 *   the reference WAV). Only the basename is used; the file itself is
 *   not opened.
 */
fun setVoice(voicePath: String) {
    val modelDir = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
    val id = java.io.File(voicePath).nameWithoutExtension.lowercase()
    val prefixFile = java.io.File("$modelDir/${id}_voice_prefix.bin")
    val suffixFile = java.io.File("$modelDir/${id}_voice_suffix.bin")
    if (!prefixFile.exists() || !suffixFile.exists()) {
        nlog("Voice '$id' not available (missing ${prefixFile.name} or ${suffixFile.name}); keeping current voice. " +
            "Run scripts/prepare_tts_native.py with this WAV to generate the files.")
        return
    }
    try {
        // Parse both files before touching any field so a bad suffix file
        // cannot leave the engine with a new prefix and an old suffix.
        val newPrefix = readVoiceEmbeds(prefixFile, "prefix")
        val newSuffix = readVoiceEmbeds(suffixFile, "suffix")

        damienVoicePrefix = newPrefix
        damienVoiceSuffix = newSuffix
        nlog("Voice switched to '$id' (${newPrefix.size} prefix + ${newSuffix.size} suffix embeds)")
    } catch (e: Exception) {
        // Best effort: a failed swap must never take the TTS engine down.
        nlog("Voice swap failed for '$id': ${e.message}")
    }
}

/**
 * Reads one voice-embedding file: a little-endian header of two int32
 * values (row count, row dimension) followed by `count * dim` float32s.
 *
 * @param file the `.bin` file to parse.
 * @param label "prefix" or "suffix"; only used in error messages.
 * @return `count` rows of [TALKER_DIM] floats each.
 * @throws IllegalStateException if the header is missing, the dimension
 *   is not [TALKER_DIM], the count is negative, or the payload is shorter
 *   than the header announces.
 */
private fun readVoiceEmbeds(file: java.io.File, label: String): Array<FloatArray> {
    val buf = java.nio.ByteBuffer.wrap(file.readBytes()).order(java.nio.ByteOrder.LITTLE_ENDIAN)
    check(buf.remaining() >= 8) { "$label file too short for header (${buf.remaining()} bytes)" }
    val count = buf.int
    val dim = buf.int
    if (dim != TALKER_DIM) throw IllegalStateException("$label dim $dim != $TALKER_DIM")
    check(count >= 0) { "$label count $count is negative" }
    // Validate against the real payload size before allocating, so a
    // corrupt header cannot trigger a huge allocation or an underflow.
    val neededBytes = count.toLong() * TALKER_DIM * 4L
    check(buf.remaining().toLong() >= neededBytes) {
        "$label file truncated: need $neededBytes payload bytes, have ${buf.remaining()}"
    }
    // The float view starts at the current position and keeps the byte order.
    val floats = buf.asFloatBuffer()
    return Array(count) { FloatArray(TALKER_DIM).also { row -> floats.get(row) } }
}