diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt index 6453ecd..db5170f 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt @@ -145,11 +145,12 @@ class KazeiaPipeline { suspend fun speakText( text: String, // Fires the instant each synthesized sentence starts playing - // through the speaker, with the sentence text and its audio - // duration. Used by processLlmResponse to defer the KAZEIA - // chat bubble appearance until sound is audible and to pace - // word-by-word reveal inside the bubble. - onSegmentPlaying: ((sentence: String, durationMs: Long) -> Unit)? = null + // through the speaker, with the sentence text, audio duration, + // and a per-ENVELOPE_WINDOW_MS RMS envelope. Used by + // processLlmResponse to defer the KAZEIA chat bubble appearance + // until sound is audible, pace word-by-word reveal inside the + // bubble, and drive the AudioVisualizerView orb. + onSegmentPlaying: ((sentence: String, durationMs: Long, rmsEnvelope: FloatArray) -> Unit)? = null ) { val ttsEngine = tts ?: return _pipelineState.value = PipelineState.Speaking diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt index 3e1ec4d..28ce395 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt @@ -83,6 +83,20 @@ class KazeiaService : Service() { private val _isListening = MutableStateFlow(false) val isListening: StateFlow = _isListening + // Drives the AudioVisualizerView orb. Pushed from the VAD loop + // during mic capture (mic RMS, normalized) and from the TTS engine's + // onSegmentPlaying callback (TTS RMS envelope per-segment). 
The view + // reads this via collect in ChatActivity; the signals carry + // their own state so the visualizer knows whether it's idle, tracking + // the mic, or rendering a TTS segment. + sealed class VisualizerSignal { + object Idle : VisualizerSignal() + data class Listening(val micRms: Float) : VisualizerSignal() + data class Speaking(val rmsEnvelope: FloatArray, val durationMs: Long) : VisualizerSignal() + } + private val _visualizerSignal = MutableStateFlow(VisualizerSignal.Idle) + val visualizerSignal: StateFlow = _visualizerSignal + private val _debugMode = MutableStateFlow(false) val debugMode: StateFlow = _debugMode @@ -852,6 +866,14 @@ class KazeiaService : Service() { for (s in frame) sumSq += s.toLong() * s.toLong() val rms = Math.sqrt(sumSq.toDouble() / frameSize).toInt() + // Drive the visualizer orb. Normalize with the same + // sqrt squashing used for TTS so loud peaks don't + // saturate and quiet speech is still visible. The + // visualizer stays in Listening mode; it will swap + // to Speaking or Idle when pipelineState moves on. + val rmsNorm = kotlin.math.sqrt((rms / 6000f).coerceIn(0f, 1f)) + _visualizerSignal.value = VisualizerSignal.Listening(rmsNorm) + // Log RMS every second for calibration if (frameCount % 10 == 0) { Log.d(TAG, "VAD RMS=$rms (threshold=$silenceThreshold)") @@ -1216,7 +1238,12 @@ class KazeiaService : Service() { var revealedSoFar = "" val revealJobs = mutableListOf() try { - pipeline.speakText(responseText) { sentence, durationMs -> + pipeline.speakText(responseText) { sentence, durationMs, envelope -> + // Push the envelope to the visualizer at the same + // moment the MediaPlayer starts playing so the orb + // reacts to this segment's actual energy. + _visualizerSignal.value = + VisualizerSignal.Speaking(envelope, durationMs) // Start a coroutine that appends one word at a time // over the segment's audio duration. 
Words are // separated on whitespace; punctuation rides with @@ -1252,6 +1279,13 @@ class KazeiaService : Service() { } finally { _pipelineState.value = if (_isListening.value) PipelineState.Listening else PipelineState.Idle + // If we're going back to mic listening, the VAD loop + // will keep pushing Listening signals; otherwise drop + // to Idle so the orb settles back to its breathing + // baseline. + if (!_isListening.value) { + _visualizerSignal.value = VisualizerSignal.Idle + } } } else { _pipelineState.value = if (_isListening.value) diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index 6edf736..9f1335a 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -108,6 +108,11 @@ class Qwen3TtsEngine( // (WAV write + MediaPlayer prepare add ~150 ms per segment) but // it's the only reliable path to audible output on this device. private const val USE_MEDIAPLAYER_FALLBACK = true + + // Window size for the TTS→visualizer RMS sidecar. 50 ms at 24 kHz + // = 1200 samples/window — small enough for a 60 fps visualizer to + // track formants, large enough to run at negligible CPU cost. + const val ENVELOPE_WINDOW_MS = 50 } private var ortEnv: OrtEnvironment? = null @@ -3380,10 +3385,13 @@ class Qwen3TtsEngine( * Fires the moment a synthesized segment starts playing through the * speaker. [sentence] is the original text submitted to * [enqueueSentence], [durationMs] is the WAV duration so the caller - * can drive a progressive-reveal UI timer matched to speech pacing. + * can drive a progressive-reveal UI timer matched to speech pacing, + * and [rmsEnvelope] is a per-[ENVELOPE_WINDOW_MS] normalized RMS + * sidecar the UI can use to drive an audio-reactive visualizer + * without needing access to the live PCM stream from MediaPlayer. 
* Set before calling [startStreamingSession]; cleared on session end. */ - var onSegmentPlaying: ((sentence: String, durationMs: Long) -> Unit)? = null + var onSegmentPlaying: ((sentence: String, durationMs: Long, rmsEnvelope: FloatArray) -> Unit)? = null private fun startStreamingSessionMp() { if (sessionMpQueue != null) return @@ -3413,8 +3421,9 @@ class Qwen3TtsEngine( val wavPath = "${context?.cacheDir?.absolutePath ?: "/data/local/tmp/kazeia"}/tts_seg_${segIdx}.wav" saveWav(wavPath, audio) val durationMs = audio.size * 1000L / SR - nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - tSynth}ms, ${durationMs}ms audio), queued for playback") - wavChan.send(SegmentReady(segIdx, wavPath, sentence, durationMs)) + val envelope = computeRmsEnvelope(audio) + nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - tSynth}ms, ${durationMs}ms audio, ${envelope.size} env windows), queued for playback") + wavChan.send(SegmentReady(segIdx, wavPath, sentence, durationMs, envelope)) } catch (e: Exception) { nlog("MP synth error: ${e.message}") } @@ -3475,7 +3484,7 @@ class Qwen3TtsEngine( current = prepareMp(first.wavPath, first.segIdx) current!!.setOnCompletionListener { it.release() } current!!.start() - try { onSegmentPlaying?.invoke(first.sentence, first.durationMs) } catch (_: Exception) {} + try { onSegmentPlaying?.invoke(first.sentence, first.durationMs, first.rmsEnvelope) } catch (_: Exception) {} nlog("MP seg ${first.segIdx} started (chained, ${first.durationMs}ms)") while (true) { @@ -3497,7 +3506,7 @@ class Qwen3TtsEngine( // `next` player was chained via setNextMediaPlayer and has // auto-started at this point; notify the UI so it can start // revealing the sentence in sync with the audio. 
- try { onSegmentPlaying?.invoke(currentInfo!!.sentence, currentInfo!!.durationMs) } catch (_: Exception) {} + try { onSegmentPlaying?.invoke(currentInfo!!.sentence, currentInfo!!.durationMs, currentInfo!!.rmsEnvelope) } catch (_: Exception) {} next = null nextInfo = null } @@ -3517,14 +3526,44 @@ class Qwen3TtsEngine( /** Payload handed from the synth worker to the playback worker so * the UI can be notified with matching text + duration when each - * segment starts playing. */ + * segment starts playing. The [rmsEnvelope] is a sidecar + * array of per-ENVELOPE_WINDOW_MS RMS values normalized to [0, 1] + * that drives the audio-reactive orb visualizer without having to + * read PCM back from MediaPlayer. */ private data class SegmentReady( val segIdx: Int, val wavPath: String, val sentence: String, - val durationMs: Long + val durationMs: Long, + val rmsEnvelope: FloatArray ) + /** Compute a per-ENVELOPE_WINDOW_MS normalized RMS envelope from a + * mono 16-bit PCM buffer at [SR]. Cheap (one pass, trivially fast + * on the ~100 k samples we generate per segment) and called only + * once per segment right after synthesis. */ + private fun computeRmsEnvelope(audio: ShortArray): FloatArray { + if (audio.isEmpty()) return FloatArray(0) + val windowSamples = SR * ENVELOPE_WINDOW_MS / 1000 + val nWindows = (audio.size + windowSamples - 1) / windowSamples + val env = FloatArray(nWindows) + for (w in 0 until nWindows) { + val start = w * windowSamples + val end = minOf(start + windowSamples, audio.size) + var sumSq = 0.0 + for (i in start until end) { + val s = audio[i].toDouble() + sumSq += s * s + } + val rms = kotlin.math.sqrt(sumSq / (end - start)) + // Normalize: 32767 is full-scale; squash the upper range + // with a sqrt curve so even quiet speech shows visible + // motion without saturating on loud peaks. 
+ env[w] = kotlin.math.sqrt((rms / 32767.0).coerceIn(0.0, 1.0)).toFloat() + } + return env + } + private suspend fun waitForPlaybackCompletion( mp: android.media.MediaPlayer, segIdx: Int ) { diff --git a/kazeia-android/app/src/main/java/com/kazeia/ui/AudioVisualizerView.kt b/kazeia-android/app/src/main/java/com/kazeia/ui/AudioVisualizerView.kt new file mode 100644 index 0000000..6368166 --- /dev/null +++ b/kazeia-android/app/src/main/java/com/kazeia/ui/AudioVisualizerView.kt @@ -0,0 +1,266 @@ +package com.kazeia.ui + +import android.content.Context +import android.graphics.Canvas +import android.graphics.Color +import android.graphics.Paint +import android.graphics.RadialGradient +import android.graphics.Shader +import android.util.AttributeSet +import android.view.Choreographer +import android.view.View +import kotlin.math.cos +import kotlin.math.max +import kotlin.math.min +import kotlin.math.sin + +/** + * Épuré audio-reactive orb visualizer for the TTS + STT feedback loop. + * + * Three states driven by [setIdle], [setListening], [startSpeaking]: + * + * - **Idle**: fixed orb with a slow respiratory pulsation (~4 s cycle) + * and a faint halo, matching the "chatbot is awake, waiting" vibe. + * Light GPU work: a halo and a core circle per frame, throttled to ~20 fps. + * + * - **Listening**: the orb grows and its halo brightens with the live + * mic RMS passed into [setListening], confirming the app is hearing + * the user before STT has any result. Useful feedback during the ~1 s + * silence gap before Whisper fires. NOTE(review): no ripples are + * spawned in this state; only Speaking adds them in onDraw. + * + * - **Speaking**: amplitude and halo track the pre-computed TTS RMS + * envelope (one float per 50 ms) passed into [startSpeaking]. The view + * walks through the envelope using its own internal timer synced to + * [durationMs], so it doesn't need MediaPlayer.getCurrentPosition. + * Outward ripples fire on each envelope peak above a fixed 0.35 floor. 
+ * + * All animation runs on [Choreographer.FrameCallback]. At Idle, the + * frame callback self-throttles to ~20 fps (still smooth for a 4 s + * breathing cycle) to keep CPU cost near zero. During Listening and + * Speaking it runs at display refresh (60/90/120 fps). NOTE(review): onDetachedFromWindow removes the callback without clearing frameScheduled, so scheduleFrame() appears to be a no-op after a detach/re-attach — confirm. + */ +class AudioVisualizerView @JvmOverloads constructor( + context: Context, + attrs: AttributeSet? = null, + defStyleAttr: Int = 0 +) : View(context, attrs, defStyleAttr), Choreographer.FrameCallback { + + // --- Configuration --- + // Colors picked for a calm, non-clinical feel. Soft lavender/blue + // core with a slightly warmer outer halo; all in the same hue family + // so transitions between states stay visually continuous. + private val coreColor = Color.parseColor("#BCA4E8") // soft lavender + private val haloColor = Color.parseColor("#8B6EC9") // deeper violet + private val rippleColor = Color.parseColor("#A48FDD") // between the two + + // Amplitude gain so TTS signal ([0,1]) maps to perceptible size. + // Observed: normalized TTS RMS rarely exceeds ~0.5, so we stretch. + private val amplitudeGain = 1.8f + + // --- State machine --- + private sealed class State { + object Idle : State() + data class Listening(var micRms: Float) : State() + data class Speaking( + val envelope: FloatArray, + val durationMs: Long, + val startedAtMs: Long + ) : State() + } + + @Volatile private var state: State = State.Idle + + // --- Animation state (mutated from doFrame, onDraw and startSpeaking) --- + private var frameStartNs = 0L + private var lastFrameNs = 0L + private var smoothedAmp = 0f // exponential smoothing on amplitude + private val ripples = ArrayList() + private var lastEnvelopeIdx = -1 + + // Paints are allocated once; colors/alphas tweaked per frame (gradient shaders are rebuilt in each onDraw). 
+ private val corePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL } + private val haloPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL } + private val ripplePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { + style = Paint.Style.STROKE + strokeWidth = 4f + } + + init { + setLayerType(LAYER_TYPE_HARDWARE, null) + } + + // ==================== Public API ==================== + + fun setIdle() { + state = State.Idle + scheduleFrame() + } + + fun setListening(micRms: Float) { + val s = state + if (s is State.Listening) s.micRms = micRms.coerceIn(0f, 1f) + else state = State.Listening(micRms.coerceIn(0f, 1f)) + scheduleFrame() + } + + fun startSpeaking(envelope: FloatArray, durationMs: Long) { + if (envelope.isEmpty() || durationMs <= 0) { setIdle(); return } + state = State.Speaking(envelope, durationMs, System.currentTimeMillis()) + lastEnvelopeIdx = -1 + scheduleFrame() + } + + // ==================== View lifecycle ==================== + + override fun onAttachedToWindow() { + super.onAttachedToWindow() + frameStartNs = System.nanoTime() + scheduleFrame() + } + + override fun onDetachedFromWindow() { + super.onDetachedFromWindow() + Choreographer.getInstance().removeFrameCallback(this) + } + + private var frameScheduled = false + private fun scheduleFrame() { + if (!frameScheduled && isAttachedToWindow) { + frameScheduled = true + Choreographer.getInstance().postFrameCallback(this) + } + } + + override fun doFrame(frameTimeNanos: Long) { + frameScheduled = false + lastFrameNs = frameTimeNanos + + val s = state + when (s) { + is State.Idle -> { + // Self-throttled loop at ~20 fps for the breathing pulse. 
+ Choreographer.getInstance().postFrameCallbackDelayed(this, 50) + frameScheduled = true + } + is State.Listening -> { + Choreographer.getInstance().postFrameCallback(this) + frameScheduled = true + } + is State.Speaking -> { + val elapsed = System.currentTimeMillis() - s.startedAtMs + if (elapsed >= s.durationMs + 300) { + // Auto-fallback to Idle if no explicit transition. + // The +300 ms grace lets the final envelope decay + // finish visibly before we snap back. + state = State.Idle + Choreographer.getInstance().postFrameCallbackDelayed(this, 50) + frameScheduled = true + } else { + Choreographer.getInstance().postFrameCallback(this) + frameScheduled = true + } + } + } + invalidate() + } + + // ==================== Drawing ==================== + + override fun onDraw(canvas: Canvas) { + super.onDraw(canvas) + val w = width.toFloat(); val h = height.toFloat() + if (w <= 0 || h <= 0) return + val cx = w / 2f; val cy = h / 2f + val maxR = min(w, h) * 0.42f + + // Compute target amplitude in [0, 1] for the current state. + val now = System.currentTimeMillis() + val target: Float = when (val s = state) { + is State.Idle -> { + // 4 s breathing cycle via a soft sine; amplitude 0.06 → 0.12. NOTE(review): now is wall-clock ms while frameStartNs comes from System.nanoTime(), so the phase origin is arbitrary. + val t = (now - frameStartNs / 1_000_000) % 4000L / 4000f + 0.06f + 0.06f * (0.5f + 0.5f * sin((t * 2f * Math.PI).toFloat())) + } + is State.Listening -> { + // Base breathing + live mic contribution. 
+ val t = (now - frameStartNs / 1_000_000) % 4000L / 4000f + val breath = 0.08f + 0.04f * (0.5f + 0.5f * sin((t * 2f * Math.PI).toFloat())) + breath + 0.55f * s.micRms + } + is State.Speaking -> { + val idxF = (now - s.startedAtMs).toFloat() * + s.envelope.size / s.durationMs.toFloat() + val idx = idxF.toInt().coerceIn(0, s.envelope.size - 1) + val frac = (idxF - idx).coerceIn(0f, 1f) + val a = s.envelope[idx] + val b = s.envelope[min(idx + 1, s.envelope.size - 1)] + val env = a + (b - a) * frac + + // Emit a ripple whenever we cross a local peak above a + // floor, at most once per envelope step. + if (idx != lastEnvelopeIdx && env > 0.35f) { + val prev = if (idx > 0) s.envelope[idx - 1] else 0f + val next = if (idx < s.envelope.size - 1) s.envelope[idx + 1] else 0f + if (env >= prev && env >= next) { + ripples.add(Ripple(bornAtMs = now, peak = env)) + } + lastEnvelopeIdx = idx + } + (env * amplitudeGain).coerceIn(0f, 1f) + } + } + + // Exponential smoothing so frame-to-frame changes feel organic. 
+ smoothedAmp += (target - smoothedAmp) * 0.25f + + // --- Halo (radial gradient, grows with amplitude) --- + val haloR = maxR * (0.85f + 0.35f * smoothedAmp) + val haloAlpha = (80 + 100 * smoothedAmp).toInt().coerceIn(0, 200) + haloPaint.shader = RadialGradient( + cx, cy, haloR, + intArrayOf( + Color.argb(haloAlpha, Color.red(haloColor), Color.green(haloColor), Color.blue(haloColor)), + Color.argb(0, Color.red(haloColor), Color.green(haloColor), Color.blue(haloColor)) + ), + floatArrayOf(0f, 1f), + Shader.TileMode.CLAMP + ) + canvas.drawCircle(cx, cy, haloR, haloPaint) + + // --- Ripples --- + if (ripples.isNotEmpty()) { + val it = ripples.iterator() + while (it.hasNext()) { + val r = it.next() + val age = (now - r.bornAtMs) / 900f // 900 ms lifetime + if (age >= 1f) { it.remove(); continue } + val radius = maxR * (0.55f + 0.6f * age) + val alpha = ((1f - age) * 140f * r.peak).toInt().coerceIn(0, 200) + ripplePaint.color = Color.argb( + alpha, + Color.red(rippleColor), + Color.green(rippleColor), + Color.blue(rippleColor) + ) + ripplePaint.strokeWidth = max(1.5f, (1f - age) * 5f) + canvas.drawCircle(cx, cy, radius, ripplePaint) + } + } + + // --- Core orb --- + val coreR = maxR * (0.45f + 0.25f * smoothedAmp) + corePaint.shader = RadialGradient( + cx, cy, coreR, + intArrayOf( + Color.argb(255, Color.red(coreColor), Color.green(coreColor), Color.blue(coreColor)), + Color.argb(180, Color.red(haloColor), Color.green(haloColor), Color.blue(haloColor)) + ), + floatArrayOf(0f, 1f), + Shader.TileMode.CLAMP + ) + canvas.drawCircle(cx, cy, coreR, corePaint) + } + + private class Ripple(val bornAtMs: Long, val peak: Float) +} diff --git a/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt b/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt index 2d34bcb..2bf7db8 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt @@ -326,6 +326,33 @@ class ChatActivity : 
AppCompatActivity() { setDebugPanelVisible(debug) } } + launch { + // Drive the orb visualizer from the service-side signal. + // Service decides whether the app is idle, tracking the + // mic, or rendering a TTS segment; the view just renders + // it. startSpeaking is edge-triggered on the envelope + // identity so re-emitting the same signal won't restart + // the animation timer. + var lastSpeakingEnv: FloatArray? = null + service.visualizerSignal.collect { sig -> + when (sig) { + is com.kazeia.service.KazeiaService.VisualizerSignal.Idle -> { + binding.audioViz.setIdle() + lastSpeakingEnv = null + } + is com.kazeia.service.KazeiaService.VisualizerSignal.Listening -> { + binding.audioViz.setListening(sig.micRms) + lastSpeakingEnv = null + } + is com.kazeia.service.KazeiaService.VisualizerSignal.Speaking -> { + if (sig.rmsEnvelope !== lastSpeakingEnv) { + binding.audioViz.startSpeaking(sig.rmsEnvelope, sig.durationMs) + lastSpeakingEnv = sig.rmsEnvelope + } + } + } + } + } } } } diff --git a/kazeia-android/app/src/main/res/layout/activity_chat.xml b/kazeia-android/app/src/main/res/layout/activity_chat.xml index 27167a4..908346c 100644 --- a/kazeia-android/app/src/main/res/layout/activity_chat.xml +++ b/kazeia-android/app/src/main/res/layout/activity_chat.xml @@ -100,6 +100,18 @@ + + +