UI: épuré audio-reactive orb visualizer — replaces 3D avatar for MVP

Adds a breathing lavender orb centred above the chat list that tracks
the actual audio state of the app:

- **Idle**: slow respiratory pulsation (~4 s cycle) at 20 fps. The
  chatbot is visually "awake" without animating loudly.
- **Listening**: halo swells with live mic RMS from the VAD loop, so
  the user sees Kazeia hearing them even before Whisper has produced
  any transcription. Mic RMS is normalised with the same sqrt
  squashing the TTS envelope uses so quiet speech still reads visibly.
- **Speaking**: amplitude + halo driven by a pre-computed RMS envelope
  (50 ms windows, sqrt-normalised) produced at synthesis time. Ripples
  fire on local peaks above 0.35 — matches speech rhythm without
  overwhelming. Timer is internal to the view, synced to the segment's
  durationMs; no MediaPlayer position polling.

Architecture:
- Sidecar RMS envelope. Computed in Qwen3TtsEngine.generateSegmentAudioVC
  right after PCM is available, packed into SegmentReady, and handed to
  onSegmentPlaying(sentence, durationMs, rmsEnvelope) when each MediaPlayer
  starts. Zero extra IO — runs on the same PCM we already write to WAV.
- KazeiaService exposes VisualizerSignal (Idle | Listening(rms) |
  Speaking(env, dur)) as a StateFlow. The VAD loop pushes Listening,
  processLlmResponse pushes Speaking from the per-segment TTS callback,
  and its `finally` block resets the signal to Idle when the mic is not open.
- AudioVisualizerView renders via Choreographer.FrameCallback, self-
  throttled to 20 fps at Idle and full refresh during Listening/
  Speaking. Hardware layer. Pure Kotlin + Canvas, no deps. ~280 LOC.

Layout: 140 dp strip between voiceBar and rvMessages in activity_chat.xml.

No 3D engine, no Unity, no splash extension. The avatar design work
remains on disk for a later phase when the TTS+streaming pipeline
stabilises enough to spend time on DECA/FLAME integration.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-14 23:20:15 +02:00
parent f17131aefb
commit 8939c680b2
6 changed files with 394 additions and 15 deletions

View File

@ -145,11 +145,12 @@ class KazeiaPipeline {
suspend fun speakText(
text: String,
// Fires the instant each synthesized sentence starts playing
// through the speaker, with the sentence text and its audio
// duration. Used by processLlmResponse to defer the KAZEIA
// chat bubble appearance until sound is audible and to pace
// word-by-word reveal inside the bubble.
onSegmentPlaying: ((sentence: String, durationMs: Long) -> Unit)? = null
// through the speaker, with the sentence text, audio duration,
// and a per-ENVELOPE_WINDOW_MS RMS envelope. Used by
// processLlmResponse to defer the KAZEIA chat bubble appearance
// until sound is audible, pace word-by-word reveal inside the
// bubble, and drive the AudioVisualizerView orb.
onSegmentPlaying: ((sentence: String, durationMs: Long, rmsEnvelope: FloatArray) -> Unit)? = null
) {
val ttsEngine = tts ?: return
_pipelineState.value = PipelineState.Speaking

View File

@ -83,6 +83,20 @@ class KazeiaService : Service() {
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
// Drives the AudioVisualizerView orb. Pushed from the VAD loop
// during mic capture (mic RMS, normalized) and from the TTS engine's
// onSegmentPlaying callback (TTS RMS envelope per-segment). The view
// reads this via collect in ChatActivity; the signals carry
// their own state so the visualizer knows whether it's idle, tracking
// the mic, or rendering a TTS segment.
sealed class VisualizerSignal {
// Nothing to visualise: the orb falls back to its breathing baseline.
object Idle : VisualizerSignal()
// Live mic level. The VAD loop passes a sqrt-squashed value in [0, 1].
data class Listening(val micRms: Float) : VisualizerSignal()
// One TTS segment that has just started playing: its RMS envelope and
// the audio duration in milliseconds. ChatActivity compares rmsEnvelope
// by reference (!==) to detect a new segment, so each segment should
// carry its own array instance.
data class Speaking(val rmsEnvelope: FloatArray, val durationMs: Long) : VisualizerSignal()
}
private val _visualizerSignal = MutableStateFlow<VisualizerSignal>(VisualizerSignal.Idle)
val visualizerSignal: StateFlow<VisualizerSignal> = _visualizerSignal
private val _debugMode = MutableStateFlow(false)
val debugMode: StateFlow<Boolean> = _debugMode
@ -852,6 +866,14 @@ class KazeiaService : Service() {
for (s in frame) sumSq += s.toLong() * s.toLong()
val rms = Math.sqrt(sumSq.toDouble() / frameSize).toInt()
// Drive the visualizer orb. Normalize with the same
// sqrt squashing used for TTS so loud peaks don't
// saturate and quiet speech is still visible. The
// visualizer stays in Listening mode; it will swap
// to Speaking or Idle when pipelineState moves on.
val rmsNorm = kotlin.math.sqrt((rms / 6000f).coerceIn(0f, 1f))
_visualizerSignal.value = VisualizerSignal.Listening(rmsNorm)
// Log RMS every second for calibration
if (frameCount % 10 == 0) {
Log.d(TAG, "VAD RMS=$rms (threshold=$silenceThreshold)")
@ -1216,7 +1238,12 @@ class KazeiaService : Service() {
var revealedSoFar = ""
val revealJobs = mutableListOf<kotlinx.coroutines.Job>()
try {
pipeline.speakText(responseText) { sentence, durationMs ->
pipeline.speakText(responseText) { sentence, durationMs, envelope ->
// Push the envelope to the visualizer at the same
// moment the MediaPlayer starts playing so the orb
// reacts to this segment's actual energy.
_visualizerSignal.value =
VisualizerSignal.Speaking(envelope, durationMs)
// Start a coroutine that appends one word at a time
// over the segment's audio duration. Words are
// separated on whitespace; punctuation rides with
@ -1252,6 +1279,13 @@ class KazeiaService : Service() {
} finally {
_pipelineState.value = if (_isListening.value)
PipelineState.Listening else PipelineState.Idle
// If we're going back to mic listening, the VAD loop
// will keep pushing Listening signals; otherwise drop
// to Idle so the orb settles back to its breathing
// baseline.
if (!_isListening.value) {
_visualizerSignal.value = VisualizerSignal.Idle
}
}
} else {
_pipelineState.value = if (_isListening.value)

View File

@ -108,6 +108,11 @@ class Qwen3TtsEngine(
// (WAV write + MediaPlayer prepare add ~150 ms per segment) but
// it's the only reliable path to audible output on this device.
private const val USE_MEDIAPLAYER_FALLBACK = true
// Window size for the TTS→visualizer RMS sidecar. 50 ms at 24 kHz
// = 1200 samples/window — short enough for the visualizer to follow
// syllable-level loudness changes, long enough that computing it
// costs negligible CPU.
const val ENVELOPE_WINDOW_MS = 50
}
private var ortEnv: OrtEnvironment? = null
@ -3380,10 +3385,13 @@ class Qwen3TtsEngine(
* Fires the moment a synthesized segment starts playing through the
* speaker. [sentence] is the original text submitted to
* [enqueueSentence], [durationMs] is the WAV duration so the caller
* can drive a progressive-reveal UI timer matched to speech pacing.
* can drive a progressive-reveal UI timer matched to speech pacing,
* and [rmsEnvelope] is a per-[ENVELOPE_WINDOW_MS] normalized RMS
* sidecar the UI can use to drive an audio-reactive visualizer
* without needing access to the live PCM stream from MediaPlayer.
* Set before calling [startStreamingSession]; cleared on session end.
*/
var onSegmentPlaying: ((sentence: String, durationMs: Long) -> Unit)? = null
var onSegmentPlaying: ((sentence: String, durationMs: Long, rmsEnvelope: FloatArray) -> Unit)? = null
private fun startStreamingSessionMp() {
if (sessionMpQueue != null) return
@ -3413,8 +3421,9 @@ class Qwen3TtsEngine(
val wavPath = "${context?.cacheDir?.absolutePath ?: "/data/local/tmp/kazeia"}/tts_seg_${segIdx}.wav"
saveWav(wavPath, audio)
val durationMs = audio.size * 1000L / SR
nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - tSynth}ms, ${durationMs}ms audio), queued for playback")
wavChan.send(SegmentReady(segIdx, wavPath, sentence, durationMs))
val envelope = computeRmsEnvelope(audio)
nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - tSynth}ms, ${durationMs}ms audio, ${envelope.size} env windows), queued for playback")
wavChan.send(SegmentReady(segIdx, wavPath, sentence, durationMs, envelope))
} catch (e: Exception) {
nlog("MP synth error: ${e.message}")
}
@ -3475,7 +3484,7 @@ class Qwen3TtsEngine(
current = prepareMp(first.wavPath, first.segIdx)
current!!.setOnCompletionListener { it.release() }
current!!.start()
try { onSegmentPlaying?.invoke(first.sentence, first.durationMs) } catch (_: Exception) {}
try { onSegmentPlaying?.invoke(first.sentence, first.durationMs, first.rmsEnvelope) } catch (_: Exception) {}
nlog("MP seg ${first.segIdx} started (chained, ${first.durationMs}ms)")
while (true) {
@ -3497,7 +3506,7 @@ class Qwen3TtsEngine(
// `next` player was chained via setNextMediaPlayer and has
// auto-started at this point; notify the UI so it can start
// revealing the sentence in sync with the audio.
try { onSegmentPlaying?.invoke(currentInfo!!.sentence, currentInfo!!.durationMs) } catch (_: Exception) {}
try { onSegmentPlaying?.invoke(currentInfo!!.sentence, currentInfo!!.durationMs, currentInfo!!.rmsEnvelope) } catch (_: Exception) {}
next = null
nextInfo = null
}
@ -3517,14 +3526,44 @@ class Qwen3TtsEngine(
/** Payload handed from the synth worker to the playback worker so
* the UI can be notified with matching text + duration when each
* segment starts playing. */
* segment starts playing. The [rmsEnvelope] is an optional sidecar
* array of per-ENVELOPE_WINDOW_MS RMS values normalized to [0, 1]
* that drives the audio-reactive orb visualizer without having to
* read PCM back from MediaPlayer. */
private data class SegmentReady(
val segIdx: Int,
val wavPath: String,
val sentence: String,
val durationMs: Long
val durationMs: Long,
val rmsEnvelope: FloatArray
)
/**
 * Builds the RMS envelope sidecar for one synthesized segment.
 *
 * The mono 16-bit PCM buffer (sampled at [SR]) is cut into consecutive
 * windows of [ENVELOPE_WINDOW_MS] milliseconds; the last window may be
 * shorter. Each window yields one value in [0, 1]: its RMS divided by
 * the 16-bit full-scale value, clamped, then passed through a square
 * root so quiet speech still produces visible motion.
 *
 * @param audio PCM samples of the segment.
 * @return one float per window, or an empty array when [audio] is empty.
 */
private fun computeRmsEnvelope(audio: ShortArray): FloatArray {
    if (audio.isEmpty()) return FloatArray(0)
    val samplesPerWindow = SR * ENVELOPE_WINDOW_MS / 1000
    // Ceiling division so a trailing partial window still gets a slot.
    val windowCount = (audio.size + samplesPerWindow - 1) / samplesPerWindow
    return FloatArray(windowCount) { windowIdx ->
        val from = windowIdx * samplesPerWindow
        val to = minOf(from + samplesPerWindow, audio.size)
        var energy = 0.0
        for (i in from until to) {
            val sample = audio[i].toDouble()
            energy += sample * sample
        }
        val rms = kotlin.math.sqrt(energy / (to - from))
        // 32767 is full-scale; the outer sqrt squashes the upper range.
        kotlin.math.sqrt((rms / 32767.0).coerceIn(0.0, 1.0)).toFloat()
    }
}
private suspend fun waitForPlaybackCompletion(
mp: android.media.MediaPlayer, segIdx: Int
) {

View File

@ -0,0 +1,266 @@
package com.kazeia.ui
import android.content.Context
import android.graphics.Canvas
import android.graphics.Color
import android.graphics.Paint
import android.graphics.RadialGradient
import android.graphics.Shader
import android.util.AttributeSet
import android.view.Choreographer
import android.view.View
import kotlin.math.cos
import kotlin.math.max
import kotlin.math.min
import kotlin.math.sin
/**
 * Minimalist ("épuré") audio-reactive orb visualizer for the TTS + STT
 * feedback loop.
 *
 * Three states driven by [setIdle], [setListening], [startSpeaking]:
 *
 * - **Idle**: orb with a slow breathing pulsation (~4 s cycle) and a
 *   faint halo.
 *
 * - **Listening**: a breathing baseline plus a contribution proportional
 *   to the live mic level passed into [setListening], so the orb and its
 *   halo grow while the user talks. No ripples are emitted in this state.
 *
 * - **Speaking**: amplitude and halo follow the pre-computed envelope
 *   passed into [startSpeaking]. The view walks the envelope with its own
 *   timer, scaled so the whole envelope spans `durationMs`; it never
 *   queries a media player position. A ripple is emitted when the view
 *   enters an envelope window whose value is a local maximum above a
 *   fixed floor.
 *
 * All animation runs on [Choreographer.FrameCallback]. At Idle the
 * callback re-posts itself with a 50 ms delay (~20 fps); during Listening
 * and Speaking it re-posts for every display frame.
 *
 * All timing uses the monotonic [System.nanoTime] clock, so wall-clock
 * adjustments cannot stretch or skip the animation.
 *
 * NOTE(review): the public setters touch the Choreographer and
 * non-synchronized animation fields; they are presumably meant to be
 * called on the UI thread only — confirm against callers.
 */
class AudioVisualizerView @JvmOverloads constructor(
    context: Context,
    attrs: AttributeSet? = null,
    defStyleAttr: Int = 0
) : View(context, attrs, defStyleAttr), Choreographer.FrameCallback {

    // --- Configuration ---

    // Soft lavender core, deeper violet halo, ripple colour in between;
    // all in the same hue family so state transitions stay continuous.
    private val coreColor = Color.parseColor("#BCA4E8")
    private val haloColor = Color.parseColor("#8B6EC9")
    private val rippleColor = Color.parseColor("#A48FDD")

    // Gain applied to the Speaking envelope before clamping to [0, 1].
    private val amplitudeGain = 1.8f

    // --- State machine ---

    private sealed class State {
        object Idle : State()
        class Listening(val micRms: Float) : State()
        class Speaking(
            val envelope: FloatArray,
            val durationMs: Long,
            val startedAtMs: Long
        ) : State()
    }

    @Volatile private var state: State = State.Idle

    // --- Animation state (mutated from doFrame / onDraw) ---

    private var breathStartMs = 0L      // origin of the breathing cycle
    private var smoothedAmp = 0f        // exponentially smoothed amplitude
    private val ripples = ArrayList<Ripple>()
    private var lastEnvelopeIdx = -1    // last envelope window checked for a peak
    private var frameScheduled = false

    // Paints are allocated once; shader/colour/stroke are updated per frame.
    private val corePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
    private val haloPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
    private val ripplePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.STROKE
        strokeWidth = 4f
    }

    // Gradient inputs that never change are built once instead of per frame.
    private val gradientStops = floatArrayOf(0f, 1f)
    private val coreGradientColors = intArrayOf(
        withAlpha(coreColor, 255),
        withAlpha(haloColor, 180)
    )

    init {
        setLayerType(LAYER_TYPE_HARDWARE, null)
    }

    // ==================== Public API ====================

    /** Switches to the breathing baseline. */
    fun setIdle() {
        state = State.Idle
        scheduleFrame()
    }

    /**
     * Switches to (or stays in) Listening with the given mic level.
     *
     * @param micRms normalised mic level; clamped to [0, 1].
     */
    fun setListening(micRms: Float) {
        state = State.Listening(micRms.coerceIn(0f, 1f))
        scheduleFrame()
    }

    /**
     * Starts rendering one TTS segment from the beginning of [envelope].
     *
     * @param envelope amplitude values spread evenly over the segment.
     * @param durationMs audio duration the envelope spans.
     * Falls back to [setIdle] when the envelope is empty or the duration
     * is not positive.
     */
    fun startSpeaking(envelope: FloatArray, durationMs: Long) {
        if (envelope.isEmpty() || durationMs <= 0) {
            setIdle()
            return
        }
        state = State.Speaking(envelope, durationMs, nowMs())
        lastEnvelopeIdx = -1
        scheduleFrame()
    }

    // ==================== View lifecycle ====================

    override fun onAttachedToWindow() {
        super.onAttachedToWindow()
        breathStartMs = nowMs()
        scheduleFrame()
    }

    override fun onDetachedFromWindow() {
        super.onDetachedFromWindow()
        Choreographer.getInstance().removeFrameCallback(this)
        // The pending callback was just cancelled; clear the flag so
        // scheduleFrame() can post again if the view is re-attached.
        frameScheduled = false
    }

    private fun scheduleFrame() {
        if (!frameScheduled && isAttachedToWindow) {
            frameScheduled = true
            Choreographer.getInstance().postFrameCallback(this)
        }
    }

    override fun doFrame(frameTimeNanos: Long) {
        frameScheduled = false
        if (!isAttachedToWindow) return

        var current = state
        if (current is State.Speaking &&
            nowMs() - current.startedAtMs >= current.durationMs + SPEAKING_GRACE_MS
        ) {
            // Auto-fallback to Idle if nobody switched state explicitly.
            // The grace period lets the smoothed amplitude settle first.
            current = State.Idle
            state = current
        }

        val choreographer = Choreographer.getInstance()
        if (current is State.Idle) {
            // Self-throttled loop at ~20 fps for the breathing pulse.
            choreographer.postFrameCallbackDelayed(this, IDLE_FRAME_DELAY_MS)
        } else {
            choreographer.postFrameCallback(this)
        }
        frameScheduled = true
        invalidate()
    }

    // ==================== Drawing ====================

    override fun onDraw(canvas: Canvas) {
        super.onDraw(canvas)
        val w = width.toFloat()
        val h = height.toFloat()
        if (w <= 0f || h <= 0f) return
        val cx = w / 2f
        val cy = h / 2f
        val maxR = min(w, h) * 0.42f
        val now = nowMs()

        // Exponential smoothing so frame-to-frame changes feel organic.
        smoothedAmp += (targetAmplitude(state, now) - smoothedAmp) * SMOOTHING

        drawHalo(canvas, cx, cy, maxR)
        drawRipples(canvas, cx, cy, maxR, now)
        drawCore(canvas, cx, cy, maxR)
    }

    /** Target amplitude in [0, 1] for state [s] at monotonic time [now]. */
    private fun targetAmplitude(s: State, now: Long): Float = when (s) {
        // Breathing only: 0.06 .. 0.12.
        is State.Idle -> 0.06f + 0.06f * breathPhase(now)
        // Shallower breathing (0.08 .. 0.12) plus the live mic contribution.
        is State.Listening -> 0.08f + 0.04f * breathPhase(now) + 0.55f * s.micRms
        is State.Speaking -> speakingAmplitude(s, now)
    }

    /** Position in the 4 s breathing cycle, mapped through a sine to [0, 1]. */
    private fun breathPhase(now: Long): Float {
        val t = (now - breathStartMs) % BREATH_PERIOD_MS / BREATH_PERIOD_MS.toFloat()
        return 0.5f + 0.5f * sin((t * 2f * Math.PI).toFloat())
    }

    /**
     * Linearly interpolated, gain-adjusted envelope value at [now]; also
     * emits a ripple the first time a peak window is entered.
     */
    private fun speakingAmplitude(s: State.Speaking, now: Long): Float {
        val env = s.envelope
        val lastIdx = env.size - 1
        val pos = (now - s.startedAtMs).toFloat() * env.size / s.durationMs.toFloat()
        val idx = pos.toInt().coerceIn(0, lastIdx)
        val frac = (pos - idx).coerceIn(0f, 1f)
        val a = env[idx]
        val b = env[min(idx + 1, lastIdx)]

        if (idx != lastEnvelopeIdx) {
            lastEnvelopeIdx = idx
            // Test the window's own sample, not the interpolated value:
            // the interpolated value drifts toward the next sample, so a
            // real peak could be missed depending on where the frame lands.
            val prev = if (idx > 0) env[idx - 1] else 0f
            val next = if (idx < lastIdx) env[idx + 1] else 0f
            if (a > RIPPLE_FLOOR && a >= prev && a >= next) {
                ripples.add(Ripple(bornAtMs = now, peak = a))
            }
        }
        return ((a + (b - a) * frac) * amplitudeGain).coerceIn(0f, 1f)
    }

    /** Radial halo whose radius and opacity grow with the amplitude. */
    private fun drawHalo(canvas: Canvas, cx: Float, cy: Float, maxR: Float) {
        val haloR = maxR * (0.85f + 0.35f * smoothedAmp)
        val haloAlpha = (80 + 100 * smoothedAmp).toInt().coerceIn(0, 200)
        haloPaint.shader = RadialGradient(
            cx, cy, haloR,
            intArrayOf(withAlpha(haloColor, haloAlpha), withAlpha(haloColor, 0)),
            gradientStops,
            Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, haloR, haloPaint)
    }

    /** Draws live ripples and drops those past their 900 ms lifetime. */
    private fun drawRipples(canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long) {
        if (ripples.isEmpty()) return
        val iter = ripples.iterator()
        while (iter.hasNext()) {
            val ripple = iter.next()
            val age = (now - ripple.bornAtMs) / RIPPLE_LIFETIME_MS
            if (age >= 1f) {
                iter.remove()
                continue
            }
            val alpha = ((1f - age) * 140f * ripple.peak).toInt().coerceIn(0, 200)
            ripplePaint.color = withAlpha(rippleColor, alpha)
            ripplePaint.strokeWidth = max(1.5f, (1f - age) * 5f)
            canvas.drawCircle(cx, cy, maxR * (0.55f + 0.6f * age), ripplePaint)
        }
    }

    /** Core orb: opaque lavender centre fading to translucent violet. */
    private fun drawCore(canvas: Canvas, cx: Float, cy: Float, maxR: Float) {
        val coreR = maxR * (0.45f + 0.25f * smoothedAmp)
        corePaint.shader = RadialGradient(
            cx, cy, coreR,
            coreGradientColors,
            gradientStops,
            Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, coreR, corePaint)
    }

    private class Ripple(val bornAtMs: Long, val peak: Float)

    private companion object {
        const val BREATH_PERIOD_MS = 4000L
        const val IDLE_FRAME_DELAY_MS = 50L
        const val SPEAKING_GRACE_MS = 300L
        const val RIPPLE_LIFETIME_MS = 900f
        const val RIPPLE_FLOOR = 0.35f
        const val SMOOTHING = 0.25f

        /** Monotonic milliseconds; only differences are meaningful. */
        fun nowMs(): Long = System.nanoTime() / 1_000_000

        /** Returns [color] with its alpha channel replaced by [alpha] (0..255). */
        fun withAlpha(color: Int, alpha: Int): Int =
            (color and 0x00FFFFFF) or (alpha shl 24)
    }
}

View File

@ -326,6 +326,33 @@ class ChatActivity : AppCompatActivity() {
setDebugPanelVisible(debug)
}
}
launch {
    // Mirror the service-side visualizer signal onto the orb. The
    // service decides between idle / mic tracking / TTS segment; the
    // view only renders. The envelope array of the segment currently
    // being rendered is remembered so that a re-emitted Speaking
    // signal carrying the same array instance does not restart the
    // view's animation timer.
    var lastSpeakingEnv: FloatArray? = null
    service.visualizerSignal.collect { sig ->
        lastSpeakingEnv = when (sig) {
            is com.kazeia.service.KazeiaService.VisualizerSignal.Idle -> {
                binding.audioViz.setIdle()
                null
            }
            is com.kazeia.service.KazeiaService.VisualizerSignal.Listening -> {
                binding.audioViz.setListening(sig.micRms)
                null
            }
            is com.kazeia.service.KazeiaService.VisualizerSignal.Speaking -> {
                val incoming = sig.rmsEnvelope
                // Edge-trigger on array identity, not content.
                if (incoming !== lastSpeakingEnv) {
                    binding.audioViz.startSpeaking(incoming, sig.durationMs)
                }
                incoming
            }
        }
    }
}
}
}
}

View File

@ -100,6 +100,18 @@
</LinearLayout>
<!-- Audio-reactive orb visualizer: Kazeia's visual presence.
Shows a breathing baseline at Idle, grows with mic RMS while
Listening, and reacts to the TTS envelope while Speaking. -->
<com.kazeia.ui.AudioVisualizerView
android:id="@+id/audioViz"
android:layout_width="0dp"
android:layout_height="140dp"
android:background="@color/kazeia_background"
app:layout_constraintTop_toBottomOf="@id/voiceBar"
app:layout_constraintStart_toStartOf="parent"
app:layout_constraintEnd_toEndOf="parent" />
<!-- Chat messages -->
<androidx.recyclerview.widget.RecyclerView
android:id="@+id/rvMessages"
@ -107,7 +119,7 @@
android:layout_height="0dp"
android:clipToPadding="false"
android:padding="8dp"
app:layout_constraintTop_toBottomOf="@id/voiceBar"
app:layout_constraintTop_toBottomOf="@id/audioViz"
app:layout_constraintBottom_toTopOf="@id/inputBar"
app:layout_constraintStart_toStartOf="parent"
app:layout_constraintEnd_toEndOf="parent" />