diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
index 1522f50..6730c42 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
@@ -79,6 +79,7 @@ class KazeiaService : Service() {
     val aiWorkload: StateFlow<AiWorkload> = _aiWorkload
 
     private val serviceScope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
+    private var currentPipelineJob: kotlinx.coroutines.Job? = null
     private val _isListening = MutableStateFlow(false)
     val isListening: StateFlow<Boolean> = _isListening
 
@@ -108,12 +109,18 @@ class KazeiaService : Service() {
         // Auto-pipeline trigger via: am startservice -n com.kazeia/.service.KazeiaService --ez run_pipeline true
         if (intent?.getBooleanExtra("run_pipeline", false) == true) {
             log("Auto-pipeline triggered via intent")
-            serviceScope.launch {
+            // Cancel any pipeline job still in flight so it cannot hold up this run
+            currentPipelineJob?.cancel()
+            currentPipelineJob = serviceScope.launch(Dispatchers.IO) {
                 // Wait for TTS to be loaded
                 while (!::tts.isInitialized || tts !is com.kazeia.tts.Qwen3TtsEngine) {
-                    kotlinx.coroutines.delay(1000)
+                    kotlinx.coroutines.delay(500)
                 }
-                processTextInput("pipeline")
+                val embedsPath = "/data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin"
+                val qwenTts = tts as? com.kazeia.tts.Qwen3TtsEngine ?: return@launch
+                val audio = qwenTts.generateFromEmbeds(embedsPath)
+                log("Pipeline done: ${audio.size} samples (${audio.size/24000f}s)")
+                // Audio is played by the TTS engine internally
             }
         }
         intent?.getStringExtra("full_pipeline")?.let { embedsPath ->
@@ -912,8 +919,10 @@ class KazeiaService : Service() {
     fun processTextInput(text: String) {
         log("processTextInput: '$text'")
         serviceScope.launch {
-            // Special test commands
+            // Special test commands — cancel previous pipeline first
             if (text.trim().lowercase().let { it.startsWith("pipeline") || it.startsWith("!pipeline") || it.startsWith("\\!pipeline") || it == "go" }) {
+                currentPipelineJob?.cancel()
+                currentPipelineJob = kotlin.coroutines.coroutineContext[kotlinx.coroutines.Job]
                 val embedsPath = "/data/local/tmp/kazeia/models/qwen3-tts-npu/full_pipeline_embeds.bin"
                 val wavPath = "/data/local/tmp/kazeia/tts_output.wav"
                 addMessage(ChatMessage(role = ChatMessage.Role.SYSTEM, text = "Running full pipeline..."))
diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
index edd4e0a..b378674 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@@ -265,11 +265,47 @@ class Qwen3TtsEngine(
                             nlog("Talker .pte JNI loaded+compiled: ${System.currentTimeMillis() - t0}ms, result=$lm")
                             if (lm != 0) { nlog("Talker .pte loadMethod failed"); talkerPteModule = null }
                             else {
-                                // Load rotary tables for talker .pte
                                 val path = "/data/local/tmp/kazeia/models"
                                 talkerPteRotaryCos = loadNpy("$path/talker_pte_rotary_cos.npy")
                                 talkerPteRotarySin = loadNpy("$path/talker_pte_rotary_sin.npy")
                                 nlog("Talker .pte rotary: ${talkerPteRotaryCos?.size} floats")
+
+                                // Warm up both models: the first forward() triggers QNN DSP compilation (~7s),
+                                // so pay that cost once at init rather than on the first pipeline run
+                                val tw = System.currentTimeMillis()
+                                try {
+                                    val dE = FloatArray(TALKER_DIM)
+                                    val dM = FloatArray(TALKER_PTE_KV_LEN) { -1e9f }; dM[TALKER_PTE_KV_LEN - 1] = 0f
+                                    val dC = FloatArray(TALKER_HEAD_DIM) { 1f }
+                                    val dS = FloatArray(TALKER_HEAD_DIM)
+                                    val tkvSz = TALKER_HEADS * TALKER_PTE_KV_LEN * TALKER_HEAD_DIM
+                                    val ins = mutableListOf(
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(dE, longArrayOf(1,1,TALKER_DIM.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(dM, longArrayOf(1,1,1,TALKER_PTE_KV_LEN.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(dC, longArrayOf(1,1,TALKER_HEAD_DIM.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(dS, longArrayOf(1,1,TALKER_HEAD_DIM.toLong())))
+                                    )
+                                    for (i in 0 until TALKER_LAYERS * 2) ins.add(org.pytorch.executorch.EValue.from(
+                                        org.pytorch.executorch.Tensor.fromBlob(FloatArray(tkvSz), longArrayOf(1,TALKER_HEADS.toLong(),TALKER_PTE_KV_LEN.toLong(),TALKER_HEAD_DIM.toLong()))))
+                                    talkerPteModule!!.forward(*ins.toTypedArray())
+                                    nlog("Talker warmup: ${System.currentTimeMillis() - tw}ms")
+                                } catch (e: Exception) { nlog("Talker warmup failed: ${e.message}") }
+
+                                // CP warmup
+                                val cw = System.currentTimeMillis()
+                                try {
+                                    val ckvSz = CP_KV_HEADS * CP_KV_LEN * CP_HEAD_DIM
+                                    val cIns = mutableListOf(
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(FloatArray(TALKER_DIM), longArrayOf(1,1,TALKER_DIM.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(FloatArray(CP_KV_LEN){-1e9f}.also{it[CP_KV_LEN-1]=0f}, longArrayOf(1,1,1,CP_KV_LEN.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(FloatArray(CP_HEAD_DIM){1f}, longArrayOf(1,1,CP_HEAD_DIM.toLong()))),
+                                        org.pytorch.executorch.EValue.from(org.pytorch.executorch.Tensor.fromBlob(FloatArray(CP_HEAD_DIM), longArrayOf(1,1,CP_HEAD_DIM.toLong())))
+                                    )
+                                    for (i in 0 until CP_LAYERS * 2) cIns.add(org.pytorch.executorch.EValue.from(
+                                        org.pytorch.executorch.Tensor.fromBlob(FloatArray(ckvSz), longArrayOf(1,CP_KV_HEADS.toLong(),CP_KV_LEN.toLong(),CP_HEAD_DIM.toLong()))))
+                                    cpPteModule!!.forward(*cIns.toTypedArray())
+                                    nlog("CP warmup: ${System.currentTimeMillis() - cw}ms")
+                                } catch (e: Exception) { nlog("CP warmup failed: ${e.message}") }
                             }
                         } catch (e: Exception) {
                             nlog("Talker .pte JNI failed: ${e.message}")