From 809a6d4fedb94bfe1dafef2d48a0491ed1a2a202 Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Tue, 14 Apr 2026 10:39:50 +0200
Subject: [PATCH] =?UTF-8?q?LLM=20no-root:=20migrate=20to=20in-process=20Ll?=
 =?UTF-8?q?mModule=20(JNI)=20=E2=80=94=20zero=20su=20calls?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The root cause of the previous `su -c` requirement was that Qualcomm's FastRPC
kernel driver rejects processes spawned via ProcessBuilder fork+exec because
they lose supplementary GIDs on exec. Zygote-forked app processes retain the
proper init-configured credentials and are accepted by the adsprpcd service,
which is why ORT-QNN (Whisper, in-process) worked while the subprocess
qnn_llama_runner did not. Running the LLM in-process via ExecuTorch's
LlmModule bypasses the fork+exec path entirely.

What this commit does:
- ExecuTorchLlmEngine now uses org.pytorch.executorch.extension.llm.LlmModule
  with MODEL_TYPE_QNN_LLAMA=4 (routes to example::Runner in jni_layer_llama.cpp,
  the same C++ runner that qnn_llama_runner embeds).
- All su, ProcessBuilder, file-based prompt/response plumbing, and run_llm.sh
  are gone. ChatML template is built in Kotlin; tokens stream in via LlmCallback.

Supporting changes under executorch-patches/llm_in_process_jni.patch:
1. backends/qualcomm/CMakeLists.txt — gate PyQnnManagerAdaptor on NOT ANDROID.
   The original guard (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64) misfires in a
   nested scope during Android cross-compile and tries to build the host
   Python bindings.
2. extension/android/jni/jni_layer_llama.cpp — hardcode decoder_model="qwen3"
   (was "llama3") and temperature=0.7, and pass eval_mode=0
   (EvalMode::kKVCached) + shared_buffer=true to match our
   hybrid_llama_qnn.pte, which only contains kv_forward, not prefill_forward.

Build: scripts/build_android_library.sh arm64-v8a with QNN_SDK_ROOT pointing
to /opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225 and EXECUTORCH_BUILD_QNN=ON.
Produces libexecutorch_jni.so (192 MB) with QNN v2.42 backend + the llama
runner code, plus libqnn_executorch_backend.so. Both staged in jniLibs.

Validated on OnePlus Pad 3: LlmModule.load() completes in 4.2 s, no su
prompts, Pipeline ready with STT(WhisperHybridEngine) → [VoiceCommands →
LLM] → TTS(Qwen3TtsEngine). TTS .pte still loads with the upgraded v2.42
runtime — no regression.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 executorch-patches/llm_in_process_jni.patch   |  40 +++
 .../com/kazeia/llm/ExecuTorchLlmEngine.kt     | 251 +++++++-----------
 .../java/com/kazeia/service/KazeiaService.kt  |   2 +-
 3 files changed, 144 insertions(+), 149 deletions(-)
 create mode 100644 executorch-patches/llm_in_process_jni.patch
diff --git a/executorch-patches/llm_in_process_jni.patch b/executorch-patches/llm_in_process_jni.patch
new file mode 100644
index 0000000..a4a64a5
--- /dev/null
+++ b/executorch-patches/llm_in_process_jni.patch
@@ -0,0 +1,40 @@
+diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
+index e93731e..4951e1d 100644
+--- a/backends/qualcomm/CMakeLists.txt
++++ b/backends/qualcomm/CMakeLists.txt
+@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
+   )
+ endif()
+ 
+-# QNN pybind
+-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
++# QNN pybind — host Python bindings, not for Android cross-compile
++if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
+   add_subdirectory(
+     ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
+     ${CMAKE_CURRENT_BINARY_DIR}/pybind11
+diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
+index 45f2414..e1c2a8f 100644
+--- a/extension/android/jni/jni_layer_llama.cpp
++++ b/extension/android/jni/jni_layer_llama.cpp
+@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
+           model_path->toStdString().c_str(),
+           data_files_vector,
+           executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
+-      std::string decoder_model = "llama3"; // use llama3 for now
++      std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
+       runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
+           std::move(module),
+           decoder_model.c_str(),
+           model_path->toStdString().c_str(),
+           tokenizer_path->toStdString().c_str(),
+-          "",
+-          "");
++          /* performance_output_path */ "",
++          /* dump_logits_path */ "",
++          /* temperature */ 0.7f,
++          /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward)
++          /* shared_buffer */ true);
+       model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
+ #endif
+ #if defined(EXECUTORCH_BUILD_MEDIATEK)
diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
index 85df1d0..18a207a 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@@ -1,43 +1,49 @@
 package com.kazeia.llm
 
+import android.content.Context
 import android.util.Log
 import com.kazeia.core.*
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.withContext
 import java.io.File
+import org.pytorch.executorch.extension.llm.LlmCallback
+import org.pytorch.executorch.extension.llm.LlmModule
 
 /**
- * LLM Engine using ExecuTorch + QNN backend via subprocess.
- * Calls qnn_llama_runner binary with root access (Magisk su).
+ * LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
+ *
+ * Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
+ * wraps the same C++ `example::Runner` as the standalone qnn_llama_runner binary
+ * but inside the app's own process. The QNN HTP backend works because the
+ * DSP fastrpc service accepts the Zygote-forked app process (unlike
+ * ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
+ * and get rejected by the fastrpc credential checks).
+ *
+ * Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
+ * on this device's permissive SELinux policy). libexecutorch_jni.so + QNN libs
+ * are bundled in jniLibs.
  *
  * Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
  * (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
- *
- * Why root: the runner binary plus its QNN v2.42 .so deps live in
- * /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
- * apps can't exec binaries from there. The Hexagon DSP fastrpc service also
- * refuses to load the v2.42 Skel from the app's own files dir — only from
- * nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
- * (same filename, different version, can't coexist). Rebuilding everything
- * against one QNN version would eliminate the conflict, but would require
- * re-exporting the TTS .pte with the new runtime (tooling currently broken
- * on the flatc schema/dataclass mismatch in the qnn_venv).
  */
 class ExecuTorchLlmEngine(
+    private val context: Context,
     private val onLog: ((String) -> Unit)? = null
 ) : LlmEngine {
 
     companion object {
         private const val TAG = "ExecuTorchLLM"
-        private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
         // /no_think disables Qwen3's chain-of-thought block so the full token
-        // budget goes to the actual answer (without it, 120-200 tokens get
-        // consumed by <think>…</think> leaving nothing to speak).
-        // Short-response directive keeps TTS latency manageable — each sentence
-        // costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
+        // budget goes to the actual answer. Short-response directive keeps
+        // TTS latency manageable.
         private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
+
+        private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
+        private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
+        private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
     }
 
+    private var llmModule: LlmModule? = null
     private var modelName = ""
     private var loaded = false
 
@@ -48,77 +54,88 @@ class ExecuTorchLlmEngine(
 
     override suspend fun load(modelPath: String, config: LlmConfig) {
         withContext(Dispatchers.IO) {
-            val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
-            if (check.contains("No such file")) {
-                nlog("ERROR: runner or model not found in $RUNNER_DIR")
+            if (!File(MODEL_PATH).exists()) {
+                nlog("ERROR: model not found at $MODEL_PATH")
+                return@withContext
+            }
+            if (!File(TOKENIZER_PATH).exists()) {
+                nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
                 return@withContext
             }
 
-            deployRunnerScript()
+            try {
+                val t0 = System.currentTimeMillis()
+                // MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
+                // jni_layer_llama.cpp, which uses example::Runner (same code
+                // as the qnn_llama_runner binary) instead of the generic
+                // TextLLMRunner. Our .pte was exported with
+                // --decoder_model qwen3-4b which requires this path.
+                val MODEL_TYPE_QNN_LLAMA = 4
+                llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
+                nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")
 
-            writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
-                android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
-            if (SYSTEM_PROMPT.isNotEmpty()) {
-                writeFileRoot("$RUNNER_DIR/outputs/system.b64",
-                    android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
-            } else {
-                execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
-            }
-            val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
+                // Load the PTE into QNN HTP (calls the native load()).
+                val loadResult = llmModule!!.load()
+                if (loadResult != 0) {
+                    nlog("ERROR: LlmModule.load() returned $loadResult")
+                    llmModule = null
+                    return@withContext
+                }
+                nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")
 
-            if (test.contains("Generated Tokens") || test.contains("Rate:")) {
                 loaded = true
-                val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
-                val rate = rateMatch?.groupValues?.get(1) ?: "?"
-                modelName = "Qwen3 (${rate} tok/s NPU)"
+                modelName = "Qwen3-4B LlmModule"
                 nlog("Ready: $modelName")
-            } else {
-                nlog("ERROR: test failed: ${test.takeLast(200)}")
+            } catch (e: Throwable) {
+                nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
+                llmModule = null
             }
         }
     }
 
-    override fun isLoaded(): Boolean = loaded
+    override fun isLoaded(): Boolean = loaded && llmModule != null
 
     override suspend fun generate(
         prompt: String,
         params: SamplingParams,
         onToken: ((String) -> Boolean)?
     ): GenerationResult = withContext(Dispatchers.IO) {
-        if (!loaded) throw IllegalStateException("Model not loaded")
+        val mod = llmModule ?: throw IllegalStateException("Model not loaded")
 
         val startTime = System.currentTimeMillis()
-
-        writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
-            android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
-        if (SYSTEM_PROMPT.isNotEmpty()) {
-            writeFileRoot("$RUNNER_DIR/outputs/system.b64",
-                android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
-        } else {
-            execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
-        }
-
+        val fullPrompt = buildChatTemplate(prompt)
         nlog("Prompt: '${prompt.take(80)}'")
 
+        val responseBuilder = StringBuilder()
+        var firstTokenMs = -1L
+
+        val cb = object : LlmCallback {
+            override fun onResult(result: String) {
+                if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
+                responseBuilder.append(result)
+                onToken?.invoke(result)
+            }
+            override fun onStats(stats: String) {
+                nlog("stats: ${stats.take(200)}")
+            }
+        }
+
         val seqLen = minOf(params.maxNewTokens, 512)
-        val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
-
-        val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
-            ?.groupValues?.get(1)?.toIntOrNull() ?: 0
-        val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
-        val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
-
-        val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
-        nlog("RAW: ${responseRaw.take(300)}")
-        val responseText = extractResponse(responseRaw)
+        val rc = try {
+            mod.generate(fullPrompt, seqLen, cb)
+        } catch (e: Throwable) {
+            nlog("generate() threw: ${e.message}")
+            -1
+        }
 
         val elapsed = System.currentTimeMillis() - startTime
-        nlog("Response: '$responseText'")
-        nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
+        val rawText = responseBuilder.toString()
+        val responseText = cleanResponse(rawText)
+        val tokenCount = rawText.length / 4  // rough estimate without a tokenizer
+        val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f
 
-        onToken?.invoke(responseText)
+        nlog("Response: '${responseText.take(80)}'")
+        nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")
 
         GenerationResult(
             text = responseText,
@@ -128,20 +145,31 @@ class ExecuTorchLlmEngine(
         )
     }
 
-    private fun extractResponse(raw: String): String {
+    /**
+     * Wrap user input in Qwen3's ChatML template so the instruct model
+     * actually follows the system directive instead of echoing the prompt.
+     * Terminating with `<|im_start|>assistant\n` signals the model to begin
+     * its reply; no trailing tokens.
+     */
+    private fun buildChatTemplate(userInput: String): String {
+        val sb = StringBuilder()
+        if (SYSTEM_PROMPT.isNotEmpty()) {
+            sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
+        }
+        sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
+        sb.append("<|im_start|>assistant\n")
+        return sb.toString()
+    }
+
+    /** Drop text through </think> (returning "" if <think> never closes), then strip special tokens and trim. */
+    private fun cleanResponse(raw: String): String {
         var text = raw
         val thinkEnd = text.indexOf("</think>")
         if (thinkEnd >= 0) {
             text = text.substring(thinkEnd + "</think>".length)
-        } else {
-            val thinkStart = text.indexOf("<think>")
-            val assistantTag = text.indexOf("assistant")
-            if (thinkStart >= 0) {
-                nlog("WARN: <think> block never closed, no response generated")
-                return ""
-            } else if (assistantTag >= 0) {
-                text = text.substring(assistantTag + "assistant".length)
-            }
+        } else if (text.indexOf("<think>") >= 0) {
+            nlog("WARN: <think> block never closed")
+            return ""
         }
         return text
             .replace("<|im_start|>", "")
@@ -152,82 +180,9 @@ class ExecuTorchLlmEngine(
             .trim()
     }
 
-    private fun deployRunnerScript() {
-        val script = """
-#!/bin/sh
-cd $RUNNER_DIR
-export LD_LIBRARY_PATH=$RUNNER_DIR
-export ADSP_LIBRARY_PATH=$RUNNER_DIR
-
-TEMP=${'$'}1
-SEQ_LEN=${'$'}2
-
-PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
-
-rm -f $RUNNER_DIR/outputs/response.txt
-
-SYSTEM_ARGS=""
-if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
-  SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
-  SYSTEM_ARGS="--system_prompt"
-fi
-
-if [ -n "${'$'}SYSTEM_ARGS" ]; then
-  exec ./qnn_llama_runner \
-    --model_path hybrid_llama_qnn.pte \
-    --tokenizer_path tokenizer.json \
-    --decoder_model_version qwen3 \
-    --output_path $RUNNER_DIR/outputs/response.txt \
-    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
-    --shared_buffer \
-    --system_prompt "${'$'}SYSTEM" \
-    --prompt "${'$'}PROMPT" \
-    --temperature ${'$'}TEMP \
-    --seq_len ${'$'}SEQ_LEN \
-    --eval_mode 0
-else
-  exec ./qnn_llama_runner \
-    --model_path hybrid_llama_qnn.pte \
-    --tokenizer_path tokenizer.json \
-    --decoder_model_version qwen3 \
-    --output_path $RUNNER_DIR/outputs/response.txt \
-    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
-    --shared_buffer \
-    --prompt "${'$'}PROMPT" \
-    --temperature ${'$'}TEMP \
-    --seq_len ${'$'}SEQ_LEN \
-    --eval_mode 0
-fi
-""".trimIndent()
-
-        writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
-        execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
-    }
-
     override fun release() {
+        try { llmModule?.resetNative() } catch (_: Throwable) {}
+        llmModule = null
         loaded = false
     }
-
-    private fun writeFileRoot(path: String, content: String) {
-        try {
-            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path"))
-            process.outputStream.bufferedWriter().use { it.write(content) }
-            process.waitFor()
-        } catch (e: Exception) {
-            Log.e(TAG, "writeFileRoot failed: ${e.message}")
-        }
-    }
-
-    private fun execRoot(cmd: String): String {
-        return try {
-            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
-            val result = process.inputStream.bufferedReader().readText()
-            val error = process.errorStream.bufferedReader().readText()
-            process.waitFor()
-            if (error.isNotEmpty() && result.isEmpty()) error else result
-        } catch (e: Exception) {
-            Log.e(TAG, "execRoot failed: ${e.message}")
-            ""
-        }
-    }
 }
diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
index 82eaa24..401ebc1 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
@@ -518,7 +518,7 @@ class KazeiaService : Service() {
 
                 // LLM: Qwen3-4B on Hexagon V79 via qnn_llama_runner.
                 _loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
-                llm = ExecuTorchLlmEngine { msg -> log(msg) }
+                llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
                 try {
                     llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
                 } catch (e: Exception) {