From f32b5ddfdd43c594a6cd8cf9b5fad7d8445a0fb1 Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Tue, 14 Apr 2026 11:11:23 +0200
Subject: [PATCH] LLM no-root: validate end-to-end pipeline, fix
 kv_io_bit_width detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end validation on OnePlus Pad 3 with stream_llm intent:
  Prompt:   'Bonjour, comment vas-tu ?'
  Response: 'Bonjour ! Je suis là pour t'écouter. Comment vas-tu aujourd'hui ?'
  TTS:      Talker(PTE) 37ms/step, CP(PTE) 73ms/step, audio synthesized.
  No su, no Magisk prompts.

Two fixes since the previous commit:
1. ExecuTorchLlmEngine: pass echo=false to LlmModule.generate() — by default
   the runner echoes the prompt tokens back via the callback, which fed the
   ChatML wrap (<|im_start|>user …) into the SentenceStreamer and TTS.
2. jni_layer_llama.cpp: pick Runner<uint8_t> vs Runner<uint16_t> based on the
   model's get_kv_io_bit_width metadata, mirroring qnn_llama_runner.cpp main().
   The hard-coded uint16_t was wrong for our Qwen3-4B export (which uses 8-bit
   KV I/O) and produced fluent-looking but completely random tokens
   ("blocked罩ug darkestSOLEQuotes作者本人 …") — same symptom whether greedy or
   sampled, the smoking gun for a width-mismatched KV cache reinterpretation.

Other tweaks:
- temperature=0.0 in the QNN_LLAMA branch of jni_layer_llama.cpp (greedy,
  matches the working qnn_llama_runner --temperature 0 invocation)
- shared_buffer=true (same as binary defaults)
- Kotlin chat template mirrors qnn_llama_runner.cpp's get_formatted_prompt for
  Qwen3 (user-first, then optional system, then "<|im_start|>assistant" with
  no trailing newline — that quirky ordering is what the .pte was trained on)

TTFT is ~4 s for a 77-token prompt in kv-only mode (sequential prefill, one
forward per token). To get a sub-second TTFT we'd need to re-export the model
in --model_mode hybrid, which adds a parallel prefill_forward graph; not
required for the conversational use case.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 executorch-patches/llm_in_process_jni.patch   | 49 ++++++++++++++-----
 .../com/kazeia/llm/ExecuTorchLlmEngine.kt     | 18 ++++---
 2 files changed, 47 insertions(+), 20 deletions(-)
diff --git a/executorch-patches/llm_in_process_jni.patch b/executorch-patches/llm_in_process_jni.patch
index a4a64a5..357c7cd 100644
--- a/executorch-patches/llm_in_process_jni.patch
+++ b/executorch-patches/llm_in_process_jni.patch
@@ -14,27 +14,50 @@ index e93731e..4951e1d 100644
      ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
      ${CMAKE_CURRENT_BINARY_DIR}/pybind11
 diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
-index 45f2414..e1c2a8f 100644
+index 45f2414..7c4e1aa 100644
 --- a/extension/android/jni/jni_layer_llama.cpp
 +++ b/extension/android/jni/jni_layer_llama.cpp
-@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
+@@ -171,14 +171,35 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
            model_path->toStdString().c_str(),
            data_files_vector,
            executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
 -      std::string decoder_model = "llama3"; // use llama3 for now
-+      std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
-       runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
-           std::move(module),
-           decoder_model.c_str(),
-           model_path->toStdString().c_str(),
-           tokenizer_path->toStdString().c_str(),
+-      runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
+-          std::move(module),
+-          decoder_model.c_str(),
+-          model_path->toStdString().c_str(),
+-          tokenizer_path->toStdString().c_str(),
 -          "",
 -          "");
-+          /* performance_output_path */ "",
-+          /* dump_logits_path */ "",
-+          /* temperature */ 0.7f,
-+          /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward)
-+          /* shared_buffer */ true);
++      std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
++
++      // Mirror qnn_llama_runner.cpp main(): pick the Runner<T> template based
++      // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models
++      // were introduced after the 8-bit ones, and using the wrong T treats
++      // KV-cache bytes as the wrong width → garbage logits → gibberish output.
++      example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8;
++      if (module->method_names()->count("get_kv_io_bit_width") > 0) {
++        kv_bitwidth = static_cast<example::KvBitWidth>(
++            module->get("get_kv_io_bit_width").get().toScalar().to<int64_t>());
++      }
++      auto make_runner = [&](auto sample) -> std::unique_ptr<llm::IRunner> {
++        using T = decltype(sample);
++        return std::make_unique<example::Runner<T>>(
++            std::move(module),
++            decoder_model.c_str(),
++            model_path->toStdString().c_str(),
++            tokenizer_path->toStdString().c_str(),
++            /* performance_output_path */ "",
++            /* dump_logits_path */ "",
++            /* temperature */ 0.0f, // greedy
++            /* eval_mode */ 0, // EvalMode::kKVCached
++            /* shared_buffer */ true);
++      };
++      if (kv_bitwidth == example::KvBitWidth::kWidth16) {
++        runner_ = make_runner(uint16_t{0});
++      } else {
++        runner_ = make_runner(uint8_t{0});
++      }
        model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
  #endif
  #if defined(EXECUTORCH_BUILD_MEDIATEK)
diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
index 18a207a..d0910cc 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@@ -122,7 +122,10 @@ class ExecuTorchLlmEngine(
 
         val seqLen = minOf(params.maxNewTokens, 512)
         val rc = try {
-            mod.generate(fullPrompt, seqLen, cb)
+            // echo=false so onResult() only receives the generated completion,
+            // not the prompt tokens echoed back — otherwise the sentence
+            // streamer would feed '<|im_start|>user …' to the TTS.
+            mod.generate(fullPrompt, seqLen, cb, /* echo */ false)
         } catch (e: Throwable) {
             nlog("generate() threw: ${e.message}")
             -1
@@ -146,18 +149,19 @@ class ExecuTorchLlmEngine(
     }
 
     /**
-     * Wrap user input in Qwen3's ChatML template so the instruct model
-     * actually follows the system directive instead of echoing the prompt.
-     * Terminating with `<|im_start|>assistant\n` signals the model to begin
-     * its reply; no trailing tokens.
+     * Qwen3 chat template matching qnn_llama_runner.cpp's get_formatted_prompt()
+     * for DecoderModelVersion::kQwen3. Note the user-first-then-system ordering
+     * (quirky but required — the runner binary produces the same layout and our
+     * .pte was trained with it). Terminates with `<|im_start|>assistant` with
+     * no trailing newline, matching the binary exactly.
      */
     private fun buildChatTemplate(userInput: String): String {
         val sb = StringBuilder()
+        sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
         if (SYSTEM_PROMPT.isNotEmpty()) {
             sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
         }
-        sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
-        sb.append("<|im_start|>assistant\n")
+        sb.append("<|im_start|>assistant")
         return sb.toString()
     }