From f32b5ddfdd43c594a6cd8cf9b5fad7d8445a0fb1 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Tue, 14 Apr 2026 11:11:23 +0200 Subject: [PATCH] LLM no-root: validate end-to-end pipeline, fix kv_io_bit_width detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end validation on OnePlus Pad 3 with stream_llm intent: Prompt: 'Bonjour, comment vas-tu ?' Response: 'Bonjour ! Je suis là pour t'écouter. Comment vas-tu aujourd'hui ?' TTS: Talker(PTE) 37ms/step, CP(PTE) 73ms/step, audio synthesized. No su, no Magisk prompts. Two fixes since the previous commit: 1. ExecuTorchLlmEngine: pass echo=false to LlmModule.generate() — by default the runner echoes the prompt tokens back via the callback, which fed the ChatML wrap (<|im_start|>user …) into the SentenceStreamer and TTS. 2. jni_layer_llama.cpp: pick Runner<uint16_t> vs Runner<uint8_t> based on the model's get_kv_io_bit_width metadata, mirroring qnn_llama_runner.cpp main(). The hard-coded uint16_t was wrong for our Qwen3-4B export (which uses 8-bit KV I/O) and produced fluent-looking but completely random tokens ("blocked罩ug darkestSOLEQuotes作者本人 …") — same symptom whether greedy or sampled, the smoking gun for a width-mismatched KV cache reinterpretation. Other tweaks: - temperature=0.0 in the QNN_LLAMA branch of jni_layer_llama.cpp (greedy, matches the working qnn_llama_runner --temperature 0 invocation) - shared_buffer=true (same as binary defaults) - Kotlin chat template mirrors qnn_llama_runner.cpp's get_formatted_prompt for Qwen3 (user-first, then optional system, then "<|im_start|>assistant" with no trailing newline — that quirky ordering is what the .pte was trained on) TTFT is ~4 s for a 77-token prompt in kv-only mode (sequential prefill, one forward per token). To get a sub-second TTFT we'd need to re-export the model in --model_mode hybrid which adds a parallel prefill_forward graph; not required for the conversational use case. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- executorch-patches/llm_in_process_jni.patch | 49 ++++++++++++++----- .../com/kazeia/llm/ExecuTorchLlmEngine.kt | 18 ++++--- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/executorch-patches/llm_in_process_jni.patch b/executorch-patches/llm_in_process_jni.patch index a4a64a5..357c7cd 100644 --- a/executorch-patches/llm_in_process_jni.patch +++ b/executorch-patches/llm_in_process_jni.patch @@ -14,27 +14,50 @@ index e93731e..4951e1d 100644 ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11 diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp -index 45f2414..e1c2a8f 100644 +index 45f2414..7c4e1aa 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp -@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { +@@ -171,14 +171,35 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { model_path->toStdString().c_str(), data_files_vector, executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - std::string decoder_model = "llama3"; // use llama3 for now -+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b - runner_ = std::make_unique>( // QNN runner - std::move(module), - decoder_model.c_str(), - model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), +- runner_ = std::make_unique>( // QNN runner +- std::move(module), +- decoder_model.c_str(), +- model_path->toStdString().c_str(), +- tokenizer_path->toStdString().c_str(), - "", - ""); -+ /* performance_output_path */ "", -+ /* dump_logits_path */ "", -+ /* temperature */ 0.7f, -+ /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward) -+ /* shared_buffer */ true); ++ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b ++ ++ // Mirror 
qnn_llama_runner.cpp main(): pick the Runner template based ++ // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models ++ // were introduced after the 8-bit ones, and using the wrong T treats ++ // KV-cache bytes as the wrong width → garbage logits → gibberish output. ++ example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; ++ if (module->method_names()->count("get_kv_io_bit_width") > 0) { ++ kv_bitwidth = static_cast( ++ module->get("get_kv_io_bit_width").get().toScalar().to()); ++ } ++ auto make_runner = [&](auto sample) -> std::unique_ptr { ++ using T = decltype(sample); ++ return std::make_unique>( ++ std::move(module), ++ decoder_model.c_str(), ++ model_path->toStdString().c_str(), ++ tokenizer_path->toStdString().c_str(), ++ /* performance_output_path */ "", ++ /* dump_logits_path */ "", ++ /* temperature */ 0.0f, // greedy ++ /* eval_mode */ 0, // EvalMode::kKVCached ++ /* shared_buffer */ true); ++ }; ++ if (kv_bitwidth == example::KvBitWidth::kWidth16) { ++ runner_ = make_runner(uint16_t{0}); ++ } else { ++ runner_ = make_runner(uint8_t{0}); ++ } model_type_category_ = MODEL_TYPE_CATEGORY_LLM; #endif #if defined(EXECUTORCH_BUILD_MEDIATEK) diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt index 18a207a..d0910cc 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt @@ -122,7 +122,10 @@ class ExecuTorchLlmEngine( val seqLen = minOf(params.maxNewTokens, 512) val rc = try { - mod.generate(fullPrompt, seqLen, cb) + // echo=false so onResult() only receives the generated completion, + // not the prompt tokens echoed back — otherwise the sentence + // streamer would feed '<|im_start|>user …' to the TTS. 
+ mod.generate(fullPrompt, seqLen, cb, /* echo */ false) } catch (e: Throwable) { nlog("generate() threw: ${e.message}") -1 @@ -146,18 +149,19 @@ class ExecuTorchLlmEngine( } /** - * Wrap user input in Qwen3's ChatML template so the instruct model - * actually follows the system directive instead of echoing the prompt. - * Terminating with `<|im_start|>assistant\n` signals the model to begin - * its reply; no trailing tokens. + * Qwen3 chat template matching qnn_llama_runner.cpp's get_formatted_prompt() + * for DecoderModelVersion::kQwen3. Note the user-first-then-system ordering + * (quirky but required — the runner binary produces the same layout and our + * .pte was trained with it). Terminates with `<|im_start|>assistant` with + * no trailing newline, matching the binary exactly. */ private fun buildChatTemplate(userInput: String): String { val sb = StringBuilder() + sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n") if (SYSTEM_PROMPT.isNotEmpty()) { sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n") } - sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n") - sb.append("<|im_start|>assistant\n") + sb.append("<|im_start|>assistant") return sb.toString() }