LLM no-root: validate end-to-end pipeline, fix kv_io_bit_width detection
End-to-end validation on OnePlus Pad 3 with stream_llm intent:
Prompt: 'Bonjour, comment vas-tu ?'
Response: 'Bonjour ! Je suis là pour t'écouter. Comment vas-tu aujourd'hui ?'
TTS: Talker(PTE) 37ms/step, CP(PTE) 73ms/step, audio synthesized.
No su, no Magisk prompts.
Two fixes since the previous commit:
1. ExecuTorchLlmEngine: pass echo=false to LlmModule.generate() — by default
the runner echoes the prompt tokens back via the callback, which fed the
ChatML wrapper (<|im_start|>user …) into the SentenceStreamer and on to TTS.
2. jni_layer_llama.cpp: pick Runner<uint8_t> vs Runner<uint16_t> based on the
model's get_kv_io_bit_width metadata, mirroring qnn_llama_runner.cpp main().
The hard-coded uint16_t was wrong for our Qwen3-4B export (which uses 8-bit
KV I/O) and produced fluent-looking but completely random tokens
("blocked罩ug darkestSOLEQuotes作者本人 …") — same symptom whether greedy or
sampled, the smoking gun for a width-mismatched KV cache reinterpretation.
Other tweaks:
- temperature=0.0 in the QNN_LLAMA branch of jni_layer_llama.cpp (greedy,
matches the working qnn_llama_runner --temperature 0 invocation)
- shared_buffer=true (same as binary defaults)
- Kotlin chat template mirrors qnn_llama_runner.cpp's get_formatted_prompt for
Qwen3 (user-first, then optional system, then "<|im_start|>assistant" with
no trailing newline — that quirky ordering is what the .pte was trained on)
TTFT is ~4 s for a 77-token prompt in kv-only mode (sequential prefill, one
forward per token). To get a sub-second TTFT we'd need to re-export the model
with --model_mode hybrid, which adds a parallel prefill_forward graph; not
required for the conversational use case.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
809a6d4fed
commit
f32b5ddfdd
|
|
@ -14,27 +14,50 @@ index e93731e..4951e1d 100644
|
||||||
${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
|
${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/pybind11
|
${CMAKE_CURRENT_BINARY_DIR}/pybind11
|
||||||
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
|
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
|
||||||
index 45f2414..e1c2a8f 100644
|
index 45f2414..7c4e1aa 100644
|
||||||
--- a/extension/android/jni/jni_layer_llama.cpp
|
--- a/extension/android/jni/jni_layer_llama.cpp
|
||||||
+++ b/extension/android/jni/jni_layer_llama.cpp
|
+++ b/extension/android/jni/jni_layer_llama.cpp
|
||||||
@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
|
@@ -171,14 +171,35 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
|
||||||
model_path->toStdString().c_str(),
|
model_path->toStdString().c_str(),
|
||||||
data_files_vector,
|
data_files_vector,
|
||||||
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
|
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
|
||||||
- std::string decoder_model = "llama3"; // use llama3 for now
|
- std::string decoder_model = "llama3"; // use llama3 for now
|
||||||
+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
|
- runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
|
||||||
runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
|
- std::move(module),
|
||||||
std::move(module),
|
- decoder_model.c_str(),
|
||||||
decoder_model.c_str(),
|
- model_path->toStdString().c_str(),
|
||||||
model_path->toStdString().c_str(),
|
- tokenizer_path->toStdString().c_str(),
|
||||||
tokenizer_path->toStdString().c_str(),
|
|
||||||
- "",
|
- "",
|
||||||
- "");
|
- "");
|
||||||
+ /* performance_output_path */ "",
|
+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
|
||||||
+ /* dump_logits_path */ "",
|
+
|
||||||
+ /* temperature */ 0.7f,
|
+ // Mirror qnn_llama_runner.cpp main(): pick the Runner<T> template based
|
||||||
+ /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward)
|
+ // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models
|
||||||
+ /* shared_buffer */ true);
|
+ // were introduced after the 8-bit ones, and using the wrong T treats
|
||||||
|
+ // KV-cache bytes as the wrong width → garbage logits → gibberish output.
|
||||||
|
+ example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8;
|
||||||
|
+ if (module->method_names()->count("get_kv_io_bit_width") > 0) {
|
||||||
|
+ kv_bitwidth = static_cast<example::KvBitWidth>(
|
||||||
|
+ module->get("get_kv_io_bit_width").get().toScalar().to<int64_t>());
|
||||||
|
+ }
|
||||||
|
+ auto make_runner = [&](auto sample) -> std::unique_ptr<llm::IRunner> {
|
||||||
|
+ using T = decltype(sample);
|
||||||
|
+ return std::make_unique<example::Runner<T>>(
|
||||||
|
+ std::move(module),
|
||||||
|
+ decoder_model.c_str(),
|
||||||
|
+ model_path->toStdString().c_str(),
|
||||||
|
+ tokenizer_path->toStdString().c_str(),
|
||||||
|
+ /* performance_output_path */ "",
|
||||||
|
+ /* dump_logits_path */ "",
|
||||||
|
+ /* temperature */ 0.0f, // greedy
|
||||||
|
+ /* eval_mode */ 0, // EvalMode::kKVCached
|
||||||
|
+ /* shared_buffer */ true);
|
||||||
|
+ };
|
||||||
|
+ if (kv_bitwidth == example::KvBitWidth::kWidth16) {
|
||||||
|
+ runner_ = make_runner(uint16_t{0});
|
||||||
|
+ } else {
|
||||||
|
+ runner_ = make_runner(uint8_t{0});
|
||||||
|
+ }
|
||||||
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
|
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
|
||||||
#endif
|
#endif
|
||||||
#if defined(EXECUTORCH_BUILD_MEDIATEK)
|
#if defined(EXECUTORCH_BUILD_MEDIATEK)
|
||||||
|
|
|
||||||
|
|
@ -122,7 +122,10 @@ class ExecuTorchLlmEngine(
|
||||||
|
|
||||||
val seqLen = minOf(params.maxNewTokens, 512)
|
val seqLen = minOf(params.maxNewTokens, 512)
|
||||||
val rc = try {
|
val rc = try {
|
||||||
mod.generate(fullPrompt, seqLen, cb)
|
// echo=false so onResult() only receives the generated completion,
|
||||||
|
// not the prompt tokens echoed back — otherwise the sentence
|
||||||
|
// streamer would feed '<|im_start|>user …' to the TTS.
|
||||||
|
mod.generate(fullPrompt, seqLen, cb, /* echo */ false)
|
||||||
} catch (e: Throwable) {
|
} catch (e: Throwable) {
|
||||||
nlog("generate() threw: ${e.message}")
|
nlog("generate() threw: ${e.message}")
|
||||||
-1
|
-1
|
||||||
|
|
@ -146,18 +149,19 @@ class ExecuTorchLlmEngine(
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrap user input in Qwen3's ChatML template so the instruct model
|
* Qwen3 chat template matching qnn_llama_runner.cpp's get_formatted_prompt()
|
||||||
* actually follows the system directive instead of echoing the prompt.
|
* for DecoderModelVersion::kQwen3. Note the user-first-then-system ordering
|
||||||
* Terminating with `<|im_start|>assistant\n` signals the model to begin
|
* (quirky but required — the runner binary produces the same layout and our
|
||||||
* its reply; no trailing tokens.
|
* .pte was trained with it). Terminates with `<|im_start|>assistant` with
|
||||||
|
* no trailing newline, matching the binary exactly.
|
||||||
*/
|
*/
|
||||||
private fun buildChatTemplate(userInput: String): String {
|
private fun buildChatTemplate(userInput: String): String {
|
||||||
val sb = StringBuilder()
|
val sb = StringBuilder()
|
||||||
|
sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
|
||||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||||
sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
|
sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
|
||||||
}
|
}
|
||||||
sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
|
sb.append("<|im_start|>assistant")
|
||||||
sb.append("<|im_start|>assistant\n")
|
|
||||||
return sb.toString()
|
return sb.toString()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue