From 19f934af2506bdddfdc809dbc75067113f8522be Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Mon, 13 Apr 2026 22:56:42 +0200 Subject: [PATCH] LLM NPU: Qwen3-4B QNN export patches + deployment notes Adds executorch-patches/ with the local modifications to /opt/Kazeia/executorch (upstream pytorch/executorch v1.2.0) required to export Qwen3-4B to QNN for the OnePlus Pad 3 Hexagon V79. Tablet runs 18.2 tok/s (gen), TTFT 0.9 s, RSS 1.76 GB. Co-Authored-By: Claude Opus 4.6 (1M context) --- executorch-patches/README.md | 55 +++++++++++++++ executorch-patches/qwen3_4b_decoder.patch | 69 +++++++++++++++++++ .../torchtune_quantization.patch | 28 ++++++++ 3 files changed, 152 insertions(+) create mode 100644 executorch-patches/README.md create mode 100644 executorch-patches/qwen3_4b_decoder.patch create mode 100644 executorch-patches/torchtune_quantization.patch diff --git a/executorch-patches/README.md b/executorch-patches/README.md new file mode 100644 index 0000000..9fab32c --- /dev/null +++ b/executorch-patches/README.md @@ -0,0 +1,55 @@ +# Executorch patches for Kazeia + +Local modifications to /opt/Kazeia/executorch (upstream pytorch/executorch @ v1.2.0) +required to export Qwen3-4B to QNN for OnePlus Pad 3 (Snapdragon 8 Elite, Hexagon V79). + +Not upstreamable as-is (phi_4_mini torchtune guard is a local dependency workaround; +Qwen3_4B class matches upstream style but hasn't been submitted). + +## qwen3_4b_decoder.patch + +Applied to: `/opt/Kazeia/executorch/` + +``` +cd /opt/Kazeia/executorch && git apply ../executorch-patches/qwen3_4b_decoder.patch +``` + +Adds: +- `examples/qualcomm/oss_scripts/llama/__init__.py`: + - `try/except` around `convert_phi_4_mini_weights` import (phi_4_mini pulls torchtune + which conflicts with our torchao 0.17 pin). 
+ - New `Qwen3_4B` class registered as `qwen3-4b`, `num_sharding=2` (4B at num_sharding=1 + OOMed during QNN compile even with 48 GB free RAM; sharding=2 is the minimum that + lets the compile partitioner split the HTP context). +- `examples/qualcomm/oss_scripts/llama/decoder_constants.py`: + - Adds `"qwen3-4b": "qwen3"` to `DECODER_MODEL_VERSION`. + +## torchtune_quantization.patch + +Applied to: `/opt/Kazeia/et_venv/lib64/python3.10/site-packages/torchtune/training/quantization.py` + +torchao 0.17+ removed `int4_weight_only` and `int8_dynamic_activation_int4_weight`. +torchtune 0.6.1 still imports them. Since our Qwen3 QNN export path doesn't use either, +wrap the import in try/except and set them to None on ImportError. + +## Host env reminders (not in patches) + +- symlink `libc++.so.1` and `libc++abi.so.1` in `backends/qualcomm/sdk/libcxx-14.0.0/` +- copy `build-x86/backends/qualcomm/PyQnn*.so` to `backends/qualcomm/python/` +- `QNN_SDK_ROOT=/opt/Kazeia/executorch/backends/qualcomm/sdk/qnn` +- `LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang:.../sdk/libcxx-14.0.0` +- `PATH+=build-x86/third-party/flatc_ep/bin` +- `PYTHONPATH=/opt/Kazeia` + +## RAM/swap for 4B export + +Peak RAM during prepare_pt2e + QNN compile: **46 GB anon-rss**. +On a 62 GB + 8 GB zram box this OOMs. Fix: add a swapfile: + +``` +sudo dd if=/dev/zero of=/swapfile bs=1M count=49152 +sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile +``` + +Compile then uses ~59 GB RAM + 24 GB swap, completes in ~30 min wall. +Put `--artifact` on `/home` not `/tmp` (the 25 GB `decode_qdq.pt2` overflows tmpfs). 
diff --git a/executorch-patches/qwen3_4b_decoder.patch b/executorch-patches/qwen3_4b_decoder.patch new file mode 100644 index 0000000..91b6c34 --- /dev/null +++ b/executorch-patches/qwen3_4b_decoder.patch @@ -0,0 +1,69 @@ +diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py +index 963db6e..953dc4c 100644 +--- a/examples/qualcomm/oss_scripts/llama/__init__.py ++++ b/examples/qualcomm/oss_scripts/llama/__init__.py +@@ -25,9 +25,14 @@ from executorch.examples.models.granite import ( + from executorch.examples.models.internvl3 import ( + convert_weights as convert_internvl3_weights, + ) +-from executorch.examples.models.phi_4_mini import ( +- convert_weights as convert_phi_4_mini_weights, +-) ++try: ++ from executorch.examples.models.phi_4_mini import ( ++ convert_weights as convert_phi_4_mini_weights, ++ ) ++except ImportError: ++ # phi_4_mini pulls in torchtune which conflicts with our torchao pin. ++ # We don't need phi for Qwen3 export, so tolerate the missing dep. ++ convert_phi_4_mini_weights = None + from executorch.examples.models.qwen2_5 import ( + convert_weights as convert_qwen2_5_weights, + ) +@@ -479,6 +484,34 @@ class Qwen3_1_7B(LLMModelConfig): + quant_recipe = Qwen3_1_7BQuantRecipe + + ++@register_llm_model("qwen3-4b") ++@dataclass(init=False, frozen=True) ++class Qwen3_4B(LLMModelConfig): ++ # Local Kazeia addition. Mirrors the Qwen3_1_7B registration; the 4B ++ # variant uses the same convert_weights and 16a4w quant recipe but a ++ # bigger params file. With 4B params at 16a4w the .pte stays under the ++ # 4 GB HTP single-context limit on V79 (empirically ~2.5 GB), so one ++ # shard would fit on-device; num_sharding=2 below is for host RAM only. ++ # Host compile time is the other cost (3-4 h on a 16-core x86_64 machine). 
++ repo_id: str = "Qwen/Qwen3-4B" ++ params_path: str = os.path.join( ++ BASE_DIR, "../../../models/qwen3/config/4b_config.json" ++ ) ++ convert_weights = convert_qwen3_weights ++ transform_weight = False ++ instruct_model = True ++ # Bumped to 2 to halve peak host RAM during QNN compile (4B at sharding=1 ++ # OOMed on a 62 GB box, peak anon-rss 46 GB). At sharding=2 each shard ++ # compile fits comfortably; runner stitches them at load time. ++ num_sharding = 2 ++ masked_softmax = True ++ seq_mse_candidates = 0 ++ r1 = False ++ r2 = False ++ r3 = True ++ quant_recipe = Qwen3_1_7BQuantRecipe ++ ++ + @register_llm_model("smollm2_135m") + @dataclass(init=False, frozen=True) + class Smollm2_135M(LLMModelConfig): +diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py +index 74e3959..995c498 100644 +--- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py ++++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py +@@ -55,6 +55,7 @@ DECODER_MODEL_VERSION = { + "qwen2_5-1_5b": "qwen2_5", + "qwen3-0_6b": "qwen3", + "qwen3-1_7b": "qwen3", ++ "qwen3-4b": "qwen3", + "smollm2_135m": "smollm2_135m", + "smollm3-3b": "smollm3", + "glm-1_5b": "glm", diff --git a/executorch-patches/torchtune_quantization.patch b/executorch-patches/torchtune_quantization.patch new file mode 100644 index 0000000..b7a3338 --- /dev/null +++ b/executorch-patches/torchtune_quantization.patch @@ -0,0 +1,28 @@ +--- a/torchtune/training/quantization.py ++++ b/torchtune/training/quantization.py +@@ -17,11 +17,20 @@ try: + except ImportError: + # torchao 0.6 and before + from torchao.dtypes import TensorCoreTiledLayoutType as TensorCoreTiledLayout + +-from torchao.quantization import ( +- int4_weight_only, +- int8_dynamic_activation_int4_weight, +- quantize_, +-) ++try: ++ from torchao.quantization import ( ++ int4_weight_only, ++ int8_dynamic_activation_int4_weight, ++ quantize_, ++ ) ++except ImportError: ++ # torchao 
0.17+ removed these. Not needed for Qwen3 QNN export path. ++ int4_weight_only = None ++ int8_dynamic_activation_int4_weight = None ++ try: ++ from torchao.quantization import quantize_ ++ except ImportError: ++ quantize_ = None + + try: