From a688edc9ece2a3c3bfdec17853a2b84fe664f5aa Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Thu, 9 Apr 2026 12:47:30 +0200
Subject: [PATCH] =?UTF-8?q?Reduce=20talker=20KV=5FLEN=20100=E2=86=9264:=20?=
 =?UTF-8?q?saves=20148ms=20(RTF=201.31)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KV window of 64 sufficient for ~70 token generation (10 prefill + 58 gen).
36% less KV memcpy per talker step (28L × 2 × 64×8×128 vs 100×8×128).

Generation: 3795ms → 3647ms, total: 6438ms → 6093ms

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 executorch-custom/jni_layer_tts.cpp                           | 2 +-
 .../app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt        | 2 +-
 scripts/export_talker_pte.py                                  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/executorch-custom/jni_layer_tts.cpp b/executorch-custom/jni_layer_tts.cpp
index 7783d0c..5105ff2 100644
--- a/executorch-custom/jni_layer_tts.cpp
+++ b/executorch-custom/jni_layer_tts.cpp
@@ -692,7 +692,7 @@ ExecuTorchJni::runTtsPipelineImpl(
     jint maxTokens)
 {
     static const int DIM=1024,VOCAB=3072,CB_SIZE=2048,NUM_CB=16;
-    static const int T_L=28,T_KV=8,T_HD=128,T_KV_LEN=100;
+    static const int T_L=28,T_KV=8,T_HD=128,T_KV_LEN=64;
     static const int C_L=5,C_KV=8,C_HD=128,C_KV_LEN=16;
     static const int CODEC_EOS=2150;
 
diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
index 98263f9..5f3b96d 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@@ -68,7 +68,7 @@ class Qwen3TtsEngine(
         private const val CP_KV_LEN = 16  // max 16 past positions (17 total with current)
 
         // Talker .pte constants
-        private const val TALKER_PTE_KV_LEN = 100  // .pte talker KV window size
+        private const val TALKER_PTE_KV_LEN = 64  // .pte talker KV window size (reduced from 100)
 
         // Codec special token IDs (in talker's 3072 vocab space)
         private const val CODEC_EOS = 2150
diff --git a/scripts/export_talker_pte.py b/scripts/export_talker_pte.py
index b3e85bc..29635fe 100644
--- a/scripts/export_talker_pte.py
+++ b/scripts/export_talker_pte.py
@@ -11,7 +11,7 @@ warnings.filterwarnings('ignore')
 
 N_L = 28; N_H = 16; N_KV = 8; HD = 128; DIM = 1024; N_REP = 2
 VOCAB = 3072; FFN = 3072
-KV_LEN = 16  # Small KV for testing HTP viability
+KV_LEN = 64  # Reduced from 100: saves 36% memcpy, sufficient for ~70 token generation
 
 state = torch.load("/opt/Kazeia/models_qnn/qwen3-tts-export/qwen3_tts_talker.pth",
                     map_location="cpu", weights_only=False)
@@ -117,7 +117,7 @@ edge = to_edge_transform_and_lower_to_qnn(w, (e, m, c0, s0, *kvs), compiler_spec
 print("LOWERED!")
 
 pte = edge.to_executorch()
-OUT = "/opt/Kazeia/models_qnn/talker_transformer_fp16_kv16.pte"
+OUT = "/opt/Kazeia/models_qnn/talker_transformer_fp16.pte"
 with open(OUT, "wb") as f:
     pte.write_to_file(f)
 print(f"SAVED: {OUT} ({os.path.getsize(OUT)/1024/1024:.0f} MB)")