
Commit 02d9a19

kv-cache : simplify SWA logic
ggml-ci
1 parent 65eee87 commit 02d9a19

5 files changed, +76 -46 lines changed

src/llama-graph.cpp (+3, -3)

@@ -362,17 +362,17 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
-        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn, false);
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
     }
 }
 
 void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
-        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn, false);
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
     }
 
     if (self_kq_mask_swa) {
-        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn, true);
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
     }
 }

src/llama-hparams.h (+4, -3)

@@ -15,8 +15,9 @@ enum llama_expert_gating_func_type {
 };
 
 enum llama_swa_type {
-    LLAMA_SWA_TYPE_STANDARD = 0,
-    LLAMA_SWA_TYPE_CHUNKED  = 1,
+    LLAMA_SWA_TYPE_NONE     = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED  = 2,
 };
 
 struct llama_hparams_posnet {
@@ -100,7 +101,7 @@ struct llama_hparams {
     std::array<int, 4> rope_sections;
 
     // Sliding Window Attention (SWA)
-    llama_swa_type swa_type = LLAMA_SWA_TYPE_STANDARD;
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     uint32_t n_swa = 0;         // the size of the sliding window (0 - no SWA)
     uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
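
For orientation, a minimal sketch of how a model might populate these fields. The field names and enum values come from the diff above; the helper function and the concrete numbers are hypothetical, and the include assumes compilation next to src/llama-hparams.h:

```cpp
#include "llama-hparams.h"  // assumption: built inside src/ so this header resolves

// Hypothetical configuration for a model that uses standard sliding-window attention.
static void set_swa_hparams_example(llama_hparams & hparams) {
    hparams.swa_type      = LLAMA_SWA_TYPE_STANDARD; // rolling window over recent positions
    hparams.n_swa         = 4096;                    // hypothetical window size
    hparams.n_swa_pattern = 2;                       // hypothetical; the default 1 keeps all layers non-SWA

    // Models that never touch swa_type now get LLAMA_SWA_TYPE_NONE (the new default),
    // i.e. no sliding-window masking at all.
}
```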

src/llama-kv-cache.cpp (+55, -37)

@@ -30,7 +30,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         bool v_trans,
         bool offload,
         uint32_t kv_size,
-        uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
+        uint32_t padding,
+        uint32_t n_swa,
+        llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
     GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
 
     this->type_k = type_k;
@@ -640,7 +642,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
-void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn, bool swa) const {
+void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const int64_t n_tokens     = ubatch->n_tokens;
     const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs       = ubatch->n_seqs;
@@ -667,41 +669,28 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
             const llama_seq_id seq_id = ubatch->seq_id[s][0];
 
             for (int j = 0; j < n_seq_tokens; ++j) {
-                const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
+                const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
 
                 for (int i = 0; i < n_kv; ++i) {
-                    float f;
-                    // mask the token if:
-                    if (!cells[i].has_seq_id(seq_id) // not the correct sequence
-                     || (causal_attn && cells[i].pos > pos) // for causal, mask future tokens
-                    ) {
-                        f = -INFINITY;
-                    } else {
-                        if (hparams.use_alibi) {
-                            f = -std::abs(cells[i].pos - pos);
-                        } else {
-                            f = 0.0f;
-                        }
-                    }
+                    const llama_pos p0 = cells[i].pos;
+
+                    bool masked = false;
+
+                    // mask the token if not the same sequence
+                    masked = masked || (!cells[i].has_seq_id(seq_id));
+
+                    // mask future tokens
+                    masked = masked || (causal_attn && p0 > p1);
 
-                    if (swa) {
-                        // may need to cut off old tokens for sliding window
-                        switch (hparams.swa_type) {
-                            case LLAMA_SWA_TYPE_STANDARD:
-                                {
-                                    if (pos - cells[i].pos >= (int32_t) hparams.n_swa) {
-                                        f = -INFINITY;
-                                    }
-                                } break;
-                            case LLAMA_SWA_TYPE_CHUNKED:
-                                {
-                                    const llama_pos pos_chunk_start = (pos / hparams.n_swa) * hparams.n_swa;
-
-                                    if (cells[i].pos < pos_chunk_start) {
-                                        f = -INFINITY;
-                                    }
-                                } break;
-                        }
+                    // apply SWA if any
+                    masked = masked || (is_masked_swa(p0, p1));
+
+                    float f = 0.0f;
+
+                    if (masked) {
+                        f = -INFINITY;
+                    } else if (hparams.use_alibi) {
+                        f = -std::abs(p0 - p1);
                     }
 
                     data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
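
The rewritten loop above replaces the old nested if/switch with a flat sequence of masking predicates followed by a single mask-to-value step. A standalone sketch of that decision; the free-function form and parameter names are illustrative, not the actual llama.cpp API:

```cpp
#include <cmath>
#include <limits>

// Mirrors the simplified body of set_input_kq_mask() above.
// p0 is the position of the cached token, p1 the position of the query token.
static float kq_mask_value(bool same_seq, bool causal_attn, bool use_alibi,
                           long p0, long p1, bool masked_swa) {
    bool masked = false;

    masked = masked || !same_seq;                // cell belongs to another sequence
    masked = masked || (causal_attn && p0 > p1); // future token under causal attention
    masked = masked || masked_swa;               // cut off by the sliding window (is_masked_swa)

    if (masked) {
        return -std::numeric_limits<float>::infinity();
    }

    // ALiBi replaces the plain 0.0f with a distance-based bias.
    return use_alibi ? -std::fabs(float(p0 - p1)) : 0.0f;
}
```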
@@ -1191,6 +1180,30 @@ uint32_t llama_kv_cache_unified::cell_max() const {
     return 0;
 }
 
+bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false;
+}
+
 void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
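
To make the two window types concrete, a small self-contained sketch of the same comparisons that is_masked_swa() performs above; the standalone enum, function name, and example values are illustrative only:

```cpp
#include <cassert>
#include <cstdint>

enum swa_type_t { SWA_NONE, SWA_STANDARD, SWA_CHUNKED };

// Same comparisons as llama_kv_cache_unified::is_masked_swa(p0, p1) above.
static bool is_masked_swa_sketch(swa_type_t type, uint32_t n_swa, int64_t p0, int64_t p1) {
    switch (type) {
        case SWA_NONE:     return false;                               // no extra masking
        case SWA_STANDARD: return p1 - p0 >= (int64_t) n_swa;          // token fell out of the window
        case SWA_CHUNKED:  return p0 < (p1 / (int64_t) n_swa) * n_swa; // token is in an earlier chunk
    }
    return false;
}

int main() {
    // Standard window of 4: a query at p1 = 10 still sees p0 = 7..10.
    assert(!is_masked_swa_sketch(SWA_STANDARD, 4, 7, 10));
    assert( is_masked_swa_sketch(SWA_STANDARD, 4, 6, 10));

    // Chunked window of 4: a query at p1 = 10 only sees tokens from its own chunk (positions 8..11).
    assert(!is_masked_swa_sketch(SWA_CHUNKED, 4, 8, 10));
    assert( is_masked_swa_sketch(SWA_CHUNKED, 4, 7, 10));

    return 0;
}
```

In the commit, n_swa and swa_type live on the cache itself, so the SWA half of llama_kv_cache_unified_iswa applies them while the base cache, constructed with LLAMA_SWA_TYPE_NONE below, never masks by window.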
@@ -1586,11 +1599,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, kv_size_base);
 
-    kv_base = std::make_unique<llama_kv_cache_unified>(model, std::move(filter_base), type_k, type_v, v_trans, offload, kv_size_base, padding);
+    kv_base = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(filter_base), type_k, type_v,
+            v_trans, offload, kv_size_base, padding,
+            0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, kv_size_swa);
 
-    kv_swa = std::make_unique<llama_kv_cache_unified>(model, std::move(filter_swa), type_k, type_v, v_trans, offload, kv_size_swa, padding);
+    kv_swa = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(filter_swa), type_k, type_v,
+            v_trans, offload, kv_size_swa, padding,
+            hparams.n_swa, hparams.swa_type);
 }
 
 void llama_kv_cache_unified_iswa::clear() {
@@ -2801,5 +2820,4 @@ void llama_kv_cache_view_free(llama_kv_cache_view * view) {
 void llama_kv_cache_view_update(llama_kv_cache_view * , const llama_kv_cache * ) {
     // TODO: will be removed soon, keep this for now to avoid too many changes in
     // https://github.com/ggml-org/llama.cpp/pull/13194
-    GGML_ABORT("not implemented");
 }

src/llama-kv-cache.h (+11, -2)

@@ -102,7 +102,9 @@ class llama_kv_cache_unified : public llama_kv_cache {
             bool v_trans,
             bool offload,
             uint32_t kv_size,
-            uint32_t padding);
+            uint32_t padding,
+            uint32_t n_swa,
+            llama_swa_type swa_type);
 
     ~llama_kv_cache_unified() = default;
 
@@ -169,7 +171,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
 
-    void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn, bool swa) const;
+    void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
     void set_input_k_shift (ggml_tensor * dst) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
@@ -223,6 +225,11 @@ class llama_kv_cache_unified : public llama_kv_cache {
     ggml_type type_k = GGML_TYPE_F16;
     ggml_type type_v = GGML_TYPE_F16;
 
+    // SWA
+    uint32_t n_swa = 0;
+
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
@@ -264,6 +271,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     size_t size_k_bytes() const;
     size_t size_v_bytes() const;
 
+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
     ggml_tensor * build_rope_shift(
             const llama_cparams & cparams,
             ggml_context * ctx,

src/llama-model.cpp (+3, -1)

@@ -13228,7 +13228,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 !cparams.flash_attn,
                 cparams.offload_kqv,
                 cparams.n_ctx,
-                padding);
+                padding,
+                hparams.n_swa,
+                hparams.swa_type);
         }
     }
 }
