From 466c6cddba76c95780c2d098ae9c691203b6c416 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 11 Apr 2025 17:46:04 +0200 Subject: [PATCH 01/59] server : (experimental) vision support via libmtmd --- common/arg.cpp | 8 +- examples/server/CMakeLists.txt | 3 +- examples/server/server.cpp | 248 ++++++++++++++++++++--------- examples/server/utils.hpp | 280 ++++++++++++++++++++++++++++++++- 4 files changed, 458 insertions(+), 81 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 642fefb57548f..17955872e61ef 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -834,9 +834,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // allow --mmproj to be set from -hf // assuming that mmproj is always in the same repo as text model - if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { + if (!params.model.hf_repo.empty() && ( + ctx_arg.ex == LLAMA_EXAMPLE_LLAVA || ctx_arg.ex == LLAMA_EXAMPLE_SERVER)) { params.mmproj.hf_repo = params.model.hf_repo; } + // TODO @ngxson : this will break non-vision model with -hf, need to fix before merging common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { @@ -2101,14 +2103,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.mmproj.path = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--mmproj-url"}, "URL", "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching", diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index aee90388e4fb3..17109fddbd307 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -34,8 +34,9 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) +target_include_directories(${TARGET} PRIVATE ../llava) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1bf1ee876b40f..17b0ccfa108e1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -7,6 +7,7 @@ #include "log.h" #include "sampling.h" #include "speculative.h" +#include "mtmd.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -196,8 +197,8 @@ struct server_task { int id_target = -1; // used by SERVER_TASK_TYPE_INFERENCE - slot_params params; - llama_tokens prompt_tokens; + slot_params params; + server_inputs prompt_tokens; int id_selected_slot = -1; // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE @@ -1246,6 +1247,9 @@ struct server_slot { llama_context * ctx = nullptr; llama_context * ctx_dft = nullptr; + // multimodal + mtmd_context * mctx = nullptr; + common_speculative * spec = nullptr; std::vector lora; @@ -1273,14 +1277,14 @@ struct server_slot { int32_t n_prompt_tokens_processed = 0; // input prompt tokens - llama_tokens prompt_tokens; + server_inputs prompt_tokens; size_t last_nl_pos = 0; std::string generated_text; llama_tokens generated_tokens; - llama_tokens cache_tokens; + server_inputs cache_tokens; std::vector generated_token_probs; @@ -1474,7 +1478,7 @@ struct server_slot { {"is_processing", is_processing()}, {"non_causal", is_non_causal()}, {"params", params.to_json()}, - {"prompt", common_detokenize(ctx, prompt_tokens)}, + {"prompt", ""}, // TODO @ngxson, hacky patch, to fix before merge {"next_token", { {"has_next_token", has_next_token}, @@ -1552,11 +1556,11 @@ struct server_queue { std::condition_variable condition_tasks; // callback functions - std::function callback_new_task; - std::function callback_update_slots; + std::function callback_new_task; + std::function callback_update_slots; // Add a new task to the end of the queue - int post(server_task task, bool front = false) { + int post(server_task & task, bool front = false) { std::unique_lock lock(mutex_tasks); GGML_ASSERT(task.id != -1); // if this is cancel task make sure to clean up pending tasks @@ -1596,7 +1600,7 @@ struct server_queue { } // Add a new task, but defer until one slot is available - void defer(server_task task) { + void defer(server_task & task) { std::unique_lock lock(mutex_tasks); QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); @@ -1611,7 +1615,7 @@ struct server_queue { } // Register function to process a new task - void on_new_task(std::function callback) { + void on_new_task(std::function callback) { callback_new_task = std::move(callback); } @@ -1660,12 +1664,12 @@ struct server_queue { lock.unlock(); break; } - server_task task = queue_tasks.front(); + server_task task = std::move(queue_tasks.front()); queue_tasks.pop_front(); lock.unlock(); QUE_DBG("processing task, id = %d\n", task.id); - 
callback_new_task(std::move(task)); + callback_new_task(task); } // all tasks in the current loop is processed, slots data is now ready @@ -1846,6 +1850,9 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + // multimodal + mtmd_context * mctx = nullptr; + const llama_vocab * vocab = nullptr; llama_model * model_dft = nullptr; @@ -1875,6 +1882,8 @@ struct server_context { common_chat_templates_ptr chat_templates; ~server_context() { + mtmd_free(mctx); + // Clear any sampling context for (server_slot & slot : slots) { common_sampler_free(slot.smpl); @@ -1962,6 +1971,18 @@ struct server_context { chat_templates = common_chat_templates_init(model, "chatml"); } + std::string & mmproj_path = params_base.mmproj.path; + if (!mmproj_path.empty()) { + mtmd_context_params mparams; + mparams.n_threads = params_base.cpuparams.n_threads; + mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); + if (mctx == nullptr) { + SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); + return false; + } + SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); + } + return true; } @@ -1977,6 +1998,7 @@ struct server_context { slot.ctx = ctx; slot.n_ctx = n_ctx_slot; slot.n_predict = params_base.n_predict; + slot.mctx = mctx; if (model_dft) { slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); @@ -2004,7 +2026,7 @@ struct server_context { slot.reset(); - slots.push_back(slot); + slots.push_back(std::move(slot)); } default_generation_settings_for_props = slots[0].to_json(); @@ -2051,10 +2073,10 @@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length - float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); + float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.n_tokens()); // select the current slot if the criteria match if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) { @@ -2093,19 +2115,14 @@ struct server_context { return ret; } - bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { + bool can_be_detokenized(const struct llama_context * ctx, const server_inputs & inp) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); const int32_t n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & token : tokens) { - if (token < 0 || token >= n_vocab) { - return false; - } - } - return true; + return inp.validate(n_vocab); } - bool launch_slot_with_task(server_slot & slot, const server_task & task) { + bool launch_slot_with_task(server_slot & slot, server_task & task) { slot.reset(); slot.id_task = task.id; slot.index = task.index; @@ -2421,7 +2438,7 @@ struct server_context { res->content = std::move(slot.generated_text); res->tokens = std::move(slot.generated_tokens); res->timings = slot.get_timings(); - res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); + //res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); // TODO @ngxson : hacky, need to fix res->response_fields = std::move(slot.params.response_fields); res->truncated = slot.truncated; @@ -2547,7 +2564,7 @@ struct server_context { server_task 
task(SERVER_TASK_TYPE_CANCEL); task.id_target = id_task; queue_results.remove_waiting_task_id(id_task); - cancel_tasks.push_back(task); + cancel_tasks.push_back(std::move(task)); } // push to beginning of the queue, so it has highest priority queue_tasks.post(cancel_tasks, true); @@ -2637,7 +2654,7 @@ struct server_context { // Functions to process the task // - void process_single_task(server_task task) { + void process_single_task(server_task & task) { switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: case SERVER_TASK_TYPE_INFILL: @@ -2729,7 +2746,7 @@ struct server_context { } queue_results.send(std::move(res)); } break; - case SERVER_TASK_TYPE_SLOT_SAVE: + /*case SERVER_TASK_TYPE_SLOT_SAVE: { int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); @@ -2833,7 +2850,11 @@ struct server_context { res->id_slot = id_slot; res->n_erased = n_erased; queue_results.send(std::move(res)); - } break; + } break;*/ + case SERVER_TASK_TYPE_SLOT_SAVE: + case SERVER_TASK_TYPE_SLOT_RESTORE: + case SERVER_TASK_TYPE_SLOT_ERASE: + GGML_ASSERT(false && "TODO @ngxson : removed due to not compat with multimodal"); case SERVER_TASK_TYPE_SET_LORA: { params_base.lora_adapters = std::move(task.set_lora); @@ -2841,6 +2862,7 @@ struct server_context { res->id = task.id; queue_results.send(std::move(res)); } break; + } } @@ -2876,7 +2898,8 @@ struct server_context { // apply context-shift if needed // TODO: simplify and improve - for (server_slot & slot : slots) { + // TODO @ngxson : hacky, need to disable context shift for multimodal + /*for (server_slot & slot : slots) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { if (!params_base.ctx_shift) { // this check is redundant (for good) @@ -2908,7 +2931,7 @@ struct server_context { slot.truncated = true; } - } + }*/ // start populating the batch for this iteration common_batch_clear(batch); @@ -2940,17 +2963,21 @@ struct server_context { slot.n_past += 1; if (slot.params.cache_prompt) { - slot.cache_tokens.push_back(slot.sampled); + slot.cache_tokens.add_text_token(slot.sampled); } SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); + slot.n_ctx, slot.n_past, (int) slot.cache_tokens.n_tokens(), slot.truncated); } // process in chunks of params.n_batch int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); + // for multimodal + bool is_decoding_embd = false; + server_embd_batch batch_embd; + // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { @@ -2973,23 +3000,23 @@ struct server_context { slot.t_start_generation = 0; slot.n_past = 0; - slot.n_prompt_tokens = prompt_tokens.size(); + slot.n_prompt_tokens = prompt_tokens.n_tokens(); slot.state = SLOT_STATE_PROCESSING_PROMPT; SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // print prompt tokens (for debugging) - if (1) { - // first 16 tokens (avoid flooding logs) - for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - } - } else { - // all - for (int i = 0; i < (int) prompt_tokens.size(); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); 
- } - } + // if (1) { + // // first 16 tokens (avoid flooding logs) + // for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { + // SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + // } + // } else { + // // all + // for (int i = 0; i < (int) prompt_tokens.size(); i++) { + // SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + // } + // } // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { @@ -3030,7 +3057,8 @@ struct server_context { slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); // if input prompt is too big, truncate it - if (slot.n_prompt_tokens >= slot.n_ctx) { + // TODO @ngxson : this won't work with multimodal + /*if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; @@ -3053,14 +3081,15 @@ struct server_context { SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); - } + }*/ if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); + slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params_base.n_cache_reuse > 0) { + // TODO @ngxson : this won't work with multimodal + /*if (params_base.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache size_t head_p = slot.n_past; // current prompt @@ -3101,7 +3130,7 @@ struct server_context { } SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); - } + }*/ } } @@ -3135,17 +3164,26 @@ struct server_context { SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); // remove the non-common part from the cache - slot.cache_tokens.resize(slot.n_past); + slot.cache_tokens.keep_until(slot.n_past); // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); - - if (slot.params.cache_prompt) { - slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); + auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); + if (curr_chunk.tok_image) { + // decode image + server_encode_image(slot.mctx, batch_embd, curr_chunk, slot.n_past, slot.id); + is_decoding_embd = true; + SLT_INF(slot, "decoding image, n_past = %d, n_tokens = %d\n", slot.n_past, batch_embd.batch.n_tokens); + slot.n_past += batch_embd.batch.n_tokens; + break; // do not process any other slots + } else { + common_batch_add(batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); + if (slot.params.cache_prompt) { + slot.cache_tokens.add_text_token(curr_chunk.tok_text); + } } slot.n_prompt_tokens_processed++; @@ -3163,8 +3201,11 @@ struct server_context { common_sampler_reset(slot.smpl); // Process all prompt tokens through sampler system - for (int i = 0; i < slot.n_prompt_tokens; ++i) { - common_sampler_accept(slot.smpl, 
prompt_tokens[i], false); + for (size_t i = 0; i < slot.cache_tokens.n_tokens(); ++i) { + auto & curr_chunk = slot.cache_tokens.get_chunk(i); + if (curr_chunk.tok_text != LLAMA_TOKEN_NULL) { + common_sampler_accept(slot.smpl, curr_chunk.tok_text, false); + } } // extract the logits only for the last token @@ -3201,7 +3242,7 @@ struct server_context { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - llama_batch batch_view = { + llama_batch batch_view = is_decoding_embd ? batch_embd.batch : llama_batch{ n_tokens, batch.token + i, nullptr, @@ -3211,9 +3252,18 @@ struct server_context { batch.logits + i, }; + // TODO @ngxson : maybe move this to llama_batch_ext + if (is_decoding_embd && mtmd_decode_use_non_causal(mctx)) { + llama_set_causal_attn(ctx, false); + } + const int ret = llama_decode(ctx, batch_view); metrics.on_decoded(slots); + if (is_decoding_embd && mtmd_decode_use_non_causal(mctx)) { + llama_set_causal_attn(ctx, true); + } + if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size @@ -3301,7 +3351,8 @@ struct server_context { } // do speculative decoding - for (auto & slot : slots) { + // TODO @ngxson : remove speculative decoding for multimodal + /*for (auto & slot : slots) { if (!slot.is_processing() || !slot.can_speculate()) { continue; } @@ -3394,7 +3445,7 @@ struct server_context { } SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); - } + }*/ } SRV_DBG("%s", "run slots completed\n"); @@ -3912,6 +3963,7 @@ int main(int argc, char ** argv) { const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( server_task_type type, json & data, + std::vector & files, std::function is_connection_closed, httplib::Response & res, oaicompat_type oaicompat) { @@ -3930,15 +3982,55 @@ int main(int argc, char ** argv) { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); - tasks.reserve(tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { + // process files + std::vector bitmaps; + { + for (auto & file : files) { + mtmd_bitmap bmp; + int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp); + if (res != 0) { + throw std::runtime_error("Failed to load image"); + } + bitmaps.push_back(std::move(bmp)); + } + } + + std::vector inputs; + if (oaicompat) { + if (!prompt.is_string()) { + throw std::runtime_error("prompt must be a string"); + } else { + printf("prompt: %s\n", prompt.get().c_str()); + mtmd_input_text inp_txt = { + prompt.get(), + /* add_special */ true, + /* parse_special */ true, + }; + mtmd_input_chunks * tokenized = mtmd_tokenize(ctx_server.mctx, inp_txt, bitmaps); + if (!tokenized) { + throw std::runtime_error("Failed to tokenize prompt"); + } + server_inputs tmp(tokenized); + inputs.push_back(std::move(tmp)); + mtmd_input_chunks_free(tokenized, false); // only free the container, not the images + } + } else { + // non-multimodal version + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); + for (auto & p : tokenized_prompts) { + auto tmp = convert_legacy_to_mtmd(p); + inputs.push_back(std::move(tmp)); + } + } + + tasks.reserve(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { server_task task = server_task(type); task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = std::move(tokenized_prompts[i]); + task.prompt_tokens = std::move(inputs[i]); task.params = server_task::params_from_json_cmpl( ctx_server.ctx, ctx_server.params_base, @@ -3950,7 +4042,7 @@ int main(int argc, char ** argv) { task.params.oaicompat_cmpl_id = completion_id; // oaicompat_model is already populated by params_from_json_cmpl - tasks.push_back(task); + tasks.push_back(std::move(task)); } } catch (const std::exception & e) { res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); @@ -4020,9 +4112,11 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); + std::vector files; // dummy return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + files, req.is_connection_closed, res, OAICOMPAT_TYPE_NONE); @@ -4030,9 +4124,11 @@ int main(int argc, char ** argv) { const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = oaicompat_completion_params_parse(json::parse(req.body)); + std::vector files; // dummy return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + files, req.is_connection_closed, res, OAICOMPAT_TYPE_COMPLETION); @@ -4107,9 +4203,11 @@ int main(int argc, char ** argv) { tokenized_prompts[0] ); + std::vector files; // dummy return handle_completions_impl( SERVER_TASK_TYPE_INFILL, data, + files, req.is_connection_closed, res, OAICOMPAT_TYPE_NONE); // infill is not OAI compatible @@ -4123,11 +4221,13 @@ int main(int argc, char ** argv) { } auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get()); + std::vector files; + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get(), 
files); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + files, req.is_connection_closed, res, OAICOMPAT_TYPE_CHAT); @@ -4136,7 +4236,8 @@ int main(int argc, char ** argv) { // same with handle_chat_completions, but without inference part const auto handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) { auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get()); + std::vector files; // dummy, unused + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get(), files); res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); }; @@ -4241,7 +4342,7 @@ int main(int argc, char ** argv) { } } - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); for (const auto & tokens : tokenized_prompts) { // this check is necessary for models that do not add BOS token to the input if (tokens.empty()) { @@ -4260,12 +4361,12 @@ int main(int argc, char ** argv) { task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = std::move(tokenized_prompts[i]); + task.prompt_tokens = convert_legacy_to_mtmd(tokenized_prompts[i]); // OAI-compat task.params.oaicompat = oaicompat; - tasks.push_back(task); + tasks.push_back(std::move(task)); } ctx_server.queue_results.add_waiting_tasks(tasks); @@ -4354,14 +4455,15 @@ int main(int argc, char ** argv) { bool error = false; { std::vector tasks; - std::vector tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); + auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); tasks.reserve(tokenized_docs.size()); for (size_t i = 0; i < tokenized_docs.size(); i++) { + auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); - tasks.push_back(task); + task.prompt_tokens = convert_legacy_to_mtmd(tmp); + tasks.push_back(std::move(task)); } ctx_server.queue_results.add_waiting_tasks(tasks); @@ -4566,7 +4668,7 @@ int main(int argc, char ** argv) { common_chat_templates_source(ctx_server.chat_templates.get()), common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str()); - ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) { + ctx_server.queue_tasks.on_new_task([&ctx_server](server_task & task) { ctx_server.process_single_task(task); }); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index aba2f27f9b564..5103e22e163dd 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -4,6 +4,7 @@ #include "log.h" #include "llama.h" #include "base64.hpp" +#include "mtmd.h" // increase max payload length to allow use of larger context size #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 @@ -21,6 +22,7 @@ #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" @@ -41,6 +43,8 @@ using json = nlohmann::ordered_json; #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +using raw_buffer = std::vector; + template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -386,7 +390,7 @@ static inline bool is_base64(uint8_t c) { return (isalnum(c) || (c == '+') || (c == '/')); } -static inline std::vector base64_decode(const std::string & encoded_string) { +static inline raw_buffer base64_decode(const std::string & encoded_string) { int i = 0; int j = 0; int in_ = 0; @@ -396,7 +400,7 @@ static inline std::vector base64_decode(const std::string & encoded_str uint8_t char_array_4[4]; uint8_t char_array_3[3]; - std::vector ret; + raw_buffer ret; while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { char_array_4[i++] = encoded_string[in_]; in_++; @@ -579,7 +583,8 @@ static json oaicompat_completion_params_parse( const json & body, /* openai api json semantics */ bool use_jinja, common_reasoning_format reasoning_format, - const struct common_chat_templates * tmpls) + const struct common_chat_templates * tmpls, + std::vector & out_files) { json llama_params; @@ -627,8 +632,47 @@ static json oaicompat_completion_params_parse( } } + // get input files + json messages = json_value(body, "messages", json::array()); + if (!messages.is_array()) { + throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump()); + } + for (auto & msg : messages) { + json & content = msg.at("content"); + if (content.is_string()) { + continue; + } + + if (!content.is_array()) { + throw std::runtime_error("Expected 'content' to be a string or an array"); + } + + for (auto & p : content) { + std::string type = json_value(p, "type", std::string()); + json image_url = json_value(p, "image_url", json::object()); + if (type == "image_url") { + std::string url = json_value(image_url, "url", std::string()); + std::vector parts = string_split(url, /*separator*/ ','); + if (parts.size() != 2) { + throw std::runtime_error("Invalid image_url.url value"); + } else if (!string_starts_with(parts[0], "data:image/")) { + throw std::runtime_error("Invalid image_url.url format: " + parts[0]); + } else if (!string_ends_with(parts[0], "base64")) { + throw std::runtime_error("image_url.url must be base64 encoded"); + } else { + auto base64_data = parts[1]; + auto decoded_data = base64_decode(base64_data); + out_files.push_back(decoded_data); + } + p["type"] = "text"; + p["text"] = "<__image__>"; + p.erase("image_url"); + } + } + } + common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages")); + inputs.messages = common_chat_msgs_parse_oaicompat(messages); inputs.tools = common_chat_tools_parse_oaicompat(tools); inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto"))); inputs.json_schema = json_schema.is_null() ? 
"" : json_schema.dump(); @@ -913,3 +957,231 @@ static std::vector parse_lora_request( return lora; } + +// +// utils for interacting with libmtmd +// (may need to refactor in near future) +// + +struct server_inp_chunk { + llama_token tok_text; + mtmd_image_tokens_ptr tok_image; + std::string str() { + // for debugging + if (tok_image) { + return " "; + } else { + return std::to_string(tok_text) + " "; + } + } +}; + +struct server_inputs { + std::vector chunks; + + server_inputs() = default; + ~server_inputs() = default; // Important if unique_ptr is used + + // Prevent copying + server_inputs(const server_inputs&) = delete; + server_inputs& operator=(const server_inputs&) = delete; + + // Allow moving (usually implicitly generated if members are movable) + server_inputs(server_inputs&&) = default; + server_inputs& operator=(server_inputs&&) = default; + + server_inputs(mtmd_input_chunks * mtmd_chunks) { + for (auto & c : *mtmd_chunks) { + if (c.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + chunks.push_back({LLAMA_TOKEN_NULL, mtmd_image_tokens_ptr(c.tokens_image)}); + } else if (c.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + for (auto & tok : c.tokens_text) { + chunks.push_back({tok, nullptr}); + } + } else { + GGML_ASSERT(false && "Invalid chunk type"); + } + } + } + + size_t n_tokens() const { + size_t res = 0; + for (const auto & chunk : chunks) { + if (chunk.tok_image) { + res += mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()); + } else { + res++; + } + } + return res; + } + + bool empty() const { + return n_tokens() == 0; + } + + void clear() { + chunks.clear(); + } + + void add_text_token(llama_token tok) { + GGML_ASSERT(tok != LLAMA_TOKEN_NULL); + chunks.push_back({tok, nullptr}); + } + + size_t get_common_prefix(const server_inputs & b) const { + size_t ret = 0; + size_t max_idx = std::min(chunks.size(), b.chunks.size()); + for (size_t i = 0; i < max_idx; ++i) { + auto & ai = chunks[i]; + auto & bi = b.chunks[i]; + + if (ai.tok_text == bi.tok_text && !ai.tok_image && !bi.tok_image) { + ret++; + continue; + } else if (ai.tok_image && bi.tok_image) { + // TODO check image hash + break; + } else { + break; + } + } + return ret; + } + + bool validate(llama_token max_vocab_id) const { + for (const auto & chunk : chunks) { + if (!chunk.tok_image) { + if (chunk.tok_text < 0 || chunk.tok_text >= max_vocab_id) { + return false; + } + } + } + return true; + } + + server_inp_chunk & get_chunk(size_t pos) { + return chunks[get_chunk_idx(pos)]; + } + + size_t get_chunk_idx(size_t pos) const { + size_t current_pos = 0; + for (size_t i = 0; i < chunks.size(); ++i) { + const auto & chunk = chunks[i]; + size_t chunk_size = chunk.tok_image ? mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()) : 1; + size_t chunk_end_pos = current_pos + chunk_size; + if (pos < chunk_end_pos) { + // The target position 'pos' falls within this chunk + return i; + } + + current_pos = chunk_end_pos; + } + // If the loop finishes, 'pos' is >= the total number of logical positions + return chunks.size(); + } + + // same idea with std::vector resize() + void keep_until(size_t pos) { + if (pos == 0) { + chunks.clear(); + return; + } + + size_t current_pos = 0; + for (size_t i = 0; i < chunks.size(); ++i) { + const auto & chunk = chunks[i]; + size_t chunk_size = chunk.tok_image ? mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()) : 1; + size_t chunk_end_pos = current_pos + chunk_size; + if (pos <= current_pos) { + // Truncation point is exactly at or before the start of this chunk. + // Keep only chunks before index 'i'. 
+ chunks.resize(i); + return; + } + if (pos < chunk_end_pos) { + // Truncation point 'pos' falls within this chunk. + if (chunk.tok_image) { + // It's an image chunk, keep the whole chunk. + // Keep chunks up to and including index 'i'. + chunks.resize(i + 1); + } else { + // It's a text chunk. Since pos < chunk_end_pos and chunk_size is 1, + // this means pos == current_pos. + // Keep only chunks before index 'i'. + chunks.resize(i); + } + return; + } + // pos >= chunk_end_pos, so keep this chunk entirely and continue. + current_pos = chunk_end_pos; + } + // If the loop completes, it means 'pos' is >= the total logical size. + // No truncation needed, the vector remains unchanged. + } +}; + +// helper struct to make working with embd batch easier +// note: this will be removed after llama_batch_ext refactoring +struct server_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + server_embd_batch() = default; + server_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +// TODO @ngxson : quite hacky for now, but just to see if it works +static int32_t server_encode_image(mtmd_context * mctx, server_embd_batch & batch_out, server_inp_chunk & chunk, llama_pos n_past, llama_seq_id seq_id) { + GGML_ASSERT(chunk.tok_image); + + int64_t t0 = ggml_time_ms(); + LOG_INF("encoding image...\n"); + int32_t ret = mtmd_encode(mctx, chunk.tok_image.get()); + if (ret != 0) { + LOG_ERR("failed to encode image\n"); + batch_out = server_embd_batch{}; + return ret; + } + LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()); + float * embd = mtmd_get_output_embd(mctx); + batch_out = server_embd_batch(embd, n_tokens, n_past, seq_id); + return ret; +} + +// hacky, support text-only for now +static server_inputs convert_legacy_to_mtmd(llama_tokens & tokenized) { + server_inputs res; + for (auto & tok : tokenized) { + res.add_text_token(tok); + } + return res; +} From 2317e618b5df417eaf3980ddf6114caae7392dfb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 11 Apr 2025 17:46:25 +0200 Subject: [PATCH 02/59] mtmd : add more api around mtmd_image_tokens --- examples/llava/mtmd.cpp | 39 ++++++++++++++++++++++++++++++++++----- examples/llava/mtmd.h | 23 ++++++++++++++++++++--- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 58503d0b22c33..be856c0fa9ed6 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -166,15 +166,36 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, return output; } -void mtmd_input_chunks_free(mtmd_input_chunks * chunks) { - for (auto & chunk : *chunks) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) { - delete chunk.tokens_image; +void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { + if 
(image_tokens) { + delete image_tokens; + } +} + +void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) { + if (free_images) { + for (auto & chunk : *chunks) { + if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) { + mtmd_image_tokens_free(chunk.tokens_image); + chunk.tokens_image = nullptr; + } } } delete chunks; } +size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { + return image_tokens->n_tokens(); +} + +size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { + return image_tokens->nx; +} + +size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { + return image_tokens->ny; +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -289,7 +310,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } - int32_t n_tokens = chunk.tokens_image->n_tokens(); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image); float * embd = mtmd_get_output_embd(ctx); decode_embd_batch batch_img(embd, n_tokens, n_past, 0); int64_t t1 = ggml_time_ms(); @@ -339,3 +360,11 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp std::memcpy(output.data.data(), data, output.nx * output.ny * 3); return 0; } + +bool mtmd_decode_use_non_causal(mtmd_context * ctx) { + projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + return true; + } + return false; +} diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 598f6947bb092..ca3fb6fdc7960 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -81,13 +81,20 @@ MTMD_API void mtmd_free(mtmd_context * ctx); // 2. (image tokens) // 3. "\ndescribe it in detail." 
// number of bitmaps must be equal to the number of image markers in the prompt +// the returned value must be freed using mtmd_input_chunks_free() // this function is thread-safe (shared ctx) MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, const mtmd_input_text & text, const std::vector & bitmaps); -// free image chunk data -MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free() +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images); + +// access mtmd_image_tokens +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success MTMD_API int32_t mtmd_encode(mtmd_context * ctx, @@ -96,6 +103,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx, // get output embeddings from the last encode pass MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); +// whether we need to set non-causal mask before llama_decode +MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); + + + // // helper functions (can be implemented based on other functions) // @@ -133,10 +145,15 @@ struct mtmd_context_deleter { using mtmd_context_ptr = std::unique_ptr; struct mtmd_input_chunks_deleter { - void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } + void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); } }; using mtmd_input_chunks_ptr = std::unique_ptr; +struct mtmd_image_tokens_deleter { + void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } +}; +using mtmd_image_tokens_ptr = std::unique_ptr; + #else static_assert(false && "C header is not yet supported by this library"); From a46b6db6844c2d213965d7450a7eb0d2588d88e3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 11 Apr 2025 17:46:25 +0200 Subject: [PATCH 03/59] mtmd : add more api around mtmd_image_tokens --- examples/llava/mtmd.cpp | 39 ++++++++++++++++++++++++++++++++++----- examples/llava/mtmd.h | 23 ++++++++++++++++++++--- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 114c274bc1250..98d660a643809 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -166,15 +166,36 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, return output; } -void mtmd_input_chunks_free(mtmd_input_chunks * chunks) { - for (auto & chunk : *chunks) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) { - delete chunk.tokens_image; +void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { + if (image_tokens) { + delete image_tokens; + } +} + +void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) { + if (free_images) { + for (auto & chunk : *chunks) { + if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) { + mtmd_image_tokens_free(chunk.tokens_image); + chunk.tokens_image = nullptr; + } } } delete chunks; } +size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { + return image_tokens->n_tokens(); +} + +size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { + return image_tokens->nx; +} + +size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { + 
return image_tokens->ny; +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -289,7 +310,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } - int32_t n_tokens = chunk.tokens_image->n_tokens(); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image); float * embd = mtmd_get_output_embd(ctx); decode_embd_batch batch_img(embd, n_tokens, n_past, 0); int64_t t1 = ggml_time_ms(); @@ -339,3 +360,11 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp std::memcpy(output.data.data(), data, output.nx * output.ny * 3); return 0; } + +bool mtmd_decode_use_non_causal(mtmd_context * ctx) { + projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + return true; + } + return false; +} diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 598f6947bb092..ca3fb6fdc7960 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -81,13 +81,20 @@ MTMD_API void mtmd_free(mtmd_context * ctx); // 2. (image tokens) // 3. "\ndescribe it in detail." // number of bitmaps must be equal to the number of image markers in the prompt +// the returned value must be freed using mtmd_input_chunks_free() // this function is thread-safe (shared ctx) MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, const mtmd_input_text & text, const std::vector & bitmaps); -// free image chunk data -MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free() +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images); + +// access mtmd_image_tokens +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success MTMD_API int32_t mtmd_encode(mtmd_context * ctx, @@ -96,6 +103,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx, // get output embeddings from the last encode pass MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); +// whether we need to set non-causal mask before llama_decode +MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); + + + // // helper functions (can be implemented based on other functions) // @@ -133,10 +145,15 @@ struct mtmd_context_deleter { using mtmd_context_ptr = std::unique_ptr; struct mtmd_input_chunks_deleter { - void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } + void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); } }; using mtmd_input_chunks_ptr = std::unique_ptr; +struct mtmd_image_tokens_deleter { + void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } +}; +using mtmd_image_tokens_ptr = std::unique_ptr; + #else static_assert(false && "C header is not yet supported by this library"); From 7ac0b7b7b0433eacd8c9cabf3734f092637e6212 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 11 Apr 2025 22:17:47 +0200 Subject: [PATCH 04/59] mtmd : ability to calc image hash --- examples/llava/gemma3-cli.cpp | 1 + examples/llava/mtmd.cpp | 29 
++++++++++++++++++++++++++++- examples/llava/mtmd.h | 12 ++++++++---- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp index 91a07e2a8f40d..b200d8f111918 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp @@ -89,6 +89,7 @@ struct gemma3_context { ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ /* use_gpu */ true, /* timings */ true, + /* hash */ false, /* n_threads */ params.cpuparams.n_threads, /* verbosity */ GGML_LOG_LEVEL_INFO, })); diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 98d660a643809..1691a71bf27fc 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -16,15 +16,22 @@ struct mtmd_context { struct clip_ctx * ctx_clip; const struct llama_model * text_model; std::vector image_embd_v; // image embedding vector + bool print_timings; int n_threads; std::string image_marker; + bool calc_image_hash; // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, const llama_model * text_model, - const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) { + const mtmd_context_params & ctx_params) : + print_timings (ctx_params.print_timings), + n_threads (ctx_params.n_threads), + image_marker (ctx_params.image_marker), + calc_image_hash(ctx_params.calc_image_hash) + { clip_context_params ctx_clip_params; ctx_clip_params.use_gpu = ctx_params.use_gpu; ctx_clip_params.verbosity = ctx_params.verbosity; @@ -49,6 +56,7 @@ struct mtmd_image_tokens { uint32_t ny; // number of tokens in y direction uint32_t n_tokens() const { return nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches + size_t image_hash = 0; // hash of the image, useful for KV cache tracking }; mtmd_context * mtmd_init_from_file(const char * mmproj_fname, @@ -88,6 +96,16 @@ static std::vector mtmd_tokenize_text_internal( return result; } +static uint64_t hash_vector_float(const std::vector & vec) { + uint64_t seed = vec.size(); + std::hash hasher; + for (float val : vec) { + // inspired by boost::hash_combine + seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; +} + mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, const mtmd_input_text & text, const std::vector & bitmaps) { @@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, image_tokens->ny = 1; // TODO image_tokens->batch_f32 = std::move(batch_f32); + // optionally calculate the hash + if (ctx->calc_image_hash) { + image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf); + } + mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, @@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { return image_tokens->ny; } +uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) { + return image_tokens->image_hash; +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index ca3fb6fdc7960..cadcfa16fdceb 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -52,6 +52,9 @@ using mtmd_input_chunks = std::vector; struct mtmd_context_params { bool use_gpu = true; bool print_timings = true; + // calc_image_hash is useful for tracking KV 
cache + // if not set, mtmd_image_tokens_get_hash will return 0 + bool calc_image_hash = false; int n_threads = 4; enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO; const char * image_marker = "<__image__>"; @@ -91,10 +94,11 @@ MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images); // access mtmd_image_tokens -MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); -MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens); +MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success MTMD_API int32_t mtmd_encode(mtmd_context * ctx, From 58c47674aac9704cfbc2f8e44ebbbe318edc432e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 12 Apr 2025 10:34:12 +0200 Subject: [PATCH 05/59] shared_ptr for mtmd_image_tokens --- examples/llava/gemma3-cli.cpp | 11 +++---- examples/llava/mtmd.cpp | 56 +++++++++++++++-------------------- examples/llava/mtmd.h | 32 +++++++++----------- 3 files changed, 44 insertions(+), 55 deletions(-) diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp index b200d8f111918..34296c87132b0 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp @@ -185,18 +185,19 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector text.text = formatted_chat.prompt; text.add_special = add_bos; text.parse_special = true; - mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps)); - if (chunks == nullptr) { - LOG_ERR("Unable to tokenize prompt\n"); + mtmd_input_chunks chunks; + int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps); + if (res != 0) { + LOG_ERR("Unable to tokenize prompt, res = %d\n", res); return 1; } - if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) { + if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) { LOG_ERR("Unable to eval prompt\n"); return 1; } - ctx.n_past += mtmd_helper_get_n_tokens(chunks.get()); + ctx.n_past += mtmd_helper_get_n_tokens(chunks); return 0; } diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 1691a71bf27fc..44e48c7270368 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -106,10 +106,10 @@ static uint64_t hash_vector_float(const std::vector & vec) { return seed; } -mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, - const mtmd_input_text & text, - const std::vector & bitmaps) { - mtmd_input_chunks * output = new mtmd_input_chunks; +int32_t mtmd_tokenize(mtmd_context * ctx, + std::vector & output, + const mtmd_input_text & text, + const std::vector & bitmaps) { auto vocab = llama_model_get_vocab(ctx->text_model); std::string prompt_modified(text.text); @@ -124,8 +124,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, } std::vector parts = string_split_str(text.text, ctx->image_marker); - output->clear(); - 
output->reserve(parts.size()); + output.clear(); + output.reserve(parts.size()); size_t i_img = 0; @@ -141,14 +141,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, std::move(tokens), {}, }; - output->emplace_back(std::move(chunk)); + output.emplace_back(std::move(chunk)); if (&parts.back() != &part) { // add image token to middle of 2 parts if (i_img >= bitmaps.size()) { LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size()); - return nullptr; + return 1; } // shim layer @@ -163,10 +163,10 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32); if (!ok) { LOG_ERR("Unable to preprocess image\n"); - return nullptr; + return 2; } - mtmd_image_tokens * image_tokens = new mtmd_image_tokens; + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image image_tokens->ny = 1; // TODO image_tokens->batch_f32 = std::move(batch_f32); @@ -179,14 +179,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, - image_tokens, + std::move(image_tokens), }; - output->emplace_back(std::move(chunk)); + output.emplace_back(std::move(chunk)); i_img++; } } - return output; + return 0; } void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { @@ -195,18 +195,6 @@ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { } } -void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) { - if (free_images) { - for (auto & chunk : *chunks) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) { - mtmd_image_tokens_free(chunk.tokens_image); - chunk.tokens_image = nullptr; - } - } - } - delete chunks; -} - size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { return image_tokens->n_tokens(); } @@ -238,9 +226,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } -size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) { +size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { size_t n_tokens = 0; - for (auto & chunk : *chunks) { + for (auto & chunk : chunks) { if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { n_tokens += chunk.tokens_text.size(); } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { @@ -289,7 +277,7 @@ struct decode_embd_batch { int32_t mtmd_helper_eval(mtmd_context * ctx, llama_context * lctx, - mtmd_input_chunks * chunks, + mtmd_input_chunks & chunks, llama_pos pos0, llama_seq_id seq_id, int32_t n_batch) { @@ -297,8 +285,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); - for (auto & chunk : *chunks) { - bool is_last = &chunk == &chunks->back(); + for (auto & chunk : chunks) { + bool is_last = &chunk == &chunks.back(); if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { // TODO @ngxson : may need to split into smaller batches text_batch.n_tokens = chunk.tokens_text.size(); @@ -327,7 +315,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, if (ctx->print_timings) { LOG_INF("encoding image...\n"); } - ret = mtmd_encode(ctx, chunk.tokens_image); + ret = mtmd_encode(ctx, chunk.tokens_image.get()); if (ret != 0) { LOG_ERR("failed to encode image\n"); llama_batch_free(text_batch); @@ -337,7 +325,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } - int32_t n_tokens = 
mtmd_image_tokens_get_n_tokens(chunk.tokens_image); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); float * embd = mtmd_get_output_embd(ctx); decode_embd_batch batch_img(embd, n_tokens, n_past, 0); int64_t t1 = ggml_time_ms(); @@ -395,3 +383,7 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) { } return false; } + +void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) { + mtmd_image_tokens_free(val); +} diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index cadcfa16fdceb..f07814a56208c 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -41,10 +41,15 @@ struct mtmd_bitmap { std::vector data; }; +struct mtmd_image_tokens_deleter { + void operator()(mtmd_image_tokens * val); // forward declaration +}; +using mtmd_image_tokens_ptr = std::unique_ptr; + struct mtmd_input_chunk { mtmd_input_chunk_type type; std::vector tokens_text; - mtmd_image_tokens * tokens_image = nullptr; + mtmd_image_tokens_ptr tokens_image; }; using mtmd_input_chunks = std::vector; @@ -84,15 +89,16 @@ MTMD_API void mtmd_free(mtmd_context * ctx); // 2. (image tokens) // 3. "\ndescribe it in detail." // number of bitmaps must be equal to the number of image markers in the prompt -// the returned value must be freed using mtmd_input_chunks_free() // this function is thread-safe (shared ctx) -MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, +// return values: +// 0 on success +// 1 on number of images not matching the number of markers +// 2 on image preprocessing error +MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, + std::vector & output, const mtmd_input_text & text, const std::vector & bitmaps); -// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free() -MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images); - // access mtmd_image_tokens MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); @@ -117,7 +123,7 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); // // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past -MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks); +MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks); // helper function that automatically: // 1. 
run llama_decode() on text chunks @@ -126,7 +132,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks); // otherwise, returns 0 on success MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx, llama_context * lctx, - mtmd_input_chunks * chunks, + mtmd_input_chunks & chunks, llama_pos pos0, llama_seq_id seq_id, int32_t n_batch); @@ -148,16 +154,6 @@ struct mtmd_context_deleter { }; using mtmd_context_ptr = std::unique_ptr; -struct mtmd_input_chunks_deleter { - void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); } -}; -using mtmd_input_chunks_ptr = std::unique_ptr; - -struct mtmd_image_tokens_deleter { - void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } -}; -using mtmd_image_tokens_ptr = std::unique_ptr; - #else static_assert(false && "C header is not yet supported by this library"); From d3c3e20c424b02fedbef8d2fdddd0061c6255348 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 12 Apr 2025 11:03:38 +0200 Subject: [PATCH 06/59] move hash to user-define ID (fixed) --- examples/llava/gemma3-cli.cpp | 1 - examples/llava/mtmd.cpp | 25 +++++-------------------- examples/llava/mtmd.h | 14 ++++++-------- 3 files changed, 11 insertions(+), 29 deletions(-) diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp index 34296c87132b0..de206c85ae80c 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp @@ -89,7 +89,6 @@ struct gemma3_context { ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ /* use_gpu */ true, /* timings */ true, - /* hash */ false, /* n_threads */ params.cpuparams.n_threads, /* verbosity */ GGML_LOG_LEVEL_INFO, })); diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 44e48c7270368..0898439d11d48 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -29,8 +29,7 @@ struct mtmd_context { const mtmd_context_params & ctx_params) : print_timings (ctx_params.print_timings), n_threads (ctx_params.n_threads), - image_marker (ctx_params.image_marker), - calc_image_hash(ctx_params.calc_image_hash) + image_marker (ctx_params.image_marker) { clip_context_params ctx_clip_params; ctx_clip_params.use_gpu = ctx_params.use_gpu; @@ -56,7 +55,7 @@ struct mtmd_image_tokens { uint32_t ny; // number of tokens in y direction uint32_t n_tokens() const { return nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches - size_t image_hash = 0; // hash of the image, useful for KV cache tracking + std::string id; // optional user-defined ID, useful for KV cache tracking }; mtmd_context * mtmd_init_from_file(const char * mmproj_fname, @@ -96,16 +95,6 @@ static std::vector mtmd_tokenize_text_internal( return result; } -static uint64_t hash_vector_float(const std::vector & vec) { - uint64_t seed = vec.size(); - std::hash hasher; - for (float val : vec) { - // inspired by boost::hash_combine - seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; -} - int32_t mtmd_tokenize(mtmd_context * ctx, std::vector & output, const mtmd_input_text & text, @@ -170,11 +159,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image image_tokens->ny = 1; // TODO image_tokens->batch_f32 = std::move(batch_f32); - - // optionally calculate the hash - if (ctx->calc_image_hash) { - image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf); - } + image_tokens->id = bitmaps[i_img].id; // optional 
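            // note: the id copied here is whatever the caller stored in mtmd_bitmap::id (for
            // example a content hash); carrying it through to the image tokens lets downstream
            // code such as a KV cache compare two images cheaply by id instead of re-hashing
            // pixel data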
mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, @@ -207,8 +192,8 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { return image_tokens->ny; } -uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) { - return image_tokens->image_hash; +std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { + return image_tokens->id; } int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index f07814a56208c..78be192dd6eb6 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -39,6 +39,7 @@ struct mtmd_bitmap { uint32_t nx; uint32_t ny; std::vector data; + std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking }; struct mtmd_image_tokens_deleter { @@ -57,9 +58,6 @@ using mtmd_input_chunks = std::vector; struct mtmd_context_params { bool use_gpu = true; bool print_timings = true; - // calc_image_hash is useful for tracking KV cache - // if not set, mtmd_image_tokens_get_hash will return 0 - bool calc_image_hash = false; int n_threads = 4; enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO; const char * image_marker = "<__image__>"; @@ -100,11 +98,11 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, const std::vector & bitmaps); // access mtmd_image_tokens -MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); -MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens); -MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); +MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success MTMD_API int32_t mtmd_encode(mtmd_context * ctx, From 5e6c7ba4a8f765639dc947afac94bac629fec6cd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 13 Apr 2025 23:38:32 +0200 Subject: [PATCH 07/59] abstract out the batch management --- examples/llava/mtmd.cpp | 14 +---- examples/server/server.cpp | 111 +++++++++++++++++++++---------------- examples/server/utils.hpp | 110 ++++++++++++++++++++++++++---------- 3 files changed, 147 insertions(+), 88 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 687abfbc472ee..fe6d769095011 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -112,7 +112,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } - std::vector parts = string_split_str(text.text, ctx->image_marker); + std::vector parts = string_split_str(prompt_modified, ctx->image_marker); output.clear(); output.reserve(parts.size()); @@ -196,18 +196,6 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { return image_tokens->id; } -size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { - return image_tokens->n_tokens(); -} - -size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { - return 
image_tokens->nx; -} - -size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { - return image_tokens->ny; -} - int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 17b0ccfa108e1..2c4b0b876d576 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1859,7 +1859,7 @@ struct server_context { llama_context_params cparams_dft; - llama_batch batch = {}; + server_batch batch; bool clean_kv_cache = true; bool add_bos_token = true; @@ -1897,8 +1897,6 @@ struct server_context { llama_batch_free(slot.batch_spec); } - - llama_batch_free(batch); } bool load_model(const common_params & params) { @@ -2035,9 +2033,7 @@ struct server_context { // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); - - // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch = server_batch(std::max(n_batch, params_base.n_parallel)); } metrics.init(); @@ -2934,7 +2930,7 @@ struct server_context { }*/ // start populating the batch for this iteration - common_batch_clear(batch); + batch.clear(); // track if given slot can be batched with slots already in the batch server_slot * slot_batched = nullptr; @@ -2956,9 +2952,9 @@ struct server_context { continue; } - slot.i_batch = batch.n_tokens; + slot.i_batch = batch.n_tokens(); - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); + common_batch_add(batch.batch, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -2974,12 +2970,8 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // for multimodal - bool is_decoding_embd = false; - server_embd_batch batch_embd; - // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { + if (params_base.cont_batching || batch.n_tokens() == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one if (slot.is_processing()) { @@ -3147,7 +3139,7 @@ struct server_context { // non-causal tasks require to fit the entire prompt in the physical batch if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { + if (batch.n_tokens() + slot.n_prompt_tokens > n_batch) { continue; } } @@ -3167,36 +3159,55 @@ struct server_context { slot.cache_tokens.keep_until(slot.n_past); // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) { // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); if (curr_chunk.tok_image) { - // decode image - server_encode_image(slot.mctx, batch_embd, curr_chunk, slot.n_past, slot.id); - is_decoding_embd = true; - SLT_INF(slot, "decoding image, n_past = %d, n_tokens = %d\n", slot.n_past, batch_embd.batch.n_tokens); - slot.n_past += batch_embd.batch.n_tokens; 
- break; // do not process any other slots + // if there are already TEXT tokens in the batch, we need to process them first + if (batch.batch.n_tokens > 0) { + break; + } + // encode the image + server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id); + GGML_ASSERT(batch.has_embd()); + SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens()); + + if (slot.params.cache_prompt) { + slot.cache_tokens.add_image_tokens(curr_chunk.tok_image); + } + + slot.n_past += batch.n_tokens(); + slot.n_prompt_tokens_processed += batch.n_tokens(); + break; // we cannot have both text batch and image batch + } else { - common_batch_add(batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); + GGML_ASSERT(!batch.has_embd()); + common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.add_text_token(curr_chunk.tok_text); } + + slot.n_prompt_tokens_processed++; + slot.n_past++; } + } + + SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); - slot.n_prompt_tokens_processed++; - slot.n_past++; + if (batch.has_embd()) { + // currently, we can only process one image at a time, so we skip other slots + break; } - SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { slot.state = SLOT_STATE_DONE_PROMPT; - GGML_ASSERT(batch.n_tokens > 0); + GGML_ASSERT(batch.n_tokens() > 0); common_sampler_reset(slot.smpl); @@ -3209,27 +3220,32 @@ struct server_context { } // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; + batch.logits[batch.n_tokens() - 1] = true; slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; + slot.i_batch = batch.n_tokens() - 1; - SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens()); } } - if (batch.n_tokens >= n_batch) { + if (batch.n_tokens() >= n_batch) { break; } } } - if (batch.n_tokens == 0) { + if (batch.n_tokens() == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; } - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + // debug + if (batch.has_embd()) { + SRV_INF("decoding embd batch, n_tokens = %d\n", batch.n_tokens()); + } else { + SRV_INF("decoding batch, n_tokens = %d\n", batch.n_tokens()); + } if (slot_batched) { // make sure we're in the right embedding mode @@ -3239,28 +3255,29 @@ struct server_context { } // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + for (int32_t i = 0; i < batch.n_tokens(); i += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch.n_tokens() - i); - llama_batch batch_view = is_decoding_embd ? batch_embd.batch : llama_batch{ + // TODO @ngxson : hacky here, we don't want to split the embd batch + llama_batch batch_view = batch.has_embd() ? 
batch.batch : llama_batch{ n_tokens, - batch.token + i, + batch.batch.token + i, nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, + batch.batch.pos + i, + batch.batch.n_seq_id + i, + batch.batch.seq_id + i, + batch.batch.logits + i, }; // TODO @ngxson : maybe move this to llama_batch_ext - if (is_decoding_embd && mtmd_decode_use_non_causal(mctx)) { + if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) { llama_set_causal_attn(ctx, false); } const int ret = llama_decode(ctx, batch_view); metrics.on_decoded(slots); - if (is_decoding_embd && mtmd_decode_use_non_causal(mctx)) { + if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) { llama_set_causal_attn(ctx, true); } @@ -4006,13 +4023,13 @@ int main(int argc, char ** argv) { /* add_special */ true, /* parse_special */ true, }; - mtmd_input_chunks * tokenized = mtmd_tokenize(ctx_server.mctx, inp_txt, bitmaps); - if (!tokenized) { + mtmd_input_chunks chunks; + int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps); + if (tokenized != 0) { throw std::runtime_error("Failed to tokenize prompt"); } - server_inputs tmp(tokenized); + server_inputs tmp(chunks); inputs.push_back(std::move(tmp)); - mtmd_input_chunks_free(tokenized, false); // only free the container, not the images } } else { // non-multimodal version diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 5103e22e163dd..3bc0d0da17ec3 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -964,18 +964,26 @@ static std::vector parse_lora_request( // struct server_inp_chunk { + size_t n_tokens = 1; // always 1 in case of text llama_token tok_text; mtmd_image_tokens_ptr tok_image; - std::string str() { + std::string str() const { // for debugging if (tok_image) { - return " "; + return string_format("( at %p) ", (void *)tok_image.get()); } else { return std::to_string(tok_text) + " "; } } }; +/** + * server_inputs is a helper to manage the input tokens and image for the server. + * + * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens. + * + * it is made this way to simplify the logic of KV cache management. 
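 *
 * a minimal usage sketch (illustrative only, assuming `cached` holds the tokens already in the
 * KV cache and `chunks` was produced by mtmd_tokenize):
 *
 *   server_inputs incoming(chunks);
 *   size_t n_past = cached.get_common_prefix(incoming); // length of the reusable prefix
 *   // only logical positions >= n_past need to be evaluated again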
+ */ struct server_inputs { std::vector chunks; @@ -990,13 +998,14 @@ struct server_inputs { server_inputs(server_inputs&&) = default; server_inputs& operator=(server_inputs&&) = default; - server_inputs(mtmd_input_chunks * mtmd_chunks) { - for (auto & c : *mtmd_chunks) { + server_inputs(mtmd_input_chunks & mtmd_chunks) { + for (auto & c : mtmd_chunks) { if (c.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - chunks.push_back({LLAMA_TOKEN_NULL, mtmd_image_tokens_ptr(c.tokens_image)}); + size_t n_tokens = mtmd_image_tokens_get_n_tokens(c.tokens_image.get()); + chunks.push_back({n_tokens, LLAMA_TOKEN_NULL, std::move(c.tokens_image)}); } else if (c.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { for (auto & tok : c.tokens_text) { - chunks.push_back({tok, nullptr}); + chunks.push_back({1, tok, nullptr}); } } else { GGML_ASSERT(false && "Invalid chunk type"); @@ -1004,11 +1013,20 @@ struct server_inputs { } } + std::string str() { + // for debugging + std::string ret; + for (const auto & chunk : chunks) { + ret += chunk.str(); + } + return ret; + } + size_t n_tokens() const { size_t res = 0; for (const auto & chunk : chunks) { if (chunk.tok_image) { - res += mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()); + res += chunk.n_tokens; } else { res++; } @@ -1026,7 +1044,13 @@ struct server_inputs { void add_text_token(llama_token tok) { GGML_ASSERT(tok != LLAMA_TOKEN_NULL); - chunks.push_back({tok, nullptr}); + chunks.push_back({1, tok, nullptr}); + } + + void add_image_tokens(mtmd_image_tokens_ptr & image) { + GGML_ASSERT(image != nullptr); + size_t n_tokens = mtmd_image_tokens_get_n_tokens(image.get()); + chunks.push_back({n_tokens, LLAMA_TOKEN_NULL, std::move(image)}); } size_t get_common_prefix(const server_inputs & b) const { @@ -1068,8 +1092,7 @@ struct server_inputs { size_t current_pos = 0; for (size_t i = 0; i < chunks.size(); ++i) { const auto & chunk = chunks[i]; - size_t chunk_size = chunk.tok_image ? 
mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()) : 1; - size_t chunk_end_pos = current_pos + chunk_size; + size_t chunk_end_pos = current_pos + chunk.n_tokens; if (pos < chunk_end_pos) { // The target position 'pos' falls within this chunk return i; @@ -1123,57 +1146,88 @@ struct server_inputs { // helper struct to make working with embd batch easier // note: this will be removed after llama_batch_ext refactoring -struct server_embd_batch { +struct server_batch { std::vector pos; + std::vector token; std::vector n_seq_id; - std::vector seq_id_0; + std::vector seq_id; std::vector seq_ids; std::vector logits; + llama_batch batch; - server_embd_batch() = default; - server_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + + server_batch() : server_batch(1) {} + server_batch(int32_t n_tokens) { + token .resize(n_tokens); pos .resize(n_tokens); n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); logits .resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; + seq_id .resize(n_tokens); + seq_ids .resize(n_tokens + 1); + seq_ids[n_tokens] = nullptr; + batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, + /*n_tokens =*/ 0, + /*tokens =*/ token.data(), + /*embd =*/ nullptr, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), /*logits =*/ logits.data(), }; + + for (int i = 0; i < n_tokens; i++) { + batch.n_seq_id[i] = 1; // only a single seq_id per token is needed + batch.seq_id [i] = seq_id.data() + i; + } + } + + void reserve_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + GGML_ASSERT(n_tokens <= (int32_t)pos.size()); + seq_ids[n_tokens] = nullptr; + batch.n_tokens = n_tokens; + batch.embd = embd; + batch.token = nullptr; for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i][0] = seq_id; + batch.logits [i] = false; } } + + void clear() { + batch.n_tokens = 0; + batch.embd = nullptr; + batch.token = token.data(); + } + + int32_t n_tokens() const { + return batch.n_tokens; + } + + bool has_embd() const { + return batch.embd != nullptr; + } }; // TODO @ngxson : quite hacky for now, but just to see if it works -static int32_t server_encode_image(mtmd_context * mctx, server_embd_batch & batch_out, server_inp_chunk & chunk, llama_pos n_past, llama_seq_id seq_id) { +static int32_t server_encode_image(mtmd_context * mctx, server_batch & batch_out, server_inp_chunk & chunk, llama_pos n_past, llama_seq_id seq_id) { GGML_ASSERT(chunk.tok_image); + batch_out.clear(); int64_t t0 = ggml_time_ms(); LOG_INF("encoding image...\n"); int32_t ret = mtmd_encode(mctx, chunk.tok_image.get()); if (ret != 0) { LOG_ERR("failed to encode image\n"); - batch_out = server_embd_batch{}; return ret; } LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()); float * embd = mtmd_get_output_embd(mctx); - batch_out = server_embd_batch(embd, n_tokens, n_past, seq_id); + batch_out.reserve_embd_batch(embd, n_tokens, n_past, seq_id); return ret; } From a6a36537d2018417b51a83f767719ddd7abe8672 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 21 Apr 2025 22:41:04 +0200 Subject: [PATCH 08/59] small fix --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/server.cpp b/examples/server/server.cpp index a356e8180fcbb..fbabd2872b0c9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1670,7 +1670,7 @@ struct server_queue { lock.unlock(); QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(task); + callback_new_task(std::move(task)); } // all tasks in the current loop is processed, slots data is now ready From f8bc46629fa4e669697106f251dd9e7d957ff218 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 21 Apr 2025 23:18:44 +0200 Subject: [PATCH 09/59] refactor logic adding tokens to batch --- examples/server/server.cpp | 70 ++++++++++++++++++++------------------ examples/server/utils.hpp | 13 ++++--- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fbabd2872b0c9..e9b1de10cd1a5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2859,7 +2859,7 @@ struct server_context { res->id = task.id; queue_results.send(std::move(res)); } break; - + } } @@ -3159,49 +3159,51 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.keep_until(slot.n_past); - // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) { - // without pooling, we want to output the embeddings for all the tokens in the batch - const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); - auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); - if (curr_chunk.tok_image) { - // if there are already TEXT tokens in the batch, we need to process them first - if (batch.batch.n_tokens > 0) { - break; - } - // encode the image - server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id); - GGML_ASSERT(batch.has_embd()); - SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens()); + // check if we should process the image + if (curr_chunk.tok_image) { + if (batch.has_text()) { + continue; // we cannot have both text batch and image batch + } - if (slot.params.cache_prompt) { - slot.cache_tokens.add_image_tokens(curr_chunk.tok_image); - } + // encode the image + server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id); + GGML_ASSERT(batch.has_embd()); + SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens()); - slot.n_past += batch.n_tokens(); - slot.n_prompt_tokens_processed += batch.n_tokens(); - break; // we cannot have both text batch and image batch + if (slot.params.cache_prompt) { + slot.cache_tokens.add_image_tokens(curr_chunk.tok_image); + } - } else { - GGML_ASSERT(!batch.has_embd()); - common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); - if (slot.params.cache_prompt) { - slot.cache_tokens.add_text_token(curr_chunk.tok_text); - } + slot.n_past += batch.n_tokens(); + slot.n_prompt_tokens_processed += batch.n_tokens(); - slot.n_prompt_tokens_processed++; - slot.n_past++; - } + break; // currently, we can only process one image at a time, so we skip ALL other slots } - SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); + // add prompt tokens for processing in the current batch + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) { + GGML_ASSERT(!batch.has_embd()); + auto & curr_chunk = 
slot.prompt_tokens.get_chunk(slot.n_past); + if (curr_chunk.tok_text == LLAMA_TOKEN_NULL) { + break; // end of text chunk + } - if (batch.has_embd()) { - // currently, we can only process one image at a time, so we skip other slots - break; + // without pooling, we want to output the embeddings for all the tokens in the batch + const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + + common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); + if (slot.params.cache_prompt) { + slot.cache_tokens.add_text_token(curr_chunk.tok_text); + } + + slot.n_prompt_tokens_processed++; + slot.n_past++; } + SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 3bc0d0da17ec3..ce7e2780e3c16 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -668,7 +668,7 @@ static json oaicompat_completion_params_parse( p["text"] = "<__image__>"; p.erase("image_url"); } - } + } } common_chat_templates_inputs inputs; @@ -979,9 +979,9 @@ struct server_inp_chunk { /** * server_inputs is a helper to manage the input tokens and image for the server. - * + * * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens. - * + * * it is made this way to simplify the logic of KV cache management. */ struct server_inputs { @@ -1184,7 +1184,6 @@ struct server_batch { void reserve_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { GGML_ASSERT(n_tokens <= (int32_t)pos.size()); - seq_ids[n_tokens] = nullptr; batch.n_tokens = n_tokens; batch.embd = embd; batch.token = nullptr; @@ -1207,7 +1206,11 @@ struct server_batch { } bool has_embd() const { - return batch.embd != nullptr; + return batch.embd != nullptr && batch.n_tokens > 0; + } + + bool has_text() const { + return batch.token != nullptr && batch.n_tokens > 0; } }; From f5420e1d90bf7228c12bb5f8cd85808c4cb00ba8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 21 Apr 2025 23:35:20 +0200 Subject: [PATCH 10/59] implement hashing image --- examples/server/CMakeLists.txt | 3 ++- examples/server/server.cpp | 19 ++++++++++++------- examples/server/utils.hpp | 10 ++++++++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 17109fddbd307..0ff77b0944881 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -35,8 +35,9 @@ add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_include_directories(${TARGET} PRIVATE ../llava) +target_include_directories(${TARGET} PRIVATE ../gguf-hash/deps/sha1) # TODO @ngxson : this is a hacky way to get this working, to be fixed before merging target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common mtmd sha1 ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 
e9b1de10cd1a5..af9e1270d40b0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -8,6 +8,7 @@ #include "sampling.h" #include "speculative.h" #include "mtmd.h" +#include "sha1.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -3202,7 +3203,7 @@ struct server_context { slot.n_past++; } - SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); + // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); @@ -3244,11 +3245,7 @@ struct server_context { } // debug - if (batch.has_embd()) { - SRV_INF("decoding embd batch, n_tokens = %d\n", batch.n_tokens()); - } else { - SRV_INF("decoding batch, n_tokens = %d\n", batch.n_tokens()); - } + SRV_DBG("decoding %s batch, n_tokens = %d\n", batch.has_embd() ? "embd" : "text", batch.n_tokens()); if (slot_batched) { // make sure we're in the right embedding mode @@ -4036,6 +4033,14 @@ int main(int argc, char ** argv) { { for (auto & file : files) { mtmd_bitmap bmp; + // calculate hash (for KV caching) + { + SHA1_CTX sha1_ctx; + SHA1Update(&sha1_ctx, (unsigned char const *)file.data(), file.size()); + unsigned char result[21]; + SHA1Final(result, &sha1_ctx); + bmp.id = std::string((char *)result, 20); + } int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp); if (res != 0) { throw std::runtime_error("Failed to load image"); @@ -4049,7 +4054,7 @@ int main(int argc, char ** argv) { if (!prompt.is_string()) { throw std::runtime_error("prompt must be a string"); } else { - printf("prompt: %s\n", prompt.get().c_str()); + // SRV_INF("prompt: %s\n", prompt.get().c_str()); mtmd_input_text inp_txt = { prompt.get(), /* add_special */ true, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index ce7e2780e3c16..d642d7831893c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1064,8 +1064,14 @@ struct server_inputs { ret++; continue; } else if (ai.tok_image && bi.tok_image) { - // TODO check image hash - break; + std::string ai_id = mtmd_image_tokens_get_id(ai.tok_image.get()); + std::string bi_id = mtmd_image_tokens_get_id(bi.tok_image.get()); + if (ai_id == bi_id) { + ret += mtmd_image_tokens_get_n_tokens(ai.tok_image.get()); + continue; + } else { + break; + } } else { break; } From cd115854786e5d83424cc97809d0b96746da5af6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 20:57:22 +0200 Subject: [PATCH 11/59] use FNV hash, now hash bitmap instead of file data --- examples/server/CMakeLists.txt | 3 +-- examples/server/server.cpp | 11 ++--------- examples/server/utils.hpp | 12 ++++++++++++ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 0ff77b0944881..17109fddbd307 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -35,9 +35,8 @@ add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_include_directories(${TARGET} PRIVATE ../llava) -target_include_directories(${TARGET} PRIVATE ../gguf-hash/deps/sha1) # TODO @ngxson : this is a hacky way to get this working, to be fixed before merging target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common mtmd sha1 ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} 
PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index af9e1270d40b0..c9c33a0778c87 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -8,7 +8,6 @@ #include "sampling.h" #include "speculative.h" #include "mtmd.h" -#include "sha1.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -4033,18 +4032,12 @@ int main(int argc, char ** argv) { { for (auto & file : files) { mtmd_bitmap bmp; - // calculate hash (for KV caching) - { - SHA1_CTX sha1_ctx; - SHA1Update(&sha1_ctx, (unsigned char const *)file.data(), file.size()); - unsigned char result[21]; - SHA1Final(result, &sha1_ctx); - bmp.id = std::string((char *)result, 20); - } int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp); if (res != 0) { throw std::runtime_error("Failed to load image"); } + // calculate bitmap hash (for KV caching) + bmp.id = server_inputs::fnv_hash(bmp.data.data(), bmp.data.size()); bitmaps.push_back(std::move(bmp)); } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d642d7831893c..6a711376f0a95 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1148,6 +1148,18 @@ struct server_inputs { // If the loop completes, it means 'pos' is >= the total logical size. // No truncation needed, the vector remains unchanged. } + + // Computes FNV-1a hash of the data + static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; + + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return std::to_string(hash); + } }; // helper struct to make working with embd batch easier From 8afa9528371fe7b0e7e5edd9fd9bdee06bb4f327 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 22:20:52 +0200 Subject: [PATCH 12/59] allow decoding image embedding to be split into batches --- examples/server/server.cpp | 88 +++++++++++++++----------------- examples/server/utils.hpp | 102 +++++++++++++++++++++++++++++-------- 2 files changed, 121 insertions(+), 69 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c9c33a0778c87..ec571de136c3e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1860,7 +1860,8 @@ struct server_context { llama_context_params cparams_dft; - server_batch batch; + llama_batch batch; + server_batch_embd batch_embd; bool clean_kv_cache = true; bool add_bos_token = true; @@ -1898,6 +1899,8 @@ struct server_context { llama_batch_free(slot.batch_spec); } + + llama_batch_free(batch); } bool load_model(const common_params & params) { @@ -2034,7 +2037,8 @@ struct server_context { // note that n_batch can be > n_ctx (e.g. 
for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); - batch = server_batch(std::max(n_batch, params_base.n_parallel)); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch_embd = server_batch_embd(std::max(n_batch, params_base.n_parallel)); } metrics.init(); @@ -2931,7 +2935,7 @@ struct server_context { }*/ // start populating the batch for this iteration - batch.clear(); + common_batch_clear(batch); // track if given slot can be batched with slots already in the batch server_slot * slot_batched = nullptr; @@ -2953,9 +2957,9 @@ struct server_context { continue; } - slot.i_batch = batch.n_tokens(); + slot.i_batch = batch.n_tokens; - common_batch_add(batch.batch, slot.sampled, slot.n_past, { slot.id }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -2972,7 +2976,7 @@ struct server_context { int32_t n_ubatch = llama_n_ubatch(ctx); // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens() == 0) { + if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one if (slot.is_processing()) { @@ -3140,7 +3144,7 @@ struct server_context { // non-causal tasks require to fit the entire prompt in the physical batch if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens() + slot.n_prompt_tokens > n_batch) { + if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; } } @@ -3163,28 +3167,26 @@ struct server_context { // check if we should process the image if (curr_chunk.tok_image) { - if (batch.has_text()) { - continue; // we cannot have both text batch and image batch + // process the image + int32_t res = server_img_process(ctx, mctx, curr_chunk, batch_embd, slot.n_past, slot.id); + if (res != 0) { + SLT_ERR(slot, "failed to process image, res = %d\n", res); + slot.release(); + send_error(slot, "failed to process image", ERROR_TYPE_SERVER); + continue; } - // encode the image - server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id); - GGML_ASSERT(batch.has_embd()); - SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens()); - if (slot.params.cache_prompt) { slot.cache_tokens.add_image_tokens(curr_chunk.tok_image); } - slot.n_past += batch.n_tokens(); - slot.n_prompt_tokens_processed += batch.n_tokens(); - - break; // currently, we can only process one image at a time, so we skip ALL other slots + slot.n_past += curr_chunk.n_tokens; + slot.n_prompt_tokens_processed += curr_chunk.n_tokens; } // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) { - GGML_ASSERT(!batch.has_embd()); + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + // get next token to process auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); if (curr_chunk.tok_text == LLAMA_TOKEN_NULL) { break; // end of text chunk @@ -3193,7 +3195,7 @@ struct server_context { // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); + common_batch_add(batch, curr_chunk.tok_text, 
slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.add_text_token(curr_chunk.tok_text); } @@ -3204,47 +3206,47 @@ struct server_context { // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str()); - SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { slot.state = SLOT_STATE_DONE_PROMPT; - GGML_ASSERT(batch.n_tokens() > 0); + GGML_ASSERT(batch.n_tokens > 0); common_sampler_reset(slot.smpl); // Process all prompt tokens through sampler system for (size_t i = 0; i < slot.cache_tokens.n_tokens(); ++i) { - auto & curr_chunk = slot.cache_tokens.get_chunk(i); + auto & curr_chunk = slot.prompt_tokens.get_chunk(i); if (curr_chunk.tok_text != LLAMA_TOKEN_NULL) { common_sampler_accept(slot.smpl, curr_chunk.tok_text, false); } } // extract the logits only for the last token - batch.logits[batch.n_tokens() - 1] = true; + batch.logits[batch.n_tokens - 1] = true; slot.n_decoded = 0; - slot.i_batch = batch.n_tokens() - 1; + slot.i_batch = batch.n_tokens - 1; - SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens()); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); } } - if (batch.n_tokens() >= n_batch) { + if (batch.n_tokens >= n_batch) { break; } } } - if (batch.n_tokens() == 0) { + if (batch.n_tokens == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; } // debug - SRV_DBG("decoding %s batch, n_tokens = %d\n", batch.has_embd() ? "embd" : "text", batch.n_tokens()); + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); if (slot_batched) { // make sure we're in the right embedding mode @@ -3254,32 +3256,22 @@ struct server_context { } // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens(); i += n_batch) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens() - i); + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - // TODO @ngxson : hacky here, we don't want to split the embd batch - llama_batch batch_view = batch.has_embd() ? 
batch.batch : llama_batch{ + llama_batch batch_view = llama_batch{ n_tokens, - batch.batch.token + i, + batch.token + i, nullptr, - batch.batch.pos + i, - batch.batch.n_seq_id + i, - batch.batch.seq_id + i, - batch.batch.logits + i, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, }; - // TODO @ngxson : maybe move this to llama_batch_ext - if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) { - llama_set_causal_attn(ctx, false); - } - const int ret = llama_decode(ctx, batch_view); metrics.on_decoded(slots); - if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) { - llama_set_causal_attn(ctx, true); - } - if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 6a711376f0a95..e6a67e2febd2b 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -963,6 +963,8 @@ static std::vector parse_lora_request( // (may need to refactor in near future) // +// each chunk can contain either one SINGLE text token or an image (multiple token embeddings) +// this is to simplify the logic of KV cache management struct server_inp_chunk { size_t n_tokens = 1; // always 1 in case of text llama_token tok_text; @@ -981,6 +983,15 @@ struct server_inp_chunk { * server_inputs is a helper to manage the input tokens and image for the server. * * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens. + * + * for example, server_inputs may contain 5 text tokens followed by 1 image chunk: + * 1 41 2635 325 463 + * + * in this example: + * - n_tokens() returns 5+15 = 20 total tokens + * - get_chunk(1) returns chunk containing token ID 41 + * - get_chunk(5) returns image chunk (15 tokens) + * - get_chunk(7) returns same image chunk * * it is made this way to simplify the logic of KV cache management. 
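 * (in the same example, chunks.size() is 6 - five single-token text chunks plus one image
 * chunk - even though n_tokens() reports 20 logical positions)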
*/ @@ -1079,6 +1090,7 @@ struct server_inputs { return ret; } + // make sure all text tokens are within the vocab range bool validate(llama_token max_vocab_id) const { for (const auto & chunk : chunks) { if (!chunk.tok_image) { @@ -1090,24 +1102,26 @@ struct server_inputs { return true; } + // pos is also referred as logical index server_inp_chunk & get_chunk(size_t pos) { - return chunks[get_chunk_idx(pos)]; + size_t physical_idx = get_chunk_physical_idx(pos); + return chunks[physical_idx]; } - size_t get_chunk_idx(size_t pos) const { + // returns physical_index + size_t get_chunk_physical_idx(size_t logical_idx) const { size_t current_pos = 0; for (size_t i = 0; i < chunks.size(); ++i) { const auto & chunk = chunks[i]; size_t chunk_end_pos = current_pos + chunk.n_tokens; - if (pos < chunk_end_pos) { + if (logical_idx < chunk_end_pos) { // The target position 'pos' falls within this chunk return i; } - current_pos = chunk_end_pos; } // If the loop finishes, 'pos' is >= the total number of logical positions - return chunks.size(); + throw std::out_of_range("Position out of range"); } // same idea with std::vector resize() @@ -1164,7 +1178,7 @@ struct server_inputs { // helper struct to make working with embd batch easier // note: this will be removed after llama_batch_ext refactoring -struct server_batch { +struct server_batch_embd { std::vector pos; std::vector token; std::vector n_seq_id; @@ -1174,8 +1188,8 @@ struct server_batch { llama_batch batch; - server_batch() : server_batch(1) {} - server_batch(int32_t n_tokens) { + server_batch_embd() : server_batch_embd(1) {} + server_batch_embd(int32_t n_tokens) { token .resize(n_tokens); pos .resize(n_tokens); n_seq_id.resize(n_tokens); @@ -1233,23 +1247,69 @@ struct server_batch { }; // TODO @ngxson : quite hacky for now, but just to see if it works -static int32_t server_encode_image(mtmd_context * mctx, server_batch & batch_out, server_inp_chunk & chunk, llama_pos n_past, llama_seq_id seq_id) { +static int32_t server_img_process( + llama_context * ctx, + mtmd_context * mctx, + server_inp_chunk & chunk, + server_batch_embd & batch, + llama_pos n_past, + int slot_id) { GGML_ASSERT(chunk.tok_image); - batch_out.clear(); - - int64_t t0 = ggml_time_ms(); - LOG_INF("encoding image...\n"); - int32_t ret = mtmd_encode(mctx, chunk.tok_image.get()); - if (ret != 0) { - LOG_ERR("failed to encode image\n"); - return ret; + int32_t ret; + + // encode the image + { + int64_t t0 = ggml_time_ms(); + SRV_INF("encoding image (%d tokens)...\n", (int)chunk.n_tokens); + ret = mtmd_encode(mctx, chunk.tok_image.get()); + if (ret != 0) { + SRV_ERR("failed to encode image, status = %d\n", ret); + return ret; + } + SRV_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } - LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()); float * embd = mtmd_get_output_embd(mctx); - batch_out.reserve_embd_batch(embd, n_tokens, n_past, seq_id); - return ret; + // decode the embeddings + int64_t t1 = ggml_time_ms(); + int32_t n_embd = llama_model_n_embd(llama_get_model(ctx)); + int32_t n_tokens = chunk.n_tokens; + int32_t n_batch = batch.pos.size(); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + // split into batches + while (i_batch < n_img_batches) { + int32_t pos_offset = i_batch*n_batch; + int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + float * embd_batch = embd + pos_offset*n_embd; + batch.clear(); + 
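        // reserve_embd_batch() points the batch at the current slice of the encoder output
        // (no copy is made); positions for this slice start at n_past, which is advanced by
        // n_tokens_batch at the end of each iteration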
batch.reserve_embd_batch(embd_batch, n_tokens_batch, n_past, slot_id); + + SRV_INF("decoding embd batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + + // TODO @ngxson : maybe move this to llama_batch_ext + if (mtmd_decode_use_non_causal(mctx)) { + llama_set_causal_attn(ctx, false); + } + + ret = llama_decode(ctx, batch.batch); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_set_causal_attn(ctx, true); // restore causal attn + return ret; + } + + if (mtmd_decode_use_non_causal(mctx)) { + llama_set_causal_attn(ctx, true); + } + + i_batch++; + n_past += n_tokens_batch; + } + SRV_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); + + batch.clear(); + return 0; } // hacky, support text-only for now From 989730c6e1b87b495f2fc36ab986094a8295d7e1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 22:21:40 +0200 Subject: [PATCH 13/59] rm whitespace --- examples/server/utils.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e6a67e2febd2b..0914a9c425cf8 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -983,10 +983,10 @@ struct server_inp_chunk { * server_inputs is a helper to manage the input tokens and image for the server. * * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens. - * + * * for example, server_inputs may contain 5 text tokens followed by 1 image chunk: * 1 41 2635 325 463 - * + * * in this example: * - n_tokens() returns 5+15 = 20 total tokens * - get_chunk(1) returns chunk containing token ID 41 @@ -1163,16 +1163,16 @@ struct server_inputs { // No truncation needed, the vector remains unchanged. 
} - // Computes FNV-1a hash of the data - static std::string fnv_hash(const uint8_t * data, size_t len) { - const uint64_t fnv_prime = 0x100000001b3ULL; - uint64_t hash = 0xcbf29ce484222325ULL; + // Computes FNV-1a hash of the data + static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; - for (size_t i = 0; i < len; ++i) { - hash ^= data[i]; - hash *= fnv_prime; - } - return std::to_string(hash); + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return std::to_string(hash); } }; From 2df8c1a4b422fbf30c01ae1de75506826dd3499f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Apr 2025 23:14:13 +0200 Subject: [PATCH 14/59] disable some features when mtmd is on --- examples/server/server.cpp | 135 +++++++++++++++++++++++++------------ examples/server/utils.hpp | 21 ++++++ 2 files changed, 112 insertions(+), 44 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ec571de136c3e..dde300decef44 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1983,6 +1983,21 @@ struct server_context { return false; } SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); + + if (params_base.ctx_shift) { + params_base.ctx_shift = false; + SRV_INF("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); + } + + if (params_base.n_cache_reuse) { + params_base.n_cache_reuse = 0; + SRV_INF("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); + } + + if (!params_base.speculative.model.path.empty()) { + SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); + return false; + } } return true; @@ -2432,6 +2447,7 @@ struct server_context { void send_final_response(server_slot & slot) { auto res = std::make_unique(); + llama_tokens text_tokens = slot.prompt_tokens.get_text_tokens(); res->id = slot.id_task; res->id_slot = slot.id; @@ -2439,7 +2455,7 @@ struct server_context { res->content = std::move(slot.generated_text); res->tokens = std::move(slot.generated_tokens); res->timings = slot.get_timings(); - //res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); // TODO @ngxson : hacky, need to fix + res->prompt = common_detokenize(ctx, text_tokens, true); res->response_fields = std::move(slot.params.response_fields); res->truncated = slot.truncated; @@ -2747,10 +2763,14 @@ struct server_context { } queue_results.send(std::move(res)); } break; - /*case SERVER_TASK_TYPE_SLOT_SAVE: + case SERVER_TASK_TYPE_SLOT_SAVE: { int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); + if (mctx) { + send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + break; + } if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; @@ -2762,13 +2782,14 @@ struct server_context { break; } - const size_t token_count = slot->cache_tokens.size(); + const size_t token_count = slot->cache_tokens.n_tokens(); const int64_t t_start = ggml_time_us(); std::string filename = task.slot_action.filename; std::string filepath = task.slot_action.filepath; - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); + const llama_tokens tokens = slot->cache_tokens.get_text_tokens(); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); const int64_t t_end = 
ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; @@ -2785,6 +2806,10 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { + if (mctx) { + send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + break; + } int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { @@ -2803,15 +2828,17 @@ struct server_context { std::string filename = task.slot_action.filename; std::string filepath = task.slot_action.filepath; - slot->cache_tokens.resize(slot->n_ctx); + llama_tokens tokens; + tokens.resize(slot->n_ctx); size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); if (nread == 0) { - slot->cache_tokens.resize(0); + slot->cache_tokens.clear(); // KV may already been invalidated? send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); break; } - slot->cache_tokens.resize(token_count); + tokens.resize(token_count); + slot->cache_tokens.set_text_tokens(tokens); const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -2828,6 +2855,10 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_ERASE: { + if (mctx) { + send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + break; + } int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { @@ -2842,7 +2873,7 @@ struct server_context { } // Erase token cache - const size_t n_erased = slot->cache_tokens.size(); + const size_t n_erased = slot->cache_tokens.n_tokens(); llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); @@ -2851,11 +2882,7 @@ struct server_context { res->id_slot = id_slot; res->n_erased = n_erased; queue_results.send(std::move(res)); - } break;*/ - case SERVER_TASK_TYPE_SLOT_SAVE: - case SERVER_TASK_TYPE_SLOT_RESTORE: - case SERVER_TASK_TYPE_SLOT_ERASE: - GGML_ASSERT(false && "TODO @ngxson : removed due to not compat with multimodal"); + } break; case SERVER_TASK_TYPE_SET_LORA: { params_base.lora_adapters = std::move(task.set_lora); @@ -2899,8 +2926,7 @@ struct server_context { // apply context-shift if needed // TODO: simplify and improve - // TODO @ngxson : hacky, need to disable context shift for multimodal - /*for (server_slot & slot : slots) { + for (server_slot & slot : slots) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { if (!params_base.ctx_shift) { // this check is redundant (for good) @@ -2910,6 +2936,12 @@ struct server_context { continue; } + if (mctx) { + // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded + // we don't support ctx_shift because an image chunk may contains multiple tokens + GGML_ABORT("not supported by multimodal"); + } + // Shift context const int n_keep = slot.params.n_keep + add_bos_token; const int n_left = slot.n_past - n_keep; @@ -2921,18 +2953,18 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + for (size_t i = n_keep + n_discard; i 
< slot.cache_tokens.chunks.size(); i++) { + slot.cache_tokens.chunks[i - n_discard] = std::move(slot.cache_tokens.chunks[i]); } - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + slot.cache_tokens.chunks.resize(slot.cache_tokens.chunks.size() - n_discard); } slot.n_past -= n_discard; slot.truncated = true; } - }*/ + } // start populating the batch for this iteration common_batch_clear(batch); @@ -3054,51 +3086,59 @@ struct server_context { slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); // if input prompt is too big, truncate it - // TODO @ngxson : this won't work with multimodal - /*if (slot.n_prompt_tokens >= slot.n_ctx) { + if (slot.n_prompt_tokens >= slot.n_ctx) { + if (mctx) { + // we should never reach this + GGML_ABORT("not supported by multimodal"); + } + llama_tokens curr_tokens = slot.prompt_tokens.get_text_tokens(); const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; llama_tokens new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); + curr_tokens.begin(), + curr_tokens.begin() + slot.params.n_keep); new_tokens.insert( new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, - prompt_tokens.end()); + curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, + curr_tokens.end()); - prompt_tokens = std::move(new_tokens); + prompt_tokens.set_text_tokens(new_tokens); slot.truncated = true; - slot.n_prompt_tokens = prompt_tokens.size(); + slot.n_prompt_tokens = prompt_tokens.n_tokens(); SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); - }*/ + } if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position - // TODO @ngxson : this won't work with multimodal - /*if (params_base.n_cache_reuse > 0) { + if (params_base.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache size_t head_p = slot.n_past; // current prompt + if (mctx) { + // we should never reach this + GGML_ABORT("not supported by multimodal"); + } + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past); - while (head_c < slot.cache_tokens.size() && - head_p < prompt_tokens.size()) { + while (head_c < slot.cache_tokens.chunks.size() && + head_p < prompt_tokens.chunks.size()) { size_t n_match = 0; - while (head_c + n_match < slot.cache_tokens.size() && - head_p + n_match < prompt_tokens.size() && - slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { + while (head_c + n_match < slot.cache_tokens.chunks.size() && + head_p + n_match < prompt_tokens.chunks.size() && + slot.cache_tokens.chunks[head_c + n_match].tok_text == prompt_tokens.chunks[head_p + n_match].tok_text) { n_match++; } @@ -3115,7 +3155,7 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { - slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; + slot.cache_tokens.chunks[head_p + i].tok_text = slot.cache_tokens.chunks[head_c + i].tok_text; slot.n_past++; } @@ -3127,7 +3167,7 @@ struct 
server_context { } SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); - }*/ + } } } @@ -3359,8 +3399,7 @@ struct server_context { } // do speculative decoding - // TODO @ngxson : remove speculative decoding for multimodal - /*for (auto & slot : slots) { + for (auto & slot : slots) { if (!slot.is_processing() || !slot.can_speculate()) { continue; } @@ -3369,6 +3408,11 @@ struct server_context { continue; } + if (mctx) { + // we should never reach this + GGML_ABORT("not supported by multimodal"); + } + // determine the max draft that fits the current slot state int n_draft_max = slot.params.speculative.n_max; @@ -3395,7 +3439,8 @@ struct server_context { params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.p_min = slot.params.speculative.p_min; - llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); + llama_tokens cached_text_tokens = slot.cache_tokens.get_text_tokens(); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); // keep track of total number of tokens generated in the draft slot.n_draft_total += draft.size(); @@ -3428,8 +3473,10 @@ struct server_context { // update how many tokens out of draft was accepted slot.n_draft_accepted += ids.size() - 1; - slot.cache_tokens.push_back(id); - slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); + slot.cache_tokens.add_text_token(id); + for (auto & t : ids) { + slot.cache_tokens.add_text_token(t); + } llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); @@ -3453,7 +3500,7 @@ struct server_context { } SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); - }*/ + } } SRV_DBG("%s", "run slots completed\n"); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0914a9c425cf8..425d48ba5ab9d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1174,6 +1174,27 @@ struct server_inputs { } return std::to_string(hash); } + + // TODO: maybe implement a (de)seralizer for this struct, so we can get rid of functions below + + // return all text tokens (for legacy code), to be used by save/load slot + llama_tokens get_text_tokens() { + llama_tokens output; + for (auto & chunk : chunks) { + if (chunk.tok_text != LLAMA_TOKEN_NULL) { + output.push_back(chunk.tok_text); + } + } + return output; + } + + // clear and set text tokens (for legacy code), to be used by save/load slot + void set_text_tokens(llama_tokens tokens) { + chunks.clear(); + for (auto & tok : tokens) { + add_text_token(tok); + } + } }; // helper struct to make working with embd batch easier From b9ef895fd779cc9c29d76910fa12f9372f215386 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 25 Apr 2025 10:41:25 +0200 Subject: [PATCH 15/59] fix --no-mmproj-offload --- examples/server/server.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index dde300decef44..185af7bcabd1f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1975,8 +1975,12 @@ struct server_context { std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { - mtmd_context_params mparams; - mparams.n_threads = params_base.cpuparams.n_threads; + mtmd_context_params mparams{ + /* use_gpu */ params_base.mmproj_use_gpu, + /* timings */ true, + /* n_threads */ params_base.cpuparams.n_threads, + /* verbosity */ 
params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, + }; mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); if (mctx == nullptr) { SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); From add9e215026b6d5465757fb369ba469da20db70e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 25 Apr 2025 11:55:03 +0200 Subject: [PATCH 16/59] mtmd_context_params no timings --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 185af7bcabd1f..3eaf01b1409de 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1977,7 +1977,7 @@ struct server_context { if (!mmproj_path.empty()) { mtmd_context_params mparams{ /* use_gpu */ params_base.mmproj_use_gpu, - /* timings */ true, + /* timings */ false, /* n_threads */ params_base.cpuparams.n_threads, /* verbosity */ params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, }; From 58100b393d8c288c6f06fb6385d7a1127f1fc753 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 25 Apr 2025 17:57:11 +0200 Subject: [PATCH 17/59] refactor server_inp to server_tokens --- examples/server/server.cpp | 139 +++++++++++----------- examples/server/utils.hpp | 228 ++++++++++++++----------------------- 2 files changed, 162 insertions(+), 205 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3eaf01b1409de..9428fceb0396c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -198,7 +198,7 @@ struct server_task { // used by SERVER_TASK_TYPE_INFERENCE slot_params params; - server_inputs prompt_tokens; + server_tokens prompt_tokens; int id_selected_slot = -1; // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE @@ -1277,14 +1277,14 @@ struct server_slot { int32_t n_prompt_tokens_processed = 0; // input prompt tokens - server_inputs prompt_tokens; + server_tokens prompt_tokens; size_t last_nl_pos = 0; std::string generated_text; llama_tokens generated_tokens; - server_inputs cache_tokens; + server_tokens cache_tokens; std::vector generated_token_probs; @@ -2020,6 +2020,7 @@ struct server_context { slot.n_ctx = n_ctx_slot; slot.n_predict = params_base.n_predict; slot.mctx = mctx; + slot.cache_tokens.has_mtmd = mctx != nullptr; if (model_dft) { slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); @@ -2096,7 +2097,7 @@ struct server_context { int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length - float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.n_tokens()); + float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); // select the current slot if the criteria match if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) { @@ -2135,7 +2136,7 @@ struct server_context { return ret; } - bool can_be_detokenized(const struct llama_context * ctx, const server_inputs & inp) { + bool can_be_detokenized(const struct llama_context * ctx, const server_tokens & inp) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); const int32_t n_vocab = llama_vocab_n_tokens(vocab); @@ -2786,7 +2787,7 @@ struct server_context { break; } - const size_t token_count = slot->cache_tokens.n_tokens(); + const size_t token_count = 
slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); std::string filename = task.slot_action.filename; @@ -2877,7 +2878,7 @@ struct server_context { } // Erase token cache - const size_t n_erased = slot->cache_tokens.n_tokens(); + const size_t n_erased = slot->cache_tokens.size(); llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); @@ -2957,11 +2958,11 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.chunks.size(); i++) { - slot.cache_tokens.chunks[i - n_discard] = std::move(slot.cache_tokens.chunks[i]); + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; } - slot.cache_tokens.chunks.resize(slot.cache_tokens.chunks.size() - n_discard); + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); } slot.n_past -= n_discard; @@ -3004,7 +3005,7 @@ struct server_context { } SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) slot.cache_tokens.n_tokens(), slot.truncated); + slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -3033,23 +3034,23 @@ struct server_context { slot.t_start_generation = 0; slot.n_past = 0; - slot.n_prompt_tokens = prompt_tokens.n_tokens(); + slot.n_prompt_tokens = prompt_tokens.size(); slot.state = SLOT_STATE_PROCESSING_PROMPT; SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // print prompt tokens (for debugging) - // if (1) { - // // first 16 tokens (avoid flooding logs) - // for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { - // SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - // } - // } else { - // // all - // for (int i = 0; i < (int) prompt_tokens.size(); i++) { - // SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - // } - // } + /*if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) prompt_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + }*/ // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { @@ -3113,7 +3114,7 @@ struct server_context { prompt_tokens.set_text_tokens(new_tokens); slot.truncated = true; - slot.n_prompt_tokens = prompt_tokens.n_tokens(); + slot.n_prompt_tokens = prompt_tokens.size(); SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); @@ -3136,13 +3137,13 @@ struct server_context { SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past); - while (head_c < slot.cache_tokens.chunks.size() && - head_p < prompt_tokens.chunks.size()) { + while (head_c < slot.cache_tokens.size() && + head_p < prompt_tokens.size()) { size_t n_match = 0; - while (head_c + 
n_match < slot.cache_tokens.chunks.size() && - head_p + n_match < prompt_tokens.chunks.size() && - slot.cache_tokens.chunks[head_c + n_match].tok_text == prompt_tokens.chunks[head_p + n_match].tok_text) { + while (head_c + n_match < slot.cache_tokens.size() && + head_p + n_match < prompt_tokens.size() && + slot.cache_tokens[head_c + n_match].txt == prompt_tokens[head_p + n_match].txt) { n_match++; } @@ -3159,7 +3160,7 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { - slot.cache_tokens.chunks[head_p + i].tok_text = slot.cache_tokens.chunks[head_c + i].tok_text; + slot.cache_tokens[head_p + i].txt = slot.cache_tokens[head_c + i].txt; slot.n_past++; } @@ -3207,12 +3208,13 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.keep_until(slot.n_past); - auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); + auto & cur_tok = slot.prompt_tokens[slot.n_past]; // check if we should process the image - if (curr_chunk.tok_image) { + if (cur_tok.img) { // process the image - int32_t res = server_img_process(ctx, mctx, curr_chunk, batch_embd, slot.n_past, slot.id); + int32_t res = server_img_process(ctx, mctx, cur_tok, batch_embd, slot.n_past, slot.id); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(cur_tok.img.get()); if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); slot.release(); @@ -3221,27 +3223,30 @@ struct server_context { } if (slot.params.cache_prompt) { - slot.cache_tokens.add_image_tokens(curr_chunk.tok_image); + // all ALL image tokens at once + for (int32_t i = 0; i < n_tokens; i++) { + slot.cache_tokens.add_token(std::move(slot.prompt_tokens[slot.n_past + i])); + } } - slot.n_past += curr_chunk.n_tokens; - slot.n_prompt_tokens_processed += curr_chunk.n_tokens; + slot.n_past += n_tokens; + slot.n_prompt_tokens_processed += n_tokens; } // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { // get next token to process - auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past); - if (curr_chunk.tok_text == LLAMA_TOKEN_NULL) { + auto & curr_chunk = slot.prompt_tokens[slot.n_past]; + if (curr_chunk.txt == LLAMA_TOKEN_NULL) { break; // end of text chunk } // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd); + common_batch_add(batch, curr_chunk.txt, slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { - slot.cache_tokens.add_text_token(curr_chunk.tok_text); + slot.cache_tokens.add_text_token(curr_chunk.txt); } slot.n_prompt_tokens_processed++; @@ -3261,10 +3266,10 @@ struct server_context { common_sampler_reset(slot.smpl); // Process all prompt tokens through sampler system - for (size_t i = 0; i < slot.cache_tokens.n_tokens(); ++i) { - auto & curr_chunk = slot.prompt_tokens.get_chunk(i); - if (curr_chunk.tok_text != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, curr_chunk.tok_text, false); + for (size_t i = 0; i < slot.cache_tokens.size(); ++i) { + auto & cur_tok = slot.prompt_tokens[i]; + if (cur_tok.txt != LLAMA_TOKEN_NULL) { + common_sampler_accept(slot.smpl, cur_tok.txt, false); } } @@ -3289,7 +3294,6 @@ struct server_context { return; } - // debug SRV_DBG("decoding batch, n_tokens 
= %d\n", batch.n_tokens); if (slot_batched) { @@ -3303,7 +3307,7 @@ struct server_context { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - llama_batch batch_view = llama_batch{ + llama_batch batch_view = { n_tokens, batch.token + i, nullptr, @@ -4072,7 +4076,11 @@ int main(int argc, char ** argv) { // process files std::vector bitmaps; + const bool has_mtmd = ctx_server.mctx != nullptr; { + if (!has_mtmd && !files.empty()) { + throw std::runtime_error("This server does not support multimodal"); + } for (auto & file : files) { mtmd_bitmap bmp; int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp); @@ -4080,30 +4088,31 @@ int main(int argc, char ** argv) { throw std::runtime_error("Failed to load image"); } // calculate bitmap hash (for KV caching) - bmp.id = server_inputs::fnv_hash(bmp.data.data(), bmp.data.size()); + bmp.id = server_tokens::fnv_hash(bmp.data.data(), bmp.data.size()); bitmaps.push_back(std::move(bmp)); } } - std::vector inputs; - if (oaicompat) { - if (!prompt.is_string()) { - throw std::runtime_error("prompt must be a string"); - } else { - // SRV_INF("prompt: %s\n", prompt.get().c_str()); - mtmd_input_text inp_txt = { - prompt.get(), - /* add_special */ true, - /* parse_special */ true, - }; - mtmd_input_chunks chunks; - int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps); - if (tokenized != 0) { - throw std::runtime_error("Failed to tokenize prompt"); - } - server_inputs tmp(chunks); - inputs.push_back(std::move(tmp)); + // process prompt + std::vector inputs; + if (oaicompat && !prompt.is_string()) { + throw std::runtime_error("prompt must be a string"); + + } else if (oaicompat && has_mtmd) { + // multimodal + mtmd_input_text inp_txt = { + prompt.get(), + /* add_special */ true, + /* parse_special */ true, + }; + mtmd_input_chunks chunks; + int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps); + if (tokenized != 0) { + throw std::runtime_error("Failed to tokenize prompt"); } + server_tokens tmp(chunks, true); + inputs.push_back(std::move(tmp)); + } else { // non-multimodal version auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 425d48ba5ab9d..fb4ce9c0fb2b2 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -963,138 +963,140 @@ static std::vector parse_lora_request( // (may need to refactor in near future) // -// each chunk can contain either one SINGLE text token or an image (multiple token embeddings) +// each chunk can contain either one SINGLE text token or pointer to image // this is to simplify the logic of KV cache management -struct server_inp_chunk { - size_t n_tokens = 1; // always 1 in case of text - llama_token tok_text; - mtmd_image_tokens_ptr tok_image; +struct server_token { + llama_token txt; + std::shared_ptr img; std::string str() const { // for debugging - if (tok_image) { - return string_format("( at %p) ", (void *)tok_image.get()); + GGML_ASSERT(img || txt != LLAMA_TOKEN_NULL); + if (img) { + return " "; } else { - return std::to_string(tok_text) + " "; + return std::to_string(txt) + " "; } } }; /** - * server_inputs is a helper to manage the input tokens and image for the server. 
- * - * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens. - * - * for example, server_inputs may contain 5 text tokens followed by 1 image chunk: - * 1 41 2635 325 463 - * - * in this example: - * - n_tokens() returns 5+15 = 20 total tokens - * - get_chunk(1) returns chunk containing token ID 41 - * - get_chunk(5) returns image chunk (15 tokens) - * - get_chunk(7) returns same image chunk - * + * server_tokens is a helper to manage the input tokens and image for the server. * it is made this way to simplify the logic of KV cache management. + * + * each token can be either a text token or a pointer to an image. + * if image usually contains multiple tokens, each token contains a shared_ptr to the same image. */ -struct server_inputs { - std::vector chunks; +struct server_tokens { + bool has_mtmd = false; + std::vector values; - server_inputs() = default; - ~server_inputs() = default; // Important if unique_ptr is used + server_tokens() = default; + ~server_tokens() = default; // Prevent copying - server_inputs(const server_inputs&) = delete; - server_inputs& operator=(const server_inputs&) = delete; + server_tokens(const server_tokens&) = delete; + server_tokens& operator=(const server_tokens&) = delete; // Allow moving (usually implicitly generated if members are movable) - server_inputs(server_inputs&&) = default; - server_inputs& operator=(server_inputs&&) = default; + server_tokens(server_tokens&&) = default; + server_tokens& operator=(server_tokens&&) = default; + + // Allow accessing elements using [] operator + server_token& operator[](size_t index) { return values[index]; } + const server_token& operator[](size_t index) const { return values[index]; } - server_inputs(mtmd_input_chunks & mtmd_chunks) { + server_tokens(mtmd_input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { for (auto & c : mtmd_chunks) { if (c.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - size_t n_tokens = mtmd_image_tokens_get_n_tokens(c.tokens_image.get()); - chunks.push_back({n_tokens, LLAMA_TOKEN_NULL, std::move(c.tokens_image)}); + add_image_tokens(std::move(c.tokens_image)); } else if (c.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { for (auto & tok : c.tokens_text) { - chunks.push_back({1, tok, nullptr}); + add_text_token(tok); } } else { - GGML_ASSERT(false && "Invalid chunk type"); + GGML_ABORT("Invalid chunk type"); } } } - std::string str() { + std::string str() const { // for debugging std::string ret; - for (const auto & chunk : chunks) { - ret += chunk.str(); + for (const auto & t : values) { + ret += t.str(); } return ret; } - size_t n_tokens() const { - size_t res = 0; - for (const auto & chunk : chunks) { - if (chunk.tok_image) { - res += chunk.n_tokens; - } else { - res++; - } - } - return res; + size_t size() const { + return values.size(); } bool empty() const { - return n_tokens() == 0; + return values.empty(); } void clear() { - chunks.clear(); + values.clear(); + } + + void resize(size_t n) { + values.resize(n); + } + + void add_token(server_token && t) { + if (t.img) GGML_ASSERT(has_mtmd); + values.push_back(std::move(t)); } void add_text_token(llama_token tok) { GGML_ASSERT(tok != LLAMA_TOKEN_NULL); - chunks.push_back({1, tok, nullptr}); + values.push_back({tok, nullptr}); } - void add_image_tokens(mtmd_image_tokens_ptr & image) { + void add_image_tokens(mtmd_image_tokens_ptr && image) { + GGML_ASSERT(has_mtmd); GGML_ASSERT(image != nullptr); - 
size_t n_tokens = mtmd_image_tokens_get_n_tokens(image.get()); - chunks.push_back({n_tokens, LLAMA_TOKEN_NULL, std::move(image)}); + std::shared_ptr tok_image(std::move(image)); + size_t n_tokens = mtmd_image_tokens_get_n_tokens(tok_image.get()); + GGML_ASSERT(n_tokens > 0 && "Invalid image token"); // should never happen + for (size_t i = 0; i < n_tokens; ++i) { + values.push_back({LLAMA_TOKEN_NULL, tok_image}); + } } - size_t get_common_prefix(const server_inputs & b) const { - size_t ret = 0; - size_t max_idx = std::min(chunks.size(), b.chunks.size()); + size_t get_common_prefix(const server_tokens & b) const { + size_t max_idx = std::min(values.size(), b.values.size()); for (size_t i = 0; i < max_idx; ++i) { - auto & ai = chunks[i]; - auto & bi = b.chunks[i]; + auto & ai = values[i]; + auto & bi = b.values[i]; - if (ai.tok_text == bi.tok_text && !ai.tok_image && !bi.tok_image) { - ret++; + if (ai.txt == bi.txt && !ai.img && !bi.img) { continue; - } else if (ai.tok_image && bi.tok_image) { - std::string ai_id = mtmd_image_tokens_get_id(ai.tok_image.get()); - std::string bi_id = mtmd_image_tokens_get_id(bi.tok_image.get()); + } else if (ai.img && bi.img) { + GGML_ASSERT(has_mtmd); + std::string ai_id = mtmd_image_tokens_get_id(ai.img.get()); + std::string bi_id = mtmd_image_tokens_get_id(bi.img.get()); if (ai_id == bi_id) { - ret += mtmd_image_tokens_get_n_tokens(ai.tok_image.get()); + size_t n_tokens = mtmd_image_tokens_get_n_tokens(ai.img.get()); + GGML_ASSERT(n_tokens > 0 && "Invalid image token"); // should never happen + i += mtmd_image_tokens_get_n_tokens(ai.img.get()) - 1; continue; } else { - break; + return i; } } else { - break; + return i; } } - return ret; + return max_idx; // all tokens are equal } // make sure all text tokens are within the vocab range bool validate(llama_token max_vocab_id) const { - for (const auto & chunk : chunks) { - if (!chunk.tok_image) { - if (chunk.tok_text < 0 || chunk.tok_text >= max_vocab_id) { + for (const auto & t : values) { + if (!t.img) { + if (t.txt < 0 || t.txt >= max_vocab_id) { return false; } } @@ -1102,65 +1104,11 @@ struct server_inputs { return true; } - // pos is also referred as logical index - server_inp_chunk & get_chunk(size_t pos) { - size_t physical_idx = get_chunk_physical_idx(pos); - return chunks[physical_idx]; - } - - // returns physical_index - size_t get_chunk_physical_idx(size_t logical_idx) const { - size_t current_pos = 0; - for (size_t i = 0; i < chunks.size(); ++i) { - const auto & chunk = chunks[i]; - size_t chunk_end_pos = current_pos + chunk.n_tokens; - if (logical_idx < chunk_end_pos) { - // The target position 'pos' falls within this chunk - return i; - } - current_pos = chunk_end_pos; - } - // If the loop finishes, 'pos' is >= the total number of logical positions - throw std::out_of_range("Position out of range"); - } - - // same idea with std::vector resize() + // same idea with std::vector::resize() void keep_until(size_t pos) { - if (pos == 0) { - chunks.clear(); - return; - } - - size_t current_pos = 0; - for (size_t i = 0; i < chunks.size(); ++i) { - const auto & chunk = chunks[i]; - size_t chunk_size = chunk.tok_image ? mtmd_image_tokens_get_n_tokens(chunk.tok_image.get()) : 1; - size_t chunk_end_pos = current_pos + chunk_size; - if (pos <= current_pos) { - // Truncation point is exactly at or before the start of this chunk. - // Keep only chunks before index 'i'. - chunks.resize(i); - return; - } - if (pos < chunk_end_pos) { - // Truncation point 'pos' falls within this chunk. 
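// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] The removed code around this point
// guarded against a truncation point falling inside an image chunk. In the new
// design this is handled by construction: get_common_prefix() either skips an
// image as a whole (matching by bitmap id) or stops right before it, so the
// simplified keep_until() can just resize. A sketch of how server.cpp uses the
// pair (assuming `cache` and `prompt` are server_tokens instances):
//
//     size_t n_past = cache.get_common_prefix(prompt); // never lands inside an image
//     cache.keep_until(n_past);                        // drop the non-common tail
// ---------------------------------------------------------------------------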
- if (chunk.tok_image) { - // It's an image chunk, keep the whole chunk. - // Keep chunks up to and including index 'i'. - chunks.resize(i + 1); - } else { - // It's a text chunk. Since pos < chunk_end_pos and chunk_size is 1, - // this means pos == current_pos. - // Keep only chunks before index 'i'. - chunks.resize(i); - } - return; - } - // pos >= chunk_end_pos, so keep this chunk entirely and continue. - current_pos = chunk_end_pos; - } - // If the loop completes, it means 'pos' is >= the total logical size. - // No truncation needed, the vector remains unchanged. + // TODO : maybe throw error we remove part of the image (only allow removing the whole image) + // this cannot happen currently because get_common_prefix() only never returns such pos + values.resize(pos); } // Computes FNV-1a hash of the data @@ -1180,9 +1128,9 @@ struct server_inputs { // return all text tokens (for legacy code), to be used by save/load slot llama_tokens get_text_tokens() { llama_tokens output; - for (auto & chunk : chunks) { - if (chunk.tok_text != LLAMA_TOKEN_NULL) { - output.push_back(chunk.tok_text); + for (auto & t : values) { + if (t.txt != LLAMA_TOKEN_NULL) { + output.push_back(t.txt); } } return output; @@ -1190,7 +1138,7 @@ struct server_inputs { // clear and set text tokens (for legacy code), to be used by save/load slot void set_text_tokens(llama_tokens tokens) { - chunks.clear(); + values.clear(); for (auto & tok : tokens) { add_text_token(tok); } @@ -1267,22 +1215,22 @@ struct server_batch_embd { } }; -// TODO @ngxson : quite hacky for now, but just to see if it works static int32_t server_img_process( llama_context * ctx, mtmd_context * mctx, - server_inp_chunk & chunk, + server_token & chunk, server_batch_embd & batch, llama_pos n_past, int slot_id) { - GGML_ASSERT(chunk.tok_image); + GGML_ASSERT(chunk.img); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.img.get()); int32_t ret; // encode the image { int64_t t0 = ggml_time_ms(); - SRV_INF("encoding image (%d tokens)...\n", (int)chunk.n_tokens); - ret = mtmd_encode(mctx, chunk.tok_image.get()); + SRV_INF("encoding image (%d tokens)...\n", (int)n_tokens); + ret = mtmd_encode(mctx, chunk.img.get()); if (ret != 0) { SRV_ERR("failed to encode image, status = %d\n", ret); return ret; @@ -1294,7 +1242,6 @@ static int32_t server_img_process( // decode the embeddings int64_t t1 = ggml_time_ms(); int32_t n_embd = llama_model_n_embd(llama_get_model(ctx)); - int32_t n_tokens = chunk.n_tokens; int32_t n_batch = batch.pos.size(); int32_t i_batch = 0; int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; @@ -1334,8 +1281,9 @@ static int32_t server_img_process( } // hacky, support text-only for now -static server_inputs convert_legacy_to_mtmd(llama_tokens & tokenized) { - server_inputs res; +static server_tokens convert_legacy_to_mtmd(llama_tokens & tokenized) { + server_tokens res; + res.has_mtmd = false; for (auto & tok : tokenized) { res.add_text_token(tok); } From e82fea8f0e95f38146a5d7be1935602162a2f5f4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 25 Apr 2025 22:41:51 +0200 Subject: [PATCH 18/59] fix the failing test case --- examples/server/utils.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index fb4ce9c0fb2b2..f048be0265e7e 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -633,9 +633,12 @@ static json oaicompat_completion_params_parse( } // get input files - json messages = json_value(body, "messages", 
json::array()); + if (!body.contains("messages")) { + throw std::runtime_error("'messages' is required"); + } + json messages = body.at("messages"); if (!messages.is_array()) { - throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump()); + throw std::runtime_error("Expected 'messages' to be an array"); } for (auto & msg : messages) { json & content = msg.at("content"); From 4a4f35c8777562256a8801c1835dbb7db4f401d6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 10:54:07 +0200 Subject: [PATCH 19/59] init --- examples/llava/mtmd.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 78be192dd6eb6..30f6c7e0ccf47 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -152,10 +152,12 @@ struct mtmd_context_deleter { }; using mtmd_context_ptr = std::unique_ptr; -#else +#endif + +// +// C API +// -static_assert(false && "C header is not yet supported by this library"); -#endif #endif From f6b6517c00637083211f2dc1bcb9d7cf0620ce81 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 11:47:55 +0200 Subject: [PATCH 20/59] wip --- examples/llava/mtmd-cli.cpp | 14 ++--- examples/llava/mtmd.cpp | 14 ++++- examples/llava/mtmd.h | 116 ++++++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 47 deletions(-) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 250e8c9a9e871..bef2c7be4fba8 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -112,12 +112,12 @@ struct mtmd_cli_context { void init_vision_context(common_params & params) { const char * clip_path = params.mmproj.path.c_str(); - ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ - /* use_gpu */ params.mmproj_use_gpu, - /* timings */ true, - /* n_threads */ params.cpuparams.n_threads, - /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, - })); + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params.mmproj_use_gpu; + mparams.print_timings = true; + mparams.n_threads = params.cpuparams.n_threads; + mparams.verbosity = params.verbosity > 0 ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; + ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); exit(1); @@ -228,7 +228,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect text.text = formatted_chat.prompt; text.add_special = add_bos; text.parse_special = true; - mtmd_input_chunks chunks; + std::vector chunks; if (g_is_interrupted) return 0; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index f95f0503569f9..c6e310c70d5e6 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -21,6 +21,16 @@ enum mtmd_slice_tmpl { // TODO @ngxson : add support for idefics (SmolVLM) }; +mtmd_context_params mtmd_context_params_default() { + mtmd_context_params params; + params.use_gpu = true; + params.print_timings = true; + params.n_threads = 4; + params.verbosity = GGML_LOG_LEVEL_INFO; + params.image_marker = MTMD_DEFAULT_IMAGE_MARKER; + return params; +} + struct mtmd_context { struct clip_ctx * ctx_clip; const struct llama_model * text_model; @@ -411,7 +421,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } -size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { +size_t mtmd_helper_get_n_tokens(std::vector & chunks) { size_t n_tokens = 0; for (auto & chunk : chunks) { if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { @@ -462,7 +472,7 @@ struct decode_embd_batch { int32_t mtmd_helper_eval(mtmd_context * ctx, llama_context * lctx, - mtmd_input_chunks & chunks, + std::vector & chunks, llama_pos pos0, llama_seq_id seq_id, int32_t n_batch) { diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 30f6c7e0ccf47..d1a6ecd58481f 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -5,9 +5,15 @@ #include "llama.h" #include "clip.h" +#include +#include +#include + +#ifdef __cplusplus #include #include #include +#endif #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -23,7 +29,7 @@ # define MTMD_API #endif -#ifdef __cplusplus +#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>" enum mtmd_input_chunk_type { MTMD_INPUT_CHUNK_TYPE_TEXT, @@ -33,6 +39,75 @@ enum mtmd_input_chunk_type { struct mtmd_context; struct mtmd_image_tokens; +// +// C API +// this is made to closely resemble the C++ API +// + +// forward declaration for C API (the actual struct is defined in C++) +struct mtmd_bitmap; +struct mtmd_input_chunk; + +struct mtmd_context_params { + bool use_gpu; + bool print_timings; + int n_threads; + enum ggml_log_level verbosity; + const char * image_marker; +}; + +MTMD_API mtmd_context_params mtmd_context_params_default(); + +// initialize the mtmd context +// return nullptr on failure +MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, + const llama_model * text_model, + const mtmd_context_params ctx_params); + +MTMD_API void mtmd_free(mtmd_context * ctx); + +// get output embeddings from the last encode pass +MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); + +// whether we need to set non-causal mask before llama_decode +MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); + +// mtmd_bitmap +// +// length of data must be nx * ny * 3 +// the data is in RGBRGBRGB... 
format +// the id is optional (can be nullptr), but useful for KV cache tracking +MTMD_API mtmd_bitmap * mtmd_bitmap_init( + uint32_t nx, + uint32_t ny, + const unsigned char * data, + const char * id, size_t id_len); +MTMD_API uint32_t mtmd_bitmap_get_nx (mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_ny (mtmd_bitmap * bitmap); +MTMD_API const unsigned char * mtmd_bitmap_get_data(mtmd_bitmap * bitmap); +MTMD_API const char * mtmd_bitmap_get_id (mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); + +// mtmd_input_chunk +// +// the instance can be constructed via mtmd_tokenize() +MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk); +MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output); +MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free (mtmd_input_chunk * chunk); + + +// +// C++ API +// + +#ifdef __cplusplus + +struct mtmd_context_deleter { + void operator()(mtmd_context * val) { mtmd_free(val); } +}; +using mtmd_context_ptr = std::unique_ptr; + // represents raw image data, layout is RGBRGBRGB... // length of data must be nx * ny * 3 struct mtmd_bitmap { @@ -53,30 +128,12 @@ struct mtmd_input_chunk { mtmd_image_tokens_ptr tokens_image; }; -using mtmd_input_chunks = std::vector; - -struct mtmd_context_params { - bool use_gpu = true; - bool print_timings = true; - int n_threads = 4; - enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO; - const char * image_marker = "<__image__>"; -}; - struct mtmd_input_text { std::string text; bool add_special; bool parse_special; }; -// initialize the mtmd context -// return nullptr on failure -MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, - const llama_model * text_model, - const mtmd_context_params ctx_params); - -MTMD_API void mtmd_free(mtmd_context * ctx); - // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it // the marker will be replaced with the image tokens @@ -108,12 +165,6 @@ MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); -// get output embeddings from the last encode pass -MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); - -// whether we need to set non-causal mask before llama_decode -MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); - // @@ -121,7 +172,7 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); // // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past -MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks); +MTMD_API size_t mtmd_helper_get_n_tokens(std::vector & chunks); // helper function that automatically: // 1. 
run llama_decode() on text chunks @@ -130,7 +181,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks); // otherwise, returns 0 on success MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx, llama_context * lctx, - mtmd_input_chunks & chunks, + std::vector & chunks, llama_pos pos0, llama_seq_id seq_id, int32_t n_batch); @@ -146,18 +197,7 @@ MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitm // this function is thread-safe MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output); -// convenient unique_ptr wrappers -struct mtmd_context_deleter { - void operator()(mtmd_context * val) { mtmd_free(val); } -}; -using mtmd_context_ptr = std::unique_ptr; - #endif -// -// C API -// - - #endif From 82f4246ed5afa1eface3864c3eb709299cfa9d76 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 15:38:57 +0200 Subject: [PATCH 21/59] working version --- examples/llava/mtmd-cli.cpp | 32 ++- examples/llava/mtmd.cpp | 468 ++++++++++++++++++++++++------------ examples/llava/mtmd.h | 248 ++++++++++++------- 3 files changed, 505 insertions(+), 243 deletions(-) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 565606ad18809..cf754e63f36ec 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -63,7 +63,7 @@ static void sigint_handler(int signo) { #endif struct mtmd_cli_context { - mtmd_context_ptr ctx_vision; + mtmd::context_ptr ctx_vision; common_init_result llama_init; llama_model * model; @@ -173,7 +173,7 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int } static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector & images_fname, bool add_bos = false) { - std::vector bitmaps; + std::vector bitmaps; common_chat_templates_inputs tmpl_inputs; tmpl_inputs.messages = {msg}; @@ -183,8 +183,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str()); for (auto & fname : images_fname) { - mtmd_bitmap bitmap; - if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) { + mtmd_bitmap * bitmap = mtmd_helper_bitmap_init_from_file(fname.c_str()); + if (!bitmap) { LOG_ERR("Unable to load image %s\n", fname.c_str()); return 2; // image not found } @@ -192,25 +192,37 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect } mtmd_input_text text; - text.text = formatted_chat.prompt; + text.text = formatted_chat.prompt.c_str(); text.add_special = add_bos; text.parse_special = true; - std::vector chunks; if (g_is_interrupted) return 0; - int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps); + mtmd::input_chunks chunks; + int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), + chunks.ptr.get(), // output + &text, // text + bitmaps.data(), // bitmaps + bitmaps.size()); if (res != 0) { LOG_ERR("Unable to tokenize prompt, res = %d\n", res); return 1; } - if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) { + llama_pos new_n_past; + if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(), + ctx.lctx, // lctx + chunks.ptr.get(), // chunks + ctx.n_past, // n_past + 0, // seq_id + ctx.n_batch, // n_batch + true, // logits_last + &new_n_past)) { LOG_ERR("Unable to eval prompt\n"); return 1; } - ctx.n_past += mtmd_helper_get_n_pos(chunks); + ctx.n_past = new_n_past; return 0; } @@ -241,7 +253,7 @@ int main(int argc, char ** argv) { struct 
common_sampler * smpl = common_sampler_init(ctx.model, params.sampling); int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict; - // ctrl+C handling + // Ctrl+C handling { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index d2c99aa2aa167..c543b2b2415d3 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -12,6 +12,30 @@ #include #include +// represents raw image data, layout is RGBRGBRGB... +// length of data must be nx * ny * 3 +struct mtmd_bitmap { + uint32_t nx; + uint32_t ny; + std::vector data; + std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking +}; + +struct mtmd_image_tokens_deleter { + void operator()(mtmd_image_tokens * val); // forward declaration +}; +using mtmd_image_tokens_ptr = std::unique_ptr; + +struct mtmd_input_chunk { + mtmd_input_chunk_type type; + std::vector tokens_text; + mtmd_image_tokens_ptr tokens_image; +}; + +struct mtmd_input_chunks { + std::vector entries; +}; + // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings // models not having it (llava-1.6) will process embeddings without any special tokens in-between enum mtmd_slice_tmpl { @@ -182,12 +206,13 @@ static std::vector mtmd_tokenize_text_internal( } int32_t mtmd_tokenize(mtmd_context * ctx, - std::vector & output, - const mtmd_input_text & text, - const std::vector & bitmaps) { + mtmd_input_chunks * output, + const mtmd_input_text * text, + mtmd_bitmap ** bitmaps, + size_t n_bitmaps) { auto vocab = llama_model_get_vocab(ctx->text_model); - std::string prompt_modified(text.text); + std::string prompt_modified(text->text); std::string marker_modified(ctx->image_marker); projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); @@ -225,8 +250,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx, // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix std::vector parts = string_split_str(prompt_modified, ctx->image_marker); - output.clear(); - output.reserve(parts.size()); + output->entries.clear(); + output->entries.reserve(parts.size()); size_t i_img = 0; @@ -237,7 +262,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, std::move(tokens), {}, }; - output.emplace_back(std::move(chunk)); + output->entries.emplace_back(std::move(chunk)); }; // utility for splitting batch of multiple images into chunks of batch having single images @@ -265,7 +290,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, for (const auto & part : parts) { // printf("tokenizing part: %s\n", part.c_str()); bool add_bos = &parts.front() == ∂ - auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special); + auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special); if (tokens.empty()) { continue; } @@ -274,22 +299,22 @@ int32_t mtmd_tokenize(mtmd_context * ctx, std::move(tokens), {}, }; - output.emplace_back(std::move(chunk)); + output->entries.emplace_back(std::move(chunk)); if (&parts.back() != &part) { // add image token to middle of 2 parts - if (i_img >= bitmaps.size()) { + if (i_img >= n_bitmaps) { LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size()); return 1; } // convert mtmd_bitmap to clip_image_u8 clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->nx = bitmaps[i_img].nx; - img_u8->ny = bitmaps[i_img].ny; - 
img_u8->buf.resize(bitmaps[i_img].data.size()); - std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3); + img_u8->nx = bitmaps[i_img]->nx; + img_u8->ny = bitmaps[i_img]->ny; + img_u8->buf.resize(bitmaps[i_img]->data.size()); + std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3); clip_image_size img_u8_size{img_u8->nx, img_u8->ny}; // preprocess image @@ -302,12 +327,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx, if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) { // split batch into chunks of single images - auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id); + auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id); GGML_ASSERT(chunks.size() > 0); // add overview image add_text_chunk({ctx->tok_ov_img_start}); - output.emplace_back(std::move(chunks.front())); + output->entries.emplace_back(std::move(chunks.front())); chunks.erase(chunks.begin()); add_text_chunk({ctx->tok_ov_img_end}); @@ -325,7 +350,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) { add_text_chunk({ctx->tok_sli_img_start}); } - output.emplace_back(std::move(chunks[y * n_col + x])); + output->entries.emplace_back(std::move(chunks[y * n_col + x])); if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) { add_text_chunk({ctx->tok_sli_img_end}); } @@ -357,7 +382,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, image_tokens->ny = 1; } image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmaps[i_img].id; // optional + image_tokens->id = bitmaps[i_img]->id; // optional LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); @@ -368,7 +393,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, {}, std::move(image_tokens), }; - output.emplace_back(std::move(chunk)); + output->entries.emplace_back(std::move(chunk)); } i_img++; // move to next image @@ -378,35 +403,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 0; } -void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { +static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { if (image_tokens) { delete image_tokens; } } -size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { - return image_tokens->n_tokens(); -} - -size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { - return image_tokens->nx; -} - -size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { - return image_tokens->ny; -} - -std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { - return image_tokens->id; -} - -llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { - if (image_tokens->use_mrope_pos) { - return 1; // for M-RoPE, the whole image is 1 in temporal dimension - } - return image_tokens->n_tokens(); -} - int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -446,13 +448,18 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } -size_t mtmd_helper_get_n_tokens(std::vector & chunks) { +size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) { size_t n_tokens = 0; - for (auto & chunk : chunks) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - n_tokens += chunk.tokens_text.size(); - } else if (chunk.type == 
MTMD_INPUT_CHUNK_TYPE_IMAGE) { - n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); + for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { + auto chunk = mtmd_input_chunks_get(chunks, i); + auto chunk_type = mtmd_input_chunk_get_type(chunk); + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens_text; + mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text); + n_tokens += n_tokens_text; + } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk); + n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image); } else { GGML_ASSERT(false && "chunk type not supported"); } @@ -460,13 +467,18 @@ size_t mtmd_helper_get_n_tokens(std::vector & chunks) { return n_tokens; } -llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) { +llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks * chunks) { llama_pos n_pos = 0; - for (auto & chunk : chunks) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - n_pos += chunk.tokens_text.size(); - } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { + auto chunk = mtmd_input_chunks_get(chunks, i); + auto chunk_type = mtmd_input_chunk_get_type(chunk); + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens_text; + mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text); + n_pos += n_tokens_text; + } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk); + n_pos += mtmd_image_tokens_get_n_pos(tokens_image); } else { GGML_ASSERT(false && "chunk type not supported"); } @@ -562,143 +574,172 @@ struct decode_embd_batch { } }; -int32_t mtmd_helper_eval(mtmd_context * ctx, - llama_context * lctx, - std::vector & chunks, - llama_pos pos0, +int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + mtmd_input_chunk * chunk, + llama_pos n_past, llama_seq_id seq_id, - int32_t n_batch) { + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past) { int32_t ret; - llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); + auto chunk_type = mtmd_input_chunk_get_type(chunk); int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 
4 : 1; - for (auto & chunk : chunks) { - bool is_last = &chunk == &chunks.back(); - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - text_batch.n_tokens = chunk.tokens_text.size(); - size_t i = 0; - while (i < chunk.tokens_text.size()) { // split into batches - for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) { - text_batch.token [i] = chunk.tokens_text[i]; - text_batch.pos [i] = n_past++; - text_batch.n_seq_id[i] = 1; - text_batch.seq_id [i][0] = seq_id; - text_batch.logits [i] = false; - } - if (is_last) { - // always get logits for last input chunk - text_batch.logits[text_batch.n_tokens - 1] = true; - } - ret = llama_decode(lctx, text_batch); - if (ret != 0) { - LOG_ERR("failed to decode text\n"); - llama_batch_free(text_batch); - return ret; - } + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + text_batch.n_tokens = n_tokens; + LOG_DBG("decoding text chunk, n_tokens = %zu\n", n_tokens); + size_t i = 0; + while (i < n_tokens) { // split into batches + for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) { + text_batch.token [i] = tokens[i]; + text_batch.pos [i] = n_past++; + text_batch.n_seq_id[i] = 1; + text_batch.seq_id [i][0] = seq_id; + text_batch.logits [i] = false; } - - } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - GGML_ASSERT(!is_last && "logits for last image chunk is not yet support"); - GGML_ASSERT(chunk.tokens_image != nullptr); - int64_t t0 = ggml_time_ms(); - if (ctx->print_timings) { - LOG_INF("encoding image or slice...\n"); + bool is_last_batch = (i == n_tokens); + if (logits_last && is_last_batch) { + text_batch.logits[text_batch.n_tokens - 1] = true; } - ret = mtmd_encode(ctx, chunk.tokens_image.get()); + ret = llama_decode(lctx, text_batch); if (ret != 0) { - LOG_ERR("failed to encode image\n"); + LOG_ERR("failed to decode text\n"); llama_batch_free(text_batch); return ret; } - if (ctx->print_timings) { - LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); - } + *new_n_past += text_batch.n_tokens; + } - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; - float * embd = mtmd_get_output_embd(ctx); - decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk); + int64_t t0 = ggml_time_ms(); + if (ctx->print_timings) { + LOG_INF("encoding image or slice...\n"); + } + ret = mtmd_encode(ctx, image_tokens); + if (ret != 0) { + LOG_ERR("failed to encode image\n"); + llama_batch_free(text_batch); + return ret; + } + if (ctx->print_timings) { + LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + } - const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); - const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + float * embd = mtmd_get_output_embd(ctx); + decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); - if (mtmd_decode_use_mrope(ctx)) { - batch_embd.set_position_mrope(n_past, nx, ny, seq_id); - } else { - batch_embd.set_position_normal(n_past, seq_id); + const int nx = mtmd_image_tokens_get_nx(image_tokens); + const int ny = 
mtmd_image_tokens_get_ny(image_tokens); + + if (mtmd_decode_use_mrope(ctx)) { + batch_embd.set_position_mrope(n_past, nx, ny, seq_id); + } else { + batch_embd.set_position_normal(n_past, seq_id); + } + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, false); + // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image + } + + while (i_batch < n_img_batches) { // split into batches + int pos_offset = i_batch*n_batch; + int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); + + LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + + int64_t t1 = ggml_time_ms(); + ret = llama_decode(lctx, batch_embd_view); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_set_causal_attn(lctx, true); // restore causal attn + llama_batch_free(text_batch); + return ret; } - if (mtmd_decode_use_non_causal(ctx)) { - llama_set_causal_attn(lctx, false); - // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image + if (ctx->print_timings) { + LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); } - while (i_batch < n_img_batches) { // split into batches - int pos_offset = i_batch*n_batch; - int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); - llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); + i_batch++; + } - LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + // for mrope, one image is one single **temporal** position + n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens; + *new_n_past = n_past; - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_embd_view); - if (ret != 0) { - LOG_ERR("failed to decode image\n"); - llama_set_causal_attn(lctx, true); // restore causal attn - llama_batch_free(text_batch); - return ret; - } + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, true); + } - if (ctx->print_timings) { - LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); - } + } else { + GGML_ABORT("chunk type not supported"); + } - i_batch++; - } + return 0; +} - // for mrope, one image is one single **temporal** position - n_past += mtmd_decode_use_mrope(ctx) ? 
1 : n_tokens; +int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past) { + size_t n_chunks = mtmd_input_chunks_size(chunks); + if (n_chunks == 0) { + LOG_WRN("no chunks to eval\n"); + return 0; + } - if (mtmd_decode_use_non_causal(ctx)) { - llama_set_causal_attn(lctx, true); - } + for (size_t i = 0; i < n_chunks; i++) { + bool is_last_chunk = (i == n_chunks - 1); + auto chunk = mtmd_input_chunks_get(chunks, i); - } else { - GGML_ASSERT(false && "chunk type not supported"); + int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, is_last_chunk && logits_last, &n_past); + if (res != 0) { + LOG_ERR("failed to eval chunk %zu\n", i); + return res; } + *new_n_past = n_past; } - llama_batch_free(text_batch); return 0; } -int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) { clip_image_u8_ptr img_u8(clip_image_u8_init()); bool ok = clip_image_load_from_bytes(buf, len, img_u8.get()); if (!ok) { LOG_ERR("Unable to load image from buffer\n"); - return 1; + return nullptr; } - unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny); - output.data.resize(output.nx * output.ny * 3); - std::memcpy(output.data.data(), data, output.nx * output.ny * 3); - return 0; + uint32_t nx, ny; + unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny); + return mtmd_bitmap_init(nx, ny, data); } -int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) { clip_image_u8_ptr img_u8(clip_image_u8_init()); bool ok = clip_image_load_from_file(fname, img_u8.get()); if (!ok) { LOG_ERR("Unable to load image %s\n", fname); - return 1; + return nullptr; } - unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny); - output.data.resize(output.nx * output.ny * 3); - std::memcpy(output.data.data(), data, output.nx * output.ny * 3); - return 0; + uint32_t nx, ny; + unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny); + return mtmd_bitmap_init(nx, ny, data); } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { @@ -716,3 +757,134 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) { void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } + + +// +// public API functions +// + +// mtmd_bitmap + +mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, + uint32_t ny, + const unsigned char * data) { + mtmd_bitmap * bitmap = new mtmd_bitmap; + bitmap->nx = nx; + bitmap->ny = ny; + size_t data_size = (size_t)nx * ny * 3; + bitmap->data.resize(data_size); + std::memcpy(bitmap->data.data(), data, data_size); + return bitmap; +} + +uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) { + return bitmap->nx; +} + +uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) { + return bitmap->ny; +} + +const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { + return bitmap->data.data(); +} + +const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) { + return bitmap->id.c_str(); +} + +void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) { + if (id) { + bitmap->id = std::string(id); + } else { + bitmap->id.clear(); + } +} + +void mtmd_bitmap_free(mtmd_bitmap * bitmap) { + if 
(bitmap) { + delete bitmap; + } +} + +// mtmd_input_chunks + +mtmd_input_chunks * mtmd_input_chunks_init() { + return new mtmd_input_chunks; +} + +size_t mtmd_input_chunks_size(mtmd_input_chunks * chunks) { + return chunks->entries.size(); +} + +mtmd_input_chunk * mtmd_input_chunks_get(mtmd_input_chunks * chunks, size_t idx) { + if (idx >= chunks->entries.size()) { + return nullptr; + } + return &chunks->entries[idx]; +} + +void mtmd_input_chunks_free(mtmd_input_chunks * chunks) { + if (chunks) { + delete chunks; + } +} + +// mtmd_input_chunk + +enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) { + return chunk->type; +} + +const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) { + if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + *n_tokens_output = chunk->tokens_text.size(); + return chunk->tokens_text.data(); + } + *n_tokens_output = 0; + return nullptr; +} + +const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) { + if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + return chunk->tokens_image.get(); + } + return nullptr; +} + +mtmd_input_chunk * mtmd_input_chunk_release(mtmd_input_chunk * chunk) { + mtmd_input_chunk * copy = new mtmd_input_chunk; + *copy = std::move(*chunk); + return copy; +} + +void mtmd_input_chunk_free(mtmd_input_chunk * chunk) { + if (chunk) { + delete chunk; + } +} + +// mtmd_image_tokens + +size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { + return image_tokens->n_tokens(); +} + +size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) { + return image_tokens->nx; +} + +size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { + return image_tokens->ny; +} + +const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { + return image_tokens->id.c_str(); +} + +llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { + if (image_tokens->use_mrope_pos) { + return 1; // for M-RoPE, the whole image is 1 in temporal dimension + } + return image_tokens->n_tokens(); +} diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 0c44fc6897a84..c9eca3929d386 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -31,22 +31,39 @@ #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>" +#ifdef __cplusplus +extern "C" { +#endif + enum mtmd_input_chunk_type { MTMD_INPUT_CHUNK_TYPE_TEXT, MTMD_INPUT_CHUNK_TYPE_IMAGE, }; +// opaque types struct mtmd_context; +struct mtmd_bitmap; struct mtmd_image_tokens; +struct mtmd_input_chunk; +struct mtmd_input_chunks; + +struct mtmd_input_text { + const char * text; + bool add_special; + bool parse_special; +}; // // C API // this is made to closely resemble the C++ API // -// forward declaration for C API (the actual struct is defined in C++) -struct mtmd_bitmap; -struct mtmd_input_chunk; +typedef struct mtmd_context mtmd_context; +typedef struct mtmd_bitmap mtmd_bitmap; +typedef struct mtmd_image_tokens mtmd_image_tokens; +typedef struct mtmd_input_chunk mtmd_input_chunk; +typedef struct mtmd_input_chunks mtmd_input_chunks; +typedef struct mtmd_input_text mtmd_input_text; struct mtmd_context_params { bool use_gpu; @@ -61,81 +78,70 @@ MTMD_API mtmd_context_params mtmd_context_params_default(); // initialize the mtmd context // return nullptr on failure MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, - const llama_model * text_model, - const mtmd_context_params ctx_params); + const 
struct llama_model * text_model, + const struct mtmd_context_params ctx_params); MTMD_API void mtmd_free(mtmd_context * ctx); -// get output embeddings from the last encode pass -MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); - // whether we need to set non-causal mask before llama_decode MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); // whether the current model use M-RoPE for llama_decode MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); + // mtmd_bitmap // // length of data must be nx * ny * 3 // the data is in RGBRGBRGB... format -// the id is optional (can be nullptr), but useful for KV cache tracking -MTMD_API mtmd_bitmap * mtmd_bitmap_init( - uint32_t nx, - uint32_t ny, - const unsigned char * data, - const char * id, size_t id_len); -MTMD_API uint32_t mtmd_bitmap_get_nx (mtmd_bitmap * bitmap); -MTMD_API uint32_t mtmd_bitmap_get_ny (mtmd_bitmap * bitmap); -MTMD_API const unsigned char * mtmd_bitmap_get_data(mtmd_bitmap * bitmap); -MTMD_API const char * mtmd_bitmap_get_id (mtmd_bitmap * bitmap); +MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, + uint32_t ny, + const unsigned char * data); +MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); +MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap); MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +// bitmap ID is optional, but useful for KV cache tracking +// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() +MTMD_API const char * mtmd_bitmap_get_id (const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_set_id (mtmd_bitmap * bitmap, const char * id); + + +// mtmd_input_chunks +// +// this is simply a list of mtmd_input_chunk +// the elements can only be populated via mtmd_tokenize() +MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(); +MTMD_API size_t mtmd_input_chunks_size(mtmd_input_chunks * chunks); +MTMD_API mtmd_input_chunk * mtmd_input_chunks_get (mtmd_input_chunks * chunks, size_t idx); +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); // mtmd_input_chunk // -// the instance can be constructed via mtmd_tokenize() +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunks MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk); MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output); MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); -MTMD_API void mtmd_input_chunk_free (mtmd_input_chunk * chunk); +// in case you want to use custom logic to handle the chunk (i.e. KV cache management) +// you can move the chunk ownership to your own code +// this will release the chunk from the list of input chunks +// remember to free the chunk when you are done with it +MTMD_API mtmd_input_chunk * mtmd_input_chunk_release(mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free (mtmd_input_chunk * chunk); -// -// C++ API -// - -#ifdef __cplusplus - -struct mtmd_context_deleter { - void operator()(mtmd_context * val) { mtmd_free(val); } -}; -using mtmd_context_ptr = std::unique_ptr; -// represents raw image data, layout is RGBRGBRGB... 
-// length of data must be nx * ny * 3 -struct mtmd_bitmap { - uint32_t nx; - uint32_t ny; - std::vector data; - std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking -}; - -struct mtmd_image_tokens_deleter { - void operator()(mtmd_image_tokens * val); // forward declaration -}; -using mtmd_image_tokens_ptr = std::unique_ptr; - -struct mtmd_input_chunk { - mtmd_input_chunk_type type; - std::vector tokens_text; - mtmd_image_tokens_ptr tokens_image; -}; - -struct mtmd_input_text { - std::string text; - bool add_special; - bool parse_special; -}; +// mtmd_image_tokens +// +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunk +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); +// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) +MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it @@ -153,58 +159,130 @@ struct mtmd_input_text { // 1 on number of images not matching the number of markers // 2 on image preprocessing error MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, - std::vector & output, - const mtmd_input_text & text, - const std::vector & bitmaps); - -// access mtmd_image_tokens -MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); -MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); -MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) -MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); + mtmd_input_chunks * output, + const mtmd_input_text * text, + mtmd_bitmap ** bitmaps, + size_t n_bitmaps); // returns 0 on success MTMD_API int32_t mtmd_encode(mtmd_context * ctx, - const mtmd_image_tokens * image_tokens); + const mtmd_image_tokens * image_tokens); +// get output embeddings from the last encode pass +MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); +///////////////////////////////////////// // // helper functions (can be implemented based on other functions) // +// please note that these helpers are not guaranteed to be stable, there can be breaking changes in the future +// + +// helper function to construct a mtmd_bitmap from a file +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname); + +// helper function to construct a mtmd_bitmap from a buffer containing a file +// the file content must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.) 
+// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache -MTMD_API size_t mtmd_helper_get_n_tokens(std::vector & chunks); +MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks); // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past -MTMD_API llama_pos mtmd_helper_get_n_pos(std::vector & chunks); +MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks * chunks); // helper function that automatically: // 1. run llama_decode() on text chunks // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error // otherwise, returns 0 on success -MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx, - llama_context * lctx, - std::vector & chunks, - llama_pos pos0, - llama_seq_id seq_id, - int32_t n_batch); +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// helper function to evaluate a single chunk +// 1. run llama_decode() on text chunks +// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +///////////////////////////////////////// -// helper function to construct a mtmd_bitmap from a file -// returns 0 on success -// this function is thread-safe -MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output); +#ifdef __cplusplus +} // extern "C" +#endif -// helper function to construct a mtmd_bitmap from a buffer -// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.) 
-// returns 0 on success -// this function is thread-safe -MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output); +// +// C++ wrapper +// -#endif +#ifdef __cplusplus + +namespace mtmd { + +struct mtmd_context_deleter { + void operator()(mtmd_context * val) { mtmd_free(val); } +}; +using context_ptr = std::unique_ptr; +struct mtmd_bitmap_deleter { + void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); } +}; +using bitmap_ptr = std::unique_ptr; + +struct mtmd_input_chunks_deleter { + void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } +}; +using input_chunks_ptr = std::unique_ptr; + +struct bitmap { + bitmap_ptr ptr; + bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {} + bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) { + ptr.reset(mtmd_bitmap_init(nx, ny, data)); + } + ~bitmap() = default; + uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } + const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } + std::string id() { return mtmd_bitmap_get_id(ptr.get()); } + void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } +}; + +struct input_chunks { + input_chunks_ptr ptr; + input_chunks() { + ptr.reset(mtmd_input_chunks_init()); + } + ~input_chunks() = default; + size_t size() { return mtmd_input_chunks_size(ptr.get()); } + mtmd_input_chunk * operator[](size_t idx) { + return mtmd_input_chunks_get(ptr.get(), idx); + } +}; + +} // namespace mtmd + +#endif #endif From f8c27b9e04db0375048c95e5225cdeabac6e3b4b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 16:26:11 +0200 Subject: [PATCH 22/59] add mtmd::bitmaps --- examples/llava/mtmd-cli.cpp | 13 +++++++------ examples/llava/mtmd.h | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index cf754e63f36ec..830b9ef60cab0 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -173,7 +173,7 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int } static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector & images_fname, bool add_bos = false) { - std::vector bitmaps; + mtmd::bitmaps bitmaps; common_chat_templates_inputs tmpl_inputs; tmpl_inputs.messages = {msg}; @@ -183,12 +183,12 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str()); for (auto & fname : images_fname) { - mtmd_bitmap * bitmap = mtmd_helper_bitmap_init_from_file(fname.c_str()); - if (!bitmap) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str())); + if (!bmp.ptr) { LOG_ERR("Unable to load image %s\n", fname.c_str()); return 2; // image not found } - bitmaps.push_back(std::move(bitmap)); + bitmaps.entries.push_back(std::move(bmp)); } mtmd_input_text text; @@ -199,11 +199,12 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect if (g_is_interrupted) return 0; mtmd::input_chunks chunks; + auto bitmaps_c_ptr = bitmaps.c_ptr(); int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks.ptr.get(), // output &text, // text - bitmaps.data(), // bitmaps - bitmaps.size()); + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); if (res != 0) { LOG_ERR("Unable to tokenize prompt, res = %d\n", res); return 1; diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 
c9eca3929d386..b85f3f910eef8 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -257,7 +257,9 @@ using input_chunks_ptr = std::unique_ptr entries; + ~bitmaps() = default; + // return list of pointers to mtmd_bitmap + // example: + // auto bitmaps_c_ptr = bitmaps.c_ptr(); + // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size()); + std::vector c_ptr() { + std::vector res(entries.size()); + for (size_t i = 0; i < entries.size(); i++) { + res[i] = entries[i].ptr.get(); + } + return res; + } +}; + struct input_chunks { input_chunks_ptr ptr; input_chunks() { From 3357961506aab06f74ee0fbd18a85215c39ca7f9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 18:16:18 +0200 Subject: [PATCH 23/59] add test target --- examples/llava/clip.h | 8 +++--- examples/llava/mtmd.cpp | 33 ++++++++++++++++++++++ examples/llava/mtmd.h | 7 +++-- tests/CMakeLists.txt | 4 +++ tests/test-mtmd-c-api.c | 61 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 tests/test-mtmd-c-api.c diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 0a53bd8eb78e1..0b0eb02956a32 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); -CLIP_API struct clip_image_size * clip_image_size_init(); -CLIP_API struct clip_image_u8 * clip_image_u8_init (); -CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava +CLIP_API struct clip_image_size * clip_image_size_init(void); +CLIP_API struct clip_image_u8 * clip_image_u8_init (void); +CLIP_API struct clip_image_f32 * clip_image_f32_init(void); +CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava // nx, ny are the output image dimensions CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index c543b2b2415d3..5567bd49b089b 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -888,3 +888,36 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { } return image_tokens->n_tokens(); } + +// test function + +mtmd_input_chunks * mtmd_test_create_input_chunks() { + mtmd_input_chunks * chunks = mtmd_input_chunks_init(); + if (!chunks) { + return nullptr; + } + + // create a text chunk + std::vector tokens_text = { 1, 2, 3, 4, 5 }; + mtmd_input_chunk chunk_text{ + MTMD_INPUT_CHUNK_TYPE_TEXT, + std::move(tokens_text), + {}, + }; + chunks->entries.emplace_back(std::move(chunk_text)); + + // create an image chunk + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + image_tokens->nx = 4; + image_tokens->ny = 4; + image_tokens->batch_f32.entries.resize(16); + image_tokens->id = "image_1"; + mtmd_input_chunk chunk_image{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }; + chunks->entries.emplace_back(std::move(chunk_image)); + + return chunks; +} diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index b85f3f910eef8..c89e386d1232e 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -73,7 +73,7 @@ struct mtmd_context_params { const char * 
image_marker; }; -MTMD_API mtmd_context_params mtmd_context_params_default(); +MTMD_API struct mtmd_context_params mtmd_context_params_default(void); // initialize the mtmd context // return nullptr on failure @@ -111,7 +111,7 @@ MTMD_API void mtmd_bitmap_set_id (mtmd_bitmap * bitmap, const char * id // // this is simply a list of mtmd_input_chunk // the elements can only be populated via mtmd_tokenize() -MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(); +MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); MTMD_API size_t mtmd_input_chunks_size(mtmd_input_chunks * chunks); MTMD_API mtmd_input_chunk * mtmd_input_chunks_get (mtmd_input_chunks * chunks, size_t idx); MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @@ -228,6 +228,9 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, ///////////////////////////////////////// +// test function, to be used in test-mtmd-c-api.c +MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); + #ifdef __cplusplus } // extern "C" #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae68275251d01..ffc086d352f06 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -162,6 +162,10 @@ if (NOT GGML_BACKEND_DL) llama_build_and_test(test-rope.cpp) endif() +# libmtmd +set(LLAMA_TEST_NAME test-mtmd-c-api) +llama_build_and_test(test-mtmd-c-api.c) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c new file mode 100644 index 0000000000000..e3105f24863ec --- /dev/null +++ b/tests/test-mtmd-c-api.c @@ -0,0 +1,61 @@ +#include +#include + +#define MTMD_TEST +#include "mtmd.h" + +int main(void) { + printf("\n\nTesting libmtmd C API...\n"); + printf("--------\n\n"); + + struct mtmd_context_params params = mtmd_context_params_default(); + printf("Default image marker: %s\n", params.image_marker); + + mtmd_input_chunks * chunks = mtmd_test_create_input_chunks(); + + if (!chunks) { + fprintf(stderr, "Failed to create input chunks\n"); + return 1; + } + + size_t n_chunks = mtmd_input_chunks_size(chunks); + printf("Number of chunks: %zu\n", n_chunks); + assert(n_chunks > 0); + + for (size_t i = 0; i < n_chunks; i++) { + mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i); + assert(chunk != NULL); + enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk); + printf("Chunk %zu type: %d\n", i, type); + + if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + printf(" Text chunk with %zu tokens\n", n_tokens); + assert(tokens != NULL); + assert(n_tokens > 0); + for (size_t j = 0; j < n_tokens; j++) { + assert(tokens[j] >= 0); + printf(" > Token %zu: %d\n", j, tokens[j]); + } + + } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk); + size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); + size_t nx = mtmd_image_tokens_get_nx(image_tokens); + size_t ny = mtmd_image_tokens_get_ny(image_tokens); + const char * id = mtmd_image_tokens_get_id(image_tokens); + assert(n_tokens > 0); + assert(nx > 0); + assert(ny > 0); + assert(id != NULL); + printf(" Image chunk with %zu tokens\n", n_tokens); + printf(" Image size: %zu x %zu\n", nx, ny); + printf(" Image ID: %s\n", id); + } + } + + printf("\n\nDONE: test libmtmd C API...\n"); + + return 0; +} From 
92d24045db72efde03d46257cfca54316b0c953f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 18:17:15 +0200 Subject: [PATCH 24/59] rm redundant define --- tests/test-mtmd-c-api.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c index e3105f24863ec..298afd6d42b73 100644 --- a/tests/test-mtmd-c-api.c +++ b/tests/test-mtmd-c-api.c @@ -1,7 +1,6 @@ #include #include -#define MTMD_TEST #include "mtmd.h" int main(void) { From 111d5afa841ab20834b5c21b1aa4737a186aeb92 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 18:18:38 +0200 Subject: [PATCH 25/59] test: mtmd_input_chunks_free --- tests/test-mtmd-c-api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c index 298afd6d42b73..6d968e2afa6c4 100644 --- a/tests/test-mtmd-c-api.c +++ b/tests/test-mtmd-c-api.c @@ -54,6 +54,9 @@ int main(void) { } } + // Free the chunks + mtmd_input_chunks_free(chunks); + printf("\n\nDONE: test libmtmd C API...\n"); return 0; From 08d0f9cfe0a6919022cf72213726cec0be578333 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 18:20:41 +0200 Subject: [PATCH 26/59] rm outdated comment --- examples/llava/mtmd.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index c89e386d1232e..1fdd0912c0a70 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -55,7 +55,6 @@ struct mtmd_input_text { // // C API -// this is made to closely resemble the C++ API // typedef struct mtmd_context mtmd_context; @@ -211,11 +210,7 @@ MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, bool logits_last, llama_pos * new_n_past); -// helper function to evaluate a single chunk -// 1. run llama_decode() on text chunks -// 2. 
run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error -// otherwise, returns 0 on success +// works like mtmd_helper_eval_chunks(), but only for a single chunk // this function is NOT thread-safe MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, struct llama_context * lctx, From 863db3153f5af46d58edee7e3579d120ddbc7a05 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 2 May 2025 22:15:02 +0200 Subject: [PATCH 27/59] fix merging issue --- tools/llava/mtmd-cli.cpp | 2 ++ tools/llava/mtmd.cpp | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/llava/mtmd-cli.cpp b/tools/llava/mtmd-cli.cpp index 836d48b9fd6d6..9deb2e55e52af 100644 --- a/tools/llava/mtmd-cli.cpp +++ b/tools/llava/mtmd-cli.cpp @@ -211,6 +211,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_ return 1; } + ctx.bitmaps.entries.clear(); + llama_pos new_n_past; if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(), ctx.lctx, // lctx diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 6dc3977b2a6ee..5dfcce58d1213 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -591,21 +591,29 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { size_t n_tokens; const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); - text_batch.n_tokens = n_tokens; LOG_DBG("decoding text chunk, n_tokens = %zu\n", n_tokens); size_t i = 0; while (i < n_tokens) { // split into batches + text_batch.n_tokens = 0; // clear the batch for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) { + text_batch.n_tokens++; text_batch.token [i] = tokens[i]; text_batch.pos [i] = n_past++; text_batch.n_seq_id[i] = 1; text_batch.seq_id [i][0] = seq_id; text_batch.logits [i] = false; } - bool is_last_batch = (i == n_tokens); - if (logits_last && is_last_batch) { + bool is_last_token = (i == n_tokens); + if (logits_last && is_last_token) { text_batch.logits[text_batch.n_tokens - 1] = true; } + ret = llama_decode(lctx, text_batch); + if (ret != 0) { + LOG_ERR("failed to decode text\n"); + llama_batch_free(text_batch); + return ret; + } + *new_n_past += text_batch.n_tokens; } } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { @@ -697,10 +705,10 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, } for (size_t i = 0; i < n_chunks; i++) { - bool is_last_chunk = (i == n_chunks - 1); + bool chunk_logits_last = (i == n_chunks - 1) && logits_last; auto chunk = mtmd_input_chunks_get(chunks, i); - int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, is_last_chunk && logits_last, &n_past); + int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past); if (res != 0) { LOG_ERR("failed to eval chunk %zu\n", i); return res; From a0fb7016b6a5c5d37b9f8b526d2099d6f57c4a7b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 2 May 2025 22:16:44 +0200 Subject: [PATCH 28/59] explicitly create mtmd::input_chunks --- tools/llava/mtmd-cli.cpp | 2 +- tools/llava/mtmd.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/llava/mtmd-cli.cpp b/tools/llava/mtmd-cli.cpp index 9deb2e55e52af..dd18e0fe6ed0d 100644 --- a/tools/llava/mtmd-cli.cpp +++ b/tools/llava/mtmd-cli.cpp @@ -199,7 +199,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool 
add_ if (g_is_interrupted) return 0; - mtmd::input_chunks chunks; + mtmd::input_chunks chunks(mtmd_input_chunks_init()); auto bitmaps_c_ptr = ctx.bitmaps.c_ptr(); int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks.ptr.get(), // output diff --git a/tools/llava/mtmd.h b/tools/llava/mtmd.h index 1fdd0912c0a70..68ffa4bd5efc4 100644 --- a/tools/llava/mtmd.h +++ b/tools/llava/mtmd.h @@ -287,9 +287,8 @@ struct bitmaps { struct input_chunks { input_chunks_ptr ptr; - input_chunks() { - ptr.reset(mtmd_input_chunks_init()); - } + input_chunks() = default; + input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} ~input_chunks() = default; size_t size() { return mtmd_input_chunks_size(ptr.get()); } mtmd_input_chunk * operator[](size_t idx) { From 6bc7a30af5cf87a54b70a730450e95930d3a1adb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 2 May 2025 22:25:01 +0200 Subject: [PATCH 29/59] mtmd_input_chunk_copy --- tools/llava/mtmd.cpp | 13 ++++++++++--- tools/llava/mtmd.h | 7 +++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 5dfcce58d1213..78e0ea0c57c22 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -853,9 +853,16 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu return nullptr; } -mtmd_input_chunk * mtmd_input_chunk_release(mtmd_input_chunk * chunk) { - mtmd_input_chunk * copy = new mtmd_input_chunk; - *copy = std::move(*chunk); +mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { + mtmd_input_chunk * copy = new mtmd_input_chunk{ + chunk->type, + chunk->tokens_text, + mtmd_image_tokens_ptr(), + }; + if (chunk->tokens_image) { + // copy the image tokens + copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens(*chunk->tokens_image)); + } return copy; } diff --git a/tools/llava/mtmd.h b/tools/llava/mtmd.h index 68ffa4bd5efc4..0b40851ac6627 100644 --- a/tools/llava/mtmd.h +++ b/tools/llava/mtmd.h @@ -124,11 +124,10 @@ MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); // in case you want to use custom logic to handle the chunk (i.e. 
KV cache management) -// you can move the chunk ownership to your own code -// this will release the chunk from the list of input chunks +// you can move the chunk ownership to your own code by copying it // remember to free the chunk when you are done with it -MTMD_API mtmd_input_chunk * mtmd_input_chunk_release(mtmd_input_chunk * chunk); -MTMD_API void mtmd_input_chunk_free (mtmd_input_chunk * chunk); +MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); // mtmd_image_tokens From 4d842eb924e7757fa1a9c33499316ca71ac74189 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 2 May 2025 22:39:05 +0200 Subject: [PATCH 30/59] add clone() --- tools/llava/clip-impl.h | 9 +++++++++ tools/llava/mtmd.cpp | 13 ++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/llava/clip-impl.h b/tools/llava/clip-impl.h index b575ca4d7c2a9..e498e8aaf8fd0 100644 --- a/tools/llava/clip-impl.h +++ b/tools/llava/clip-impl.h @@ -231,6 +231,15 @@ struct clip_image_u8_batch { struct clip_image_f32_batch { std::vector entries; + + clip_image_f32_batch clone() const { + clip_image_f32_batch new_batch; + new_batch.entries.reserve(entries.size()); + for (const auto & entry : entries) { + new_batch.entries.emplace_back(new clip_image_f32(*entry)); + } + return new_batch; + } }; // diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 78e0ea0c57c22..b2fbd4cafb987 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -166,6 +166,16 @@ struct mtmd_image_tokens { uint32_t n_tokens() const { return nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + + mtmd_image_tokens clone() { + return mtmd_image_tokens{ + nx, + ny, + use_mrope_pos, + batch_f32.clone(), + id + }; + } }; mtmd_context * mtmd_init_from_file(const char * mmproj_fname, @@ -861,7 +871,8 @@ mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { }; if (chunk->tokens_image) { // copy the image tokens - copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens(*chunk->tokens_image)); + copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens()); + *copy->tokens_image = chunk->tokens_image->clone(); } return copy; } From 2cedd1808aeafb71583022c1d4a09d79f87b3291 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 22:50:09 +0200 Subject: [PATCH 31/59] improve server_input struct --- tools/server/server.cpp | 71 ++++--- tools/server/utils.hpp | 415 ++++++++++++++++------------------------ 2 files changed, 204 insertions(+), 282 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9428fceb0396c..0c0c209b5fdaa 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1861,7 +1861,6 @@ struct server_context { llama_context_params cparams_dft; llama_batch batch; - server_batch_embd batch_embd; bool clean_kv_cache = true; bool add_bos_token = true; @@ -2057,8 +2056,7 @@ struct server_context { // note that n_batch can be > n_ctx (e.g. 
for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); - batch_embd = server_batch_embd(std::max(n_batch, params_base.n_parallel)); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); @@ -2843,7 +2841,8 @@ struct server_context { break; } tokens.resize(token_count); - slot->cache_tokens.set_text_tokens(tokens); + slot->cache_tokens.clear(); + slot->cache_tokens.insert(tokens); const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -2958,11 +2957,13 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); + for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { + new_tokens[i - n_discard] = new_tokens[i]; } - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + slot.cache_tokens.clear(); + slot.cache_tokens.insert(new_tokens); } slot.n_past -= n_discard; @@ -3001,7 +3002,7 @@ struct server_context { slot.n_past += 1; if (slot.params.cache_prompt) { - slot.cache_tokens.add_text_token(slot.sampled); + slot.cache_tokens.push_back(slot.sampled); } SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", @@ -3111,7 +3112,8 @@ struct server_context { curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, curr_tokens.end()); - prompt_tokens.set_text_tokens(new_tokens); + prompt_tokens.clear(); + prompt_tokens.insert(new_tokens); slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); @@ -3143,7 +3145,7 @@ struct server_context { size_t n_match = 0; while (head_c + n_match < slot.cache_tokens.size() && head_p + n_match < prompt_tokens.size() && - slot.cache_tokens[head_c + n_match].txt == prompt_tokens[head_p + n_match].txt) { + slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { n_match++; } @@ -3160,7 +3162,7 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { - slot.cache_tokens[head_p + i].txt = slot.cache_tokens[head_c + i].txt; + slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]); slot.n_past++; } @@ -3206,15 +3208,15 @@ struct server_context { SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); // remove the non-common part from the cache - slot.cache_tokens.keep_until(slot.n_past); + slot.cache_tokens.resize(slot.n_past); - auto & cur_tok = slot.prompt_tokens[slot.n_past]; + llama_token cur_tok = slot.prompt_tokens[slot.n_past]; // check if we should process the image - if (cur_tok.img) { + if (cur_tok == LLAMA_TOKEN_NULL) { // process the image - int32_t res = server_img_process(ctx, mctx, cur_tok, batch_embd, slot.n_past, slot.id); - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(cur_tok.img.get()); + int32_t n_pos = slot.n_past; + int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, n_pos); if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); slot.release(); @@ -3223,30 +3225,27 @@ struct server_context { } if (slot.params.cache_prompt) { - // all ALL image tokens at once - for (int32_t i = 0; i < n_tokens; 
i++) { - slot.cache_tokens.add_token(std::move(slot.prompt_tokens[slot.n_past + i])); - } + slot.prompt_tokens.move_chunk(slot.cache_tokens, slot.n_past); } - slot.n_past += n_tokens; - slot.n_prompt_tokens_processed += n_tokens; + slot.n_past += n_pos; + slot.n_prompt_tokens_processed += n_pos; } // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { // get next token to process - auto & curr_chunk = slot.prompt_tokens[slot.n_past]; - if (curr_chunk.txt == LLAMA_TOKEN_NULL) { + llama_token cur_tok = slot.prompt_tokens[slot.n_past]; + if (cur_tok == LLAMA_TOKEN_NULL) { break; // end of text chunk } // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch, curr_chunk.txt, slot.n_past, { slot.id }, need_embd); + common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { - slot.cache_tokens.add_text_token(curr_chunk.txt); + slot.cache_tokens.push_back(cur_tok); } slot.n_prompt_tokens_processed++; @@ -3267,9 +3266,9 @@ struct server_context { // Process all prompt tokens through sampler system for (size_t i = 0; i < slot.cache_tokens.size(); ++i) { - auto & cur_tok = slot.prompt_tokens[i]; - if (cur_tok.txt != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, cur_tok.txt, false); + llama_token id = slot.prompt_tokens[i]; + if (id != LLAMA_TOKEN_NULL) { + common_sampler_accept(slot.smpl, id, false); } } @@ -3447,7 +3446,7 @@ struct server_context { params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.p_min = slot.params.speculative.p_min; - llama_tokens cached_text_tokens = slot.cache_tokens.get_text_tokens(); + const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens(); llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); // keep track of total number of tokens generated in the draft @@ -3481,9 +3480,9 @@ struct server_context { // update how many tokens out of draft was accepted slot.n_draft_accepted += ids.size() - 1; - slot.cache_tokens.add_text_token(id); + slot.cache_tokens.push_back(id); for (auto & t : ids) { - slot.cache_tokens.add_text_token(t); + slot.cache_tokens.push_back(t); } llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); @@ -4088,7 +4087,7 @@ int main(int argc, char ** argv) { throw std::runtime_error("Failed to load image"); } // calculate bitmap hash (for KV caching) - bmp.id = server_tokens::fnv_hash(bmp.data.data(), bmp.data.size()); + bmp.id = fnv_hash(bmp.data.data(), bmp.data.size()); bitmaps.push_back(std::move(bmp)); } } @@ -4117,7 +4116,7 @@ int main(int argc, char ** argv) { // non-multimodal version auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); for (auto & p : tokenized_prompts) { - auto tmp = convert_legacy_to_mtmd(p); + auto tmp = server_tokens(p, ctx_server.mctx != nullptr); inputs.push_back(std::move(tmp)); } } @@ -4461,7 +4460,7 @@ int main(int argc, char ** argv) { task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = convert_legacy_to_mtmd(tokenized_prompts[i]); + task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr); // OAI-compat task.params.oaicompat = oaicompat; @@ -4562,7 +4561,7 @@ int main(int argc, char ** argv) { server_task task = 
server_task(SERVER_TASK_TYPE_RERANK); task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = convert_legacy_to_mtmd(tmp); + task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr); tasks.push_back(std::move(task)); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 5232c62563520..1d093112de5ee 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -988,33 +988,30 @@ static std::vector parse_lora_request( // (may need to refactor in near future) // -// each chunk can contain either one SINGLE text token or pointer to image -// this is to simplify the logic of KV cache management -struct server_token { - llama_token txt; - std::shared_ptr img; - std::string str() const { - // for debugging - GGML_ASSERT(img || txt != LLAMA_TOKEN_NULL); - if (img) { - return " "; - } else { - return std::to_string(txt) + " "; - } - } -}; - /** * server_tokens is a helper to manage the input tokens and image for the server. * it is made this way to simplify the logic of KV cache management. - * - * each token can be either a text token or a pointer to an image. - * if image usually contains multiple tokens, each token contains a shared_ptr to the same image. */ struct server_tokens { bool has_mtmd = false; - std::vector values; +private: // disallow accessing these members directly, risking out-of-sync + + // map a **start** position in tokens to the image chunk + std::unordered_map map_pos_to_image; + + // list of tokens + // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token + // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position** + // important: for models using mrope, an image can contain multiple tokens but will use only one **position** + llama_tokens tokens; + + // for ex. 
with input of 5 text tokens and 2 images: + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] + // pos 0 1 2 3 4 5 6 7 8 9 + // map_pos_to_image will contain: {5, img0}, {8, img1} + +public: server_tokens() = default; ~server_tokens() = default; @@ -1027,89 +1024,141 @@ struct server_tokens { server_tokens& operator=(server_tokens&&) = default; // Allow accessing elements using [] operator - server_token& operator[](size_t index) { return values[index]; } - const server_token& operator[](size_t index) const { return values[index]; } + llama_token operator[](size_t index) { return tokens[index]; } + const llama_token& operator[](size_t index) const { return tokens[index]; } server_tokens(mtmd_input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { for (auto & c : mtmd_chunks) { - if (c.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - add_image_tokens(std::move(c.tokens_image)); - } else if (c.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - for (auto & tok : c.tokens_text) { - add_text_token(tok); - } + push_back(std::move(c)); + } + } + + server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {} + + // for debugging + std::string str() const { + std::ostringstream oss; + for (const auto & t : tokens) { + if (t == LLAMA_TOKEN_NULL) { + oss << " "; } else { - GGML_ABORT("Invalid chunk type"); + oss << t << " "; } } + return oss.str(); } - std::string str() const { - // for debugging - std::string ret; - for (const auto & t : values) { - ret += t.str(); + const mtmd_input_chunk & find_chunk(llama_pos pos) const { + auto it = map_pos_to_image.find(pos); + if (it != map_pos_to_image.end()) { + return it->second; + } else { + throw std::runtime_error("Chunk not found"); + } + } + + void push_back(llama_token tok) { + if (tok == LLAMA_TOKEN_NULL) { + throw std::runtime_error("Invalid token"); } - return ret; + tokens.emplace_back(tok); + } + + void push_back(mtmd_input_chunk && chunk) { + if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + GGML_ASSERT(has_mtmd); + GGML_ASSERT(chunk.tokens_image != nullptr); + const int n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + llama_pos start_pos = tokens.size(); + for (int i = 0; i < n_pos; ++i) { + tokens.emplace_back(LLAMA_TOKEN_NULL); + } + // TODO: use mtmd_input_chunk_copy when the C API is ready + map_pos_to_image[start_pos] = std::move(chunk); + } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + for (auto & tok : chunk.tokens_text) { + push_back(tok); + } + } else { + GGML_ABORT("Invalid chunk type"); + } + } + + // TODO: use mtmd_input_chunk_copy when the C API is ready + void move_chunk(server_tokens & dst, llama_pos pos) { + auto it = map_pos_to_image.find(pos); + if (it == map_pos_to_image.end()) { + throw std::runtime_error("Chunk not found"); + } + dst.push_back(std::move(it->second)); + } + + void insert(llama_tokens & tokens) { + tokens.insert(tokens.end(), tokens.begin(), tokens.end()); } size_t size() const { - return values.size(); + return tokens.size(); } bool empty() const { - return values.empty(); + return tokens.empty(); } void clear() { - values.clear(); + tokens.clear(); } void resize(size_t n) { - values.resize(n); - } - - void add_token(server_token && t) { - if (t.img) GGML_ASSERT(has_mtmd); - values.push_back(std::move(t)); + // we throw an error if we try to remove a token in the middle of an image + // for ex. 
with input of 5 text tokens and 2 images: + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] + // n 1 2 3 4 5 6 7 8 9 10 + // allowed to resize ^ ^ + // disallowed to resize ^ ^ ^ + if (n > 0) { + llama_token last_token = tokens[n - 1]; + // make sure we never remove tokens in the middle of an image + if (last_token == LLAMA_TOKEN_NULL) { + find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk + } + } + tokens.resize(n); } - void add_text_token(llama_token tok) { - GGML_ASSERT(tok != LLAMA_TOKEN_NULL); - values.push_back({tok, nullptr}); + // for compatibility with speculative decoding and ctx shift + const llama_tokens & get_text_tokens() const { + return tokens; } - void add_image_tokens(mtmd_image_tokens_ptr && image) { - GGML_ASSERT(has_mtmd); - GGML_ASSERT(image != nullptr); - std::shared_ptr tok_image(std::move(image)); - size_t n_tokens = mtmd_image_tokens_get_n_tokens(tok_image.get()); - GGML_ASSERT(n_tokens > 0 && "Invalid image token"); // should never happen - for (size_t i = 0; i < n_tokens; ++i) { - values.push_back({LLAMA_TOKEN_NULL, tok_image}); - } + // for compatibility with speculative decoding + void set_token(llama_pos pos, llama_token id) { + // TODO: may need validation + tokens[pos] = id; } size_t get_common_prefix(const server_tokens & b) const { - size_t max_idx = std::min(values.size(), b.values.size()); + size_t max_idx = std::min(tokens.size(), b.tokens.size()); for (size_t i = 0; i < max_idx; ++i) { - auto & ai = values[i]; - auto & bi = b.values[i]; + auto & ai = tokens[i]; + auto & bi = b.tokens[i]; - if (ai.txt == bi.txt && !ai.img && !bi.img) { - continue; - } else if (ai.img && bi.img) { + if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) { GGML_ASSERT(has_mtmd); - std::string ai_id = mtmd_image_tokens_get_id(ai.img.get()); - std::string bi_id = mtmd_image_tokens_get_id(bi.img.get()); + const auto & a_chunk = find_chunk(i); + const auto & b_chunk = b.find_chunk(i); + std::string ai_id = mtmd_image_tokens_get_id(a_chunk.tokens_image.get()); + std::string bi_id = mtmd_image_tokens_get_id(b_chunk.tokens_image.get()); if (ai_id == bi_id) { - size_t n_tokens = mtmd_image_tokens_get_n_tokens(ai.img.get()); - GGML_ASSERT(n_tokens > 0 && "Invalid image token"); // should never happen - i += mtmd_image_tokens_get_n_tokens(ai.img.get()) - 1; + size_t n_pos = mtmd_image_tokens_get_n_pos(a_chunk.tokens_image.get()); + GGML_ASSERT(n_pos > 0 && "Invalid image token"); // should never happen + i += n_pos - 1; // will be +1 by the for loop continue; } else { return i; } + } else if (ai == bi) { + continue; } else { return i; } @@ -1119,198 +1168,72 @@ struct server_tokens { // make sure all text tokens are within the vocab range bool validate(llama_token max_vocab_id) const { - for (const auto & t : values) { - if (!t.img) { - if (t.txt < 0 || t.txt >= max_vocab_id) { + for (size_t i = 0; i < tokens.size(); ++i) { + auto & t = tokens[i]; + if (t == LLAMA_TOKEN_NULL) { + try { + const auto & chunk = find_chunk(i); + size_t n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + i += n_pos - 1; // will be +1 by the for loop + } catch (const std::exception & e) { return false; } } - } - return true; - } - - // same idea with std::vector::resize() - void keep_until(size_t pos) { - // TODO : maybe throw error we remove part of the image (only allow removing the whole image) - // this cannot happen currently because get_common_prefix() only never returns such pos - values.resize(pos); - } - - // Computes FNV-1a hash of the data - 
static std::string fnv_hash(const uint8_t * data, size_t len) { - const uint64_t fnv_prime = 0x100000001b3ULL; - uint64_t hash = 0xcbf29ce484222325ULL; - - for (size_t i = 0; i < len; ++i) { - hash ^= data[i]; - hash *= fnv_prime; - } - return std::to_string(hash); - } - - // TODO: maybe implement a (de)seralizer for this struct, so we can get rid of functions below - - // return all text tokens (for legacy code), to be used by save/load slot - llama_tokens get_text_tokens() { - llama_tokens output; - for (auto & t : values) { - if (t.txt != LLAMA_TOKEN_NULL) { - output.push_back(t.txt); + if (t < 0 || t >= max_vocab_id) { + return false; } } - return output; + return true; } - // clear and set text tokens (for legacy code), to be used by save/load slot - void set_text_tokens(llama_tokens tokens) { - values.clear(); - for (auto & tok : tokens) { - add_text_token(tok); + // TODO: (IMPORTANT) this is hacky ; use mtmd helper when C API is ready + int32_t process_chunk( + llama_context * ctx, + mtmd_context * mctx, + llama_pos n_past, + int32_t seq_id, + llama_pos & n_pos_out) { + auto it = map_pos_to_image.find(n_past); + if (it == map_pos_to_image.end()) { + throw std::runtime_error("Chunk not found"); } - } -}; - -// helper struct to make working with embd batch easier -// note: this will be removed after llama_batch_ext refactoring -struct server_batch_embd { - std::vector pos; - std::vector token; - std::vector n_seq_id; - std::vector seq_id; - std::vector seq_ids; - std::vector logits; - - llama_batch batch; - - server_batch_embd() : server_batch_embd(1) {} - server_batch_embd(int32_t n_tokens) { - token .resize(n_tokens); - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - logits .resize(n_tokens); - seq_id .resize(n_tokens); - seq_ids .resize(n_tokens + 1); - seq_ids[n_tokens] = nullptr; - - batch = { - /*n_tokens =*/ 0, - /*tokens =*/ token.data(), - /*embd =*/ nullptr, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; // only a single seq_id per token is needed - batch.seq_id [i] = seq_id.data() + i; + size_t n_pos = mtmd_image_tokens_get_n_pos(it->second.tokens_image.get()); + mtmd_input_chunks chunks; + { + mtmd_input_chunk chunk0{ + /* type */ MTMD_INPUT_CHUNK_TYPE_IMAGE, + /* tokens_text */ {}, + /* tokens_image */ std::move(it->second.tokens_image), // move it back later + }; + mtmd_input_chunk chunk1{ + /* type */ MTMD_INPUT_CHUNK_TYPE_TEXT, + /* tokens_text */ {}, + /* tokens_image */ nullptr, + }; + chunks.emplace_back(std::move(chunk0)); + chunks.emplace_back(std::move(chunk1)); } - } - - void reserve_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - GGML_ASSERT(n_tokens <= (int32_t)pos.size()); - batch.n_tokens = n_tokens; - batch.embd = embd; - batch.token = nullptr; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i][0] = seq_id; - batch.logits [i] = false; + int32_t n_batch = llama_n_batch(ctx); + int32_t result = mtmd_helper_eval(mctx, ctx, chunks, n_past, seq_id, n_batch); + it->second.tokens_image = std::move(chunks[0].tokens_image); + if (result != 0) { + LOG_ERR("mtmd_helper_eval failed with status %d", result); + n_pos_out = 0; + return result; } - } - - void clear() { - batch.n_tokens = 0; - batch.embd = nullptr; - batch.token = token.data(); - } - - int32_t n_tokens() const { - return batch.n_tokens; - } - - bool has_embd() 
const { - return batch.embd != nullptr && batch.n_tokens > 0; - } - - bool has_text() const { - return batch.token != nullptr && batch.n_tokens > 0; + n_pos_out = n_pos; + return 0; } }; -static int32_t server_img_process( - llama_context * ctx, - mtmd_context * mctx, - server_token & chunk, - server_batch_embd & batch, - llama_pos n_past, - int slot_id) { - GGML_ASSERT(chunk.img); - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.img.get()); - int32_t ret; - - // encode the image - { - int64_t t0 = ggml_time_ms(); - SRV_INF("encoding image (%d tokens)...\n", (int)n_tokens); - ret = mtmd_encode(mctx, chunk.img.get()); - if (ret != 0) { - SRV_ERR("failed to encode image, status = %d\n", ret); - return ret; - } - SRV_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); - } - - float * embd = mtmd_get_output_embd(mctx); - // decode the embeddings - int64_t t1 = ggml_time_ms(); - int32_t n_embd = llama_model_n_embd(llama_get_model(ctx)); - int32_t n_batch = batch.pos.size(); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; - // split into batches - while (i_batch < n_img_batches) { - int32_t pos_offset = i_batch*n_batch; - int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); - float * embd_batch = embd + pos_offset*n_embd; - batch.clear(); - batch.reserve_embd_batch(embd_batch, n_tokens_batch, n_past, slot_id); - - SRV_INF("decoding embd batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); - - // TODO @ngxson : maybe move this to llama_batch_ext - if (mtmd_decode_use_non_causal(mctx)) { - llama_set_causal_attn(ctx, false); - } - - ret = llama_decode(ctx, batch.batch); - if (ret != 0) { - LOG_ERR("failed to decode image\n"); - llama_set_causal_attn(ctx, true); // restore causal attn - return ret; - } - - if (mtmd_decode_use_non_causal(mctx)) { - llama_set_causal_attn(ctx, true); - } - - i_batch++; - n_past += n_tokens_batch; - } - SRV_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); +// Computes FNV-1a hash of the data +static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; - batch.clear(); - return 0; -} - -// hacky, support text-only for now -static server_tokens convert_legacy_to_mtmd(llama_tokens & tokenized) { - server_tokens res; - res.has_mtmd = false; - for (auto & tok : tokenized) { - res.add_text_token(tok); + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; } - return res; + return std::to_string(hash); } From 3ee071c845a54274de6fb5f39910297b49ea35f5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 23:11:26 +0200 Subject: [PATCH 32/59] clip : fix confused naming ffn_up and ffn_down --- convert_hf_to_gguf.py | 6 ++++++ gguf-py/gguf/tensor_mapping.py | 11 ++++------- tools/llava/clip.cpp | 27 +++++++++++++++++++++------ tools/llava/mtmd-cli.cpp | 4 ++++ 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 838999531e580..cbc88134795a1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1778,6 +1778,12 @@ class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hf_arch == "VLlama3ForCausalLM": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + def 
set_vocab(self): try: self._set_vocab_sentencepiece() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2b089f84a841a..003b0172c77b0 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -977,15 +977,12 @@ class TensorNameMap: "visual.blocks.{bid}.norm2", # qwen2vl ), - # some namings are messed up because the original llava code swapped fc1 and fc2 - # we have no better way to fix it, just be careful - # new models like pixtral use the correct naming MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", - "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral - "visual.blocks.{bid}.mlp.fc2", # qwen2vl + "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl ), @@ -997,9 +994,9 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", - "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral - "visual.blocks.{bid}.mlp.fc1", # qwen2vl + "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl ), diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp index 7607d4e3ae3a4..75cfa0946ef79 100644 --- a/tools/llava/clip.cpp +++ b/tools/llava/clip.cpp @@ -471,14 +471,14 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); } - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b); // siglip uses gelu cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b); // residual 2 cur = ggml_add(ctx0, embeddings, cur); @@ -1798,6 +1798,7 @@ struct clip_model_loader { } void load_tensors() { + auto & hparams = ctx_clip.vision_model.hparams; std::map tensor_offset; std::vector tensors_to_load; @@ -1851,8 +1852,8 @@ struct clip_model_loader { vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); // layers - vision_model.layers.resize(vision_model.hparams.n_layer); - for (int il = 0; il < vision_model.hparams.n_layer; ++il) { + vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = vision_model.layers[il]; layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight")); layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight")); @@ -1875,6 +1876,20 @@ struct clip_model_loader { layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); + // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here + if (layer.ff_up_w && layer.ff_down_w + && 
layer.ff_up_w->ne[0] == hparams.n_intermediate + && layer.ff_down_w->ne[0] == hparams.hidden_size) { + // swap up and down weights + ggml_tensor * tmp = layer.ff_up_w; + layer.ff_up_w = layer.ff_down_w; + layer.ff_down_w = tmp; + // swap up and down biases + tmp = layer.ff_up_b; + layer.ff_up_b = layer.ff_down_b; + layer.ff_down_b = tmp; + } + // legacy naming (the in and out is reversed! don't ask me why) layer.ff_i_w = layer.ff_down_w; layer.ff_o_w = layer.ff_up_w; diff --git a/tools/llava/mtmd-cli.cpp b/tools/llava/mtmd-cli.cpp index 474e7c4f8357e..e3db823799674 100644 --- a/tools/llava/mtmd-cli.cpp +++ b/tools/llava/mtmd-cli.cpp @@ -92,6 +92,10 @@ struct mtmd_cli_context { batch = llama_batch_init(params.n_batch, 0, 1); n_batch = params.n_batch; + if (!model || !lctx) { + exit(1); + } + if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) { LOG_ERR("Model does not have chat template.\n"); LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n"); From 3fbf0bd01a3333345855a2f9af5c5eef5eae1709 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 23:20:49 +0200 Subject: [PATCH 33/59] rm ffn_i/o/g naming --- tools/llava/clip.cpp | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp index 75cfa0946ef79..3a890465d84e5 100644 --- a/tools/llava/clip.cpp +++ b/tools/llava/clip.cpp @@ -191,12 +191,6 @@ struct clip_layer { struct ggml_tensor * ln_1_w = nullptr; struct ggml_tensor * ln_1_b = nullptr; - // ff - struct ggml_tensor * ff_i_w = nullptr; // legacy naming - struct ggml_tensor * ff_i_b = nullptr; // legacy naming - struct ggml_tensor * ff_o_w = nullptr; // legacy naming - struct ggml_tensor * ff_o_b = nullptr; // legacy naming - struct ggml_tensor * ff_up_w = nullptr; struct ggml_tensor * ff_up_b = nullptr; struct ggml_tensor * ff_gate_w = nullptr; @@ -204,9 +198,6 @@ struct clip_layer { struct ggml_tensor * ff_down_w = nullptr; struct ggml_tensor * ff_down_b = nullptr; - struct ggml_tensor * ff_g_w = NULL; - struct ggml_tensor * ff_g_b = NULL; - // layernorm 2 struct ggml_tensor * ln_2_w = nullptr; struct ggml_tensor * ln_2_b = nullptr; @@ -976,11 +967,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ // mlp // ffn_up - auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b); + auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_down_b); - auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur); - cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b); + auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); + cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_gate_b); // TODO : only 2 of these 3 are actually used, should we remove one of them? 
if (ctx->use_gelu) { cur_gate = ggml_gelu_inplace(ctx0, cur_gate); @@ -992,8 +983,8 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ cur = ggml_mul(ctx0, cur_gate, cur_up); // ffn_down - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b); // residual 2 cur = ggml_add(ctx0, embeddings, cur); @@ -1250,8 +1241,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); } - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b); if (ctx->use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); @@ -1261,8 +1252,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im cur = ggml_gelu_quick_inplace(ctx0, cur); } - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b); // residual 2 cur = ggml_add(ctx0, embeddings, cur); @@ -1889,14 +1880,6 @@ struct clip_model_loader { layer.ff_up_b = layer.ff_down_b; layer.ff_down_b = tmp; } - - // legacy naming (the in and out is reversed! don't ask me why) - layer.ff_i_w = layer.ff_down_w; - layer.ff_o_w = layer.ff_up_w; - layer.ff_g_w = layer.ff_gate_w; - layer.ff_i_b = layer.ff_down_b; - layer.ff_o_b = layer.ff_up_b; - layer.ff_g_b = layer.ff_gate_b; } switch (ctx_clip.proj_type) { From f3870a63ed37e2b2aa6a80aa05daf577457f84bb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 23:29:00 +0200 Subject: [PATCH 34/59] rename n_embd, n_ff --- tools/llava/clip.cpp | 98 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp index 3a890465d84e5..b70be80b4b043 100644 --- a/tools/llava/clip.cpp +++ b/tools/llava/clip.cpp @@ -155,8 +155,8 @@ enum patch_merge_type { struct clip_hparams { int32_t image_size; int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; + int32_t n_embd; + int32_t n_ff; int32_t projection_dim; int32_t n_head; int32_t n_layer; @@ -377,9 +377,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; + const int n_embd = hparams.hidden_size; const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; + const int d_head = n_embd / n_head; const int n_layer = hparams.n_layer; const float eps = hparams.eps; @@ -400,7 +400,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im ggml_set_input(inp_raw); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_reshape_2d(ctx0, inp, num_patches, n_embd); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); @@ -445,7 +445,7 @@ static ggml_cgraph * 
clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + cur = ggml_cont_2d(ctx0, KQV, n_embd, num_patches); } // attention output @@ -493,11 +493,11 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im const int kernel_size = patches_per_image / tokens_per_side; embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); - embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size); + embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, n_embd, batch_size); // doing a pool2d to reduce the number of output tokens to 256 embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size); + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], n_embd, batch_size); embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); // apply norm before projection @@ -626,9 +626,9 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i const int n_patches_x = image_size_width / patch_size; const int n_patches_y = image_size_height / patch_size; const int num_patches = n_patches_x * n_patches_y; - const int hidden_size = hparams.hidden_size; + const int n_embd = hparams.n_embd; const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; + const int d_head = n_embd / n_head; const int n_layer = hparams.n_layer; const float eps = hparams.eps; const int n_merge = hparams.spatial_merge_size; @@ -658,7 +658,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i ggml_set_input(pos_w); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_reshape_2d(ctx0, inp, num_patches, n_embd); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); struct ggml_tensor * embeddings = inp; @@ -699,7 +699,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + cur = ggml_cont_2d(ctx0, KQV, n_embd, num_patches); cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur); } @@ -742,8 +742,8 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, hidden_size, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, hidden_size] + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] cur = ggml_cont(ctx0, cur); // torch.nn.functional.unfold is just an im2col under the hood @@ -751,7 +751,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - // project to hidden_size + // project to 
n_embd cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); embeddings = cur; @@ -774,9 +774,9 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i // arrangement of the [IMG_BREAK] token { // not efficient, but works - // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows] + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension - // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; @@ -816,9 +816,9 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position - const int hidden_size = hparams.hidden_size; + const int n_embd = hparams.n_embd; const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; + const int d_head = n_embd / n_head; const int n_layer = hparams.n_layer; const float eps = hparams.eps; @@ -853,14 +853,14 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] inp = ggml_reshape_4d( ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); + n_embd * 2, patches_w / 2, patches_h, batch_size); inp = ggml_reshape_4d( ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + n_embd * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); inp = ggml_reshape_3d( ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); + n_embd, patches_w * patches_h, batch_size); if (model.patch_bias) { // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); @@ -893,11 +893,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ ggml_set_name(window_mask, "window_mask"); ggml_set_input(window_mask); - // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + // embeddings shape: [n_embd, patches_w * patches_h, batch_size] GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4); + embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * 4, patches_w * patches_h * batch_size / 4); embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, patches_w * patches_h, batch_size); } // loop over layers @@ -950,7 +950,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + cur = ggml_cont_3d(ctx0, KQV, n_embd, num_positions, batch_size); } // attention 
output @@ -967,8 +967,8 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ // mlp // ffn_up - auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); - cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_down_b); + auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_up_b); auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_gate_b); @@ -983,8 +983,8 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ cur = ggml_mul(ctx0, cur_gate, cur_up); // ffn_down - cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b); // residual 2 cur = ggml_add(ctx0, embeddings, cur); @@ -1000,7 +1000,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w); } - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -1017,7 +1017,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ ggml_set_name(window_idx, "window_idx"); ggml_set_input(window_idx); - // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + // embeddings shape: [n_embd, patches_w * patches_h, batch_size] GGML_ASSERT(batch_size == 1); embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4); embeddings = ggml_get_rows(ctx0, embeddings, window_idx); @@ -1063,9 +1063,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? 
num_positions * 4 : num_positions; - const int hidden_size = hparams.hidden_size; + const int n_embd = hparams.n_embd; const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; + const int d_head = n_embd / n_head; const float eps = hparams.eps; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; @@ -1103,17 +1103,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] inp = ggml_reshape_4d( ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); + n_embd * 2, patches_w / 2, patches_h, batch_size); inp = ggml_reshape_4d( ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + n_embd * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); inp = ggml_reshape_3d( ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); + n_embd, patches_w * patches_h, batch_size); } else { - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_reshape_3d(ctx0, inp, num_patches, n_embd, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); } @@ -1126,7 +1126,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im // concat class_embeddings and patch_embeddings if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, num_positions, batch_size); embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); @@ -1223,7 +1223,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + cur = ggml_cont_3d(ctx0, KQV, n_embd, num_positions, batch_size); } // attention output @@ -1485,9 +1485,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } { // attention - int hidden_size = clip_n_mmproj_embd(ctx); + int n_embd = clip_n_mmproj_embd(ctx); const int d_head = 128; - int n_head = hidden_size/d_head; + int n_head = n_embd/d_head; int num_query = 96; if (ctx->minicpmv_version == 2) { num_query = 96; @@ -1517,7 +1517,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + KQV = ggml_cont_3d(ctx0, KQV, n_embd, num_query, batch_size); embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); } @@ -1553,7 +1553,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -1678,9 +1678,9 @@ 
struct clip_model_loader { get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_EMBD, hparams.n_embd); get_u32(KEY_N_HEAD, hparams.n_head); - get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_FF, hparams.n_ff); get_u32(KEY_N_BLOCK, hparams.n_layer); get_u32(KEY_PROJ_DIM, hparams.projection_dim); get_f32(KEY_LAYER_NORM_EPS, hparams.eps); @@ -1869,8 +1869,8 @@ struct clip_model_loader { // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here if (layer.ff_up_w && layer.ff_down_w - && layer.ff_up_w->ne[0] == hparams.n_intermediate - && layer.ff_down_w->ne[0] == hparams.hidden_size) { + && layer.ff_up_w->ne[0] == hparams.n_ff + && layer.ff_down_w->ne[0] == hparams.n_embd) { // swap up and down weights ggml_tensor * tmp = layer.ff_up_w; layer.ff_up_w = layer.ff_down_w; @@ -2891,7 +2891,7 @@ int32_t clip_get_patch_size(const struct clip_ctx * ctx) { } int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.hidden_size; + return ctx->vision_model.hparams.n_embd; } const char * clip_patch_merge_type(const struct clip_ctx * ctx) { From ae8322914452e47f0536064a9b17cb131f6df723 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 23:29:26 +0200 Subject: [PATCH 35/59] small fix --- tools/llava/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp index b70be80b4b043..f672ec98ce727 100644 --- a/tools/llava/clip.cpp +++ b/tools/llava/clip.cpp @@ -377,7 +377,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int n_embd = hparams.hidden_size; + const int n_embd = hparams.n_embd; const int n_head = hparams.n_head; const int d_head = n_embd / n_head; const int n_layer = hparams.n_layer; From 246a4e0a0cf7b6b9da1c6a57a589416635414fce Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 3 May 2025 23:34:22 +0200 Subject: [PATCH 36/59] no check n_ff --- tools/llava/clip.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp index cfd3433a6103d..8bd5e790f4394 100644 --- a/tools/llava/clip.cpp +++ b/tools/llava/clip.cpp @@ -1877,9 +1877,8 @@ struct clip_model_loader { layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here - if (layer.ff_up_w && layer.ff_down_w - && layer.ff_up_w->ne[0] == hparams.n_ff - && layer.ff_down_w->ne[0] == hparams.n_embd) { + // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
+ if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) { // swap up and down weights ggml_tensor * tmp = layer.ff_up_w; layer.ff_up_w = layer.ff_down_w; From 5f1fe1b3ed4dcc0c90eb2f0e2c2a9c2208c350cd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 4 May 2025 00:23:05 +0200 Subject: [PATCH 37/59] fix detokenize --- tools/llava/mtmd.cpp | 1 - tools/llava/mtmd.h | 1 - tools/server/server.cpp | 11 +++++------ tools/server/utils.hpp | 25 ++++++++++++++++++++++--- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index ab241d4185588..73abf2ad18e55 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -29,7 +29,6 @@ struct mtmd_context { bool print_timings; int n_threads; std::string image_marker; - bool calc_image_hash; // for minicpmv, we need special tokens in-between slices mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; diff --git a/tools/llava/mtmd.h b/tools/llava/mtmd.h index dea9e890f4cc0..6805e5e4816c3 100644 --- a/tools/llava/mtmd.h +++ b/tools/llava/mtmd.h @@ -87,7 +87,6 @@ MTMD_API void mtmd_free(mtmd_context * ctx); // 2. (image tokens) // 3. "\ndescribe it in detail." // number of bitmaps must be equal to the number of image markers in the prompt -// the returned value must be freed using mtmd_input_chunks_free() // this function is thread-safe (shared ctx) // return values: // 0 on success diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0c0c209b5fdaa..f03d652326878 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1478,7 +1478,7 @@ struct server_slot { {"is_processing", is_processing()}, {"non_causal", is_non_causal()}, {"params", params.to_json()}, - {"prompt", ""}, // TODO @ngxson, hacky patch, to fix before merge + {"prompt", prompt_tokens.detokenize(ctx, true)}, {"next_token", { {"has_next_token", has_next_token}, @@ -2450,7 +2450,6 @@ struct server_context { void send_final_response(server_slot & slot) { auto res = std::make_unique(); - llama_tokens text_tokens = slot.prompt_tokens.get_text_tokens(); res->id = slot.id_task; res->id_slot = slot.id; @@ -2458,7 +2457,7 @@ struct server_context { res->content = std::move(slot.generated_text); res->tokens = std::move(slot.generated_tokens); res->timings = slot.get_timings(); - res->prompt = common_detokenize(ctx, text_tokens, true); + res->prompt = slot.prompt_tokens.detokenize(ctx, true); res->response_fields = std::move(slot.params.response_fields); res->truncated = slot.truncated; @@ -2791,7 +2790,7 @@ struct server_context { std::string filename = task.slot_action.filename; std::string filepath = task.slot_action.filepath; - const llama_tokens tokens = slot->cache_tokens.get_text_tokens(); + const llama_tokens & tokens = slot->cache_tokens.get_text_tokens(); const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); const int64_t t_end = ggml_time_us(); @@ -2957,7 +2956,7 @@ struct server_context { llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { - llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); + llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { new_tokens[i - n_discard] = new_tokens[i]; } @@ -3097,7 +3096,7 @@ struct server_context { // we should never reach this GGML_ABORT("not supported by multimodal"); } - llama_tokens curr_tokens = 
slot.prompt_tokens.get_text_tokens(); + llama_tokens curr_tokens = slot.prompt_tokens.get_text_tokens(); // copy const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 1d093112de5ee..be5613b77d4f9 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1038,6 +1038,7 @@ struct server_tokens { // for debugging std::string str() const { std::ostringstream oss; + oss << "tokens: "; for (const auto & t : tokens) { if (t == LLAMA_TOKEN_NULL) { oss << " "; @@ -1045,6 +1046,11 @@ struct server_tokens { oss << t << " "; } } + oss << "\n"; + oss << "image pos: "; + for (const auto & it : map_pos_to_image) { + oss << it.first << ", "; + } return oss.str(); } @@ -1126,7 +1132,7 @@ struct server_tokens { tokens.resize(n); } - // for compatibility with speculative decoding and ctx shift + // for compatibility with speculative decoding, ctx shift, slot save/load const llama_tokens & get_text_tokens() const { return tokens; } @@ -1137,6 +1143,17 @@ struct server_tokens { tokens[pos] = id; } + std::string detokenize(const llama_context * ctx, bool special) const { + llama_tokens text_tokens; + text_tokens.reserve(tokens.size()); + for (const auto & t : tokens) { + if (t != LLAMA_TOKEN_NULL) { + text_tokens.push_back(t); + } + } + return common_detokenize(ctx, text_tokens, special); + } + size_t get_common_prefix(const server_tokens & b) const { size_t max_idx = std::min(tokens.size(), b.tokens.size()); for (size_t i = 0; i < max_idx; ++i) { @@ -1178,8 +1195,7 @@ struct server_tokens { } catch (const std::exception & e) { return false; } - } - if (t < 0 || t >= max_vocab_id) { + } else if (t < 0 || t >= max_vocab_id) { return false; } } @@ -1213,8 +1229,11 @@ struct server_tokens { chunks.emplace_back(std::move(chunk0)); chunks.emplace_back(std::move(chunk1)); } + SRV_INF("%s\n", "processing image..."); int32_t n_batch = llama_n_batch(ctx); + int64_t t0 = ggml_time_ms(); int32_t result = mtmd_helper_eval(mctx, ctx, chunks, n_past, seq_id, n_batch); + SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0); it->second.tokens_image = std::move(chunks[0].tokens_image); if (result != 0) { LOG_ERR("mtmd_helper_eval failed with status %d", result); From e9f7ff92f635cd691672724fecf364f65de41e44 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 4 May 2025 21:16:16 +0200 Subject: [PATCH 38/59] add const to various places --- tests/test-mtmd-c-api.c | 2 +- tools/llava/mtmd.cpp | 16 ++++++++-------- tools/llava/mtmd.h | 32 ++++++++++++++++---------------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c index 6d968e2afa6c4..02e762e6a2d3e 100644 --- a/tests/test-mtmd-c-api.c +++ b/tests/test-mtmd-c-api.c @@ -22,7 +22,7 @@ int main(void) { assert(n_chunks > 0); for (size_t i = 0; i < n_chunks; i++) { - mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i); + const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i); assert(chunk != NULL); enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk); printf("Chunk %zu type: %d\n", i, type); diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 5f5ad57b6a972..36f1e35cad4ff 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -218,7 +218,7 @@ static std::vector mtmd_tokenize_text_internal( int32_t mtmd_tokenize(mtmd_context * ctx, mtmd_input_chunks * output, const mtmd_input_text * text, - mtmd_bitmap ** bitmaps, + const mtmd_bitmap ** 
bitmaps, size_t n_bitmaps) { auto vocab = llama_model_get_vocab(ctx->text_model); @@ -454,7 +454,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } -size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) { +size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) { size_t n_tokens = 0; for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { auto chunk = mtmd_input_chunks_get(chunks, i); @@ -473,7 +473,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) { return n_tokens; } -llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks * chunks) { +llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) { llama_pos n_pos = 0; for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { auto chunk = mtmd_input_chunks_get(chunks, i); @@ -582,7 +582,7 @@ struct decode_embd_batch { int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, struct llama_context * lctx, - mtmd_input_chunk * chunk, + const mtmd_input_chunk * chunk, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, @@ -698,7 +698,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, struct llama_context * lctx, - mtmd_input_chunks * chunks, + const mtmd_input_chunks * chunks, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, @@ -820,11 +820,11 @@ mtmd_input_chunks * mtmd_input_chunks_init() { return new mtmd_input_chunks; } -size_t mtmd_input_chunks_size(mtmd_input_chunks * chunks) { +size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) { return chunks->entries.size(); } -mtmd_input_chunk * mtmd_input_chunks_get(mtmd_input_chunks * chunks, size_t idx) { +const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) { if (idx >= chunks->entries.size()) { return nullptr; } @@ -859,7 +859,7 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu return nullptr; } -mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { +const mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { mtmd_input_chunk * copy = new mtmd_input_chunk{ chunk->type, chunk->tokens_text, diff --git a/tools/llava/mtmd.h b/tools/llava/mtmd.h index 0b40851ac6627..a50a8864901b8 100644 --- a/tools/llava/mtmd.h +++ b/tools/llava/mtmd.h @@ -102,18 +102,18 @@ MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap); MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); // bitmap ID is optional, but useful for KV cache tracking // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() -MTMD_API const char * mtmd_bitmap_get_id (const mtmd_bitmap * bitmap); -MTMD_API void mtmd_bitmap_set_id (mtmd_bitmap * bitmap, const char * id); +MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); // mtmd_input_chunks // // this is simply a list of mtmd_input_chunk // the elements can only be populated via mtmd_tokenize() -MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); -MTMD_API size_t mtmd_input_chunks_size(mtmd_input_chunks * chunks); -MTMD_API mtmd_input_chunk * mtmd_input_chunks_get (mtmd_input_chunks * chunks, size_t idx); -MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +MTMD_API 
const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx); +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); // mtmd_input_chunk // @@ -126,8 +126,8 @@ MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd // in case you want to use custom logic to handle the chunk (i.e. KV cache management) // you can move the chunk ownership to your own code by copying it // remember to free the chunk when you are done with it -MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); -MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); +MTMD_API const mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); // mtmd_image_tokens @@ -159,7 +159,7 @@ MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * i MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, mtmd_input_chunks * output, const mtmd_input_text * text, - mtmd_bitmap ** bitmaps, + const mtmd_bitmap ** bitmaps, size_t n_bitmaps); // returns 0 on success @@ -189,10 +189,10 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname); MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache -MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks); +MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past -MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks * chunks); +MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); // helper function that automatically: // 1. run llama_decode() on text chunks @@ -202,7 +202,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks * chunks); // this function is NOT thread-safe MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, struct llama_context * lctx, - mtmd_input_chunks * chunks, + const mtmd_input_chunks * chunks, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, @@ -213,7 +213,7 @@ MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, // this function is NOT thread-safe MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, struct llama_context * lctx, - mtmd_input_chunk * chunk, + const mtmd_input_chunk * chunk, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, @@ -275,8 +275,8 @@ struct bitmaps { // example: // auto bitmaps_c_ptr = bitmaps.c_ptr(); // int32_t res = mtmd_tokenize(... 
bitmaps_c_ptr.data(), bitmaps_c_ptr.size()); - std::vector c_ptr() { - std::vector res(entries.size()); + std::vector c_ptr() { + std::vector res(entries.size()); for (size_t i = 0; i < entries.size(); i++) { res[i] = entries[i].ptr.get(); } @@ -290,7 +290,7 @@ struct input_chunks { input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} ~input_chunks() = default; size_t size() { return mtmd_input_chunks_size(ptr.get()); } - mtmd_input_chunk * operator[](size_t idx) { + const mtmd_input_chunk * operator[](size_t idx) { return mtmd_input_chunks_get(ptr.get(), idx); } }; From 049ae2421e1c8afb69cba5274907747913c83378 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 4 May 2025 21:27:07 +0200 Subject: [PATCH 39/59] add warning about breaking changes --- tools/llava/mtmd.cpp | 2 +- tools/llava/mtmd.h | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 36f1e35cad4ff..9ca4d5d33dcb2 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -859,7 +859,7 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu return nullptr; } -const mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { +mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) { mtmd_input_chunk * copy = new mtmd_input_chunk{ chunk->type, chunk->tokens_text, diff --git a/tools/llava/mtmd.h b/tools/llava/mtmd.h index a50a8864901b8..e2f76e2e8d346 100644 --- a/tools/llava/mtmd.h +++ b/tools/llava/mtmd.h @@ -15,6 +15,15 @@ #include #endif +/** + * libmtmd: A library for multimodal support in llama.cpp. + * + * WARNING: This API is experimental and subject to many BREAKING CHANGES. + * Issues related to API usage may receive lower priority support. + * + * For the usage, see an example in mtmd-cli.cpp + */ + #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -126,8 +135,8 @@ MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd // in case you want to use custom logic to handle the chunk (i.e. KV cache management) // you can move the chunk ownership to your own code by copying it // remember to free the chunk when you are done with it -MTMD_API const mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); -MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); +MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); // mtmd_image_tokens @@ -172,9 +181,10 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); ///////////////////////////////////////// // -// helper functions (can be implemented based on other functions) +// Helper functions (can be implemented based on other functions) // -// please note that these helpers are not guaranteed to be stable, there can be breaking changes in the future +// Please note that these helpers are not guaranteed to be stable. +// BREAKING CHANGES are expected. 
// // helper function to construct a mtmd_bitmap from a file @@ -192,6 +202,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * bu MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +// normally, n_pos is equal to n_tokens, but for M-RoPE it is different MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); // helper function that automatically: @@ -230,7 +241,7 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); #endif // -// C++ wrapper +// C++ wrappers // #ifdef __cplusplus @@ -252,6 +263,11 @@ struct mtmd_input_chunks_deleter { }; using input_chunks_ptr = std::unique_ptr; +struct mtmd_input_chunk_deleter { + void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); } +}; +using input_chunk_ptr = std::unique_ptr; + struct bitmap { bitmap_ptr ptr; bitmap() : ptr(nullptr) {} From d3fece5994fac243758491902406b22d371e1645 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 4 May 2025 23:34:09 +0200 Subject: [PATCH 40/59] add c api --- tools/server/server.cpp | 44 +++++++++++-------- tools/server/utils.hpp | 94 +++++++++++++++++++---------------------- 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index f03d652326878..fa35ede909924 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1974,12 +1974,11 @@ struct server_context { std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { - mtmd_context_params mparams{ - /* use_gpu */ params_base.mmproj_use_gpu, - /* timings */ false, - /* n_threads */ params_base.cpuparams.n_threads, - /* verbosity */ params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, - }; + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); if (mctx == nullptr) { SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); @@ -3214,8 +3213,10 @@ struct server_context { // check if we should process the image if (cur_tok == LLAMA_TOKEN_NULL) { // process the image - int32_t n_pos = slot.n_past; - int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, n_pos); + int32_t new_n_past; + int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past); + int32_t n_pos = new_n_past - slot.n_past; + if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); slot.release(); @@ -3224,7 +3225,8 @@ struct server_context { } if (slot.params.cache_prompt) { - slot.prompt_tokens.move_chunk(slot.cache_tokens, slot.n_past); + const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past); + slot.cache_tokens.push_back(chunk.get()); // copy } slot.n_past += n_pos; @@ -4073,21 +4075,21 @@ int main(int argc, char ** argv) { //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); // process files - std::vector bitmaps; + mtmd::bitmaps bitmaps; const bool has_mtmd = ctx_server.mctx != nullptr; { if (!has_mtmd && !files.empty()) { throw std::runtime_error("This server does not support multimodal"); } for (auto & file : files) { - mtmd_bitmap bmp; - int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp); - if (res != 0) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size())); + if (!bmp.ptr) { throw std::runtime_error("Failed to load image"); } // calculate bitmap hash (for KV caching) - bmp.id = fnv_hash(bmp.data.data(), bmp.data.size()); - bitmaps.push_back(std::move(bmp)); + std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3); + bmp.set_id(hash.c_str()); + bitmaps.entries.push_back(std::move(bmp)); } } @@ -4098,13 +4100,19 @@ int main(int argc, char ** argv) { } else if (oaicompat && has_mtmd) { // multimodal + std::string prompt_str = prompt.get(); mtmd_input_text inp_txt = { - prompt.get(), + prompt_str.c_str(), /* add_special */ true, /* parse_special */ true, }; - mtmd_input_chunks chunks; - int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps); + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + auto bitmaps_c_ptr = bitmaps.c_ptr(); + int32_t tokenized = mtmd_tokenize(ctx_server.mctx, + chunks.ptr.get(), + &inp_txt, + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); if (tokenized != 0) { throw std::runtime_error("Failed to tokenize prompt"); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index be5613b77d4f9..98ae61f39c6f7 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -998,7 +998,7 @@ struct server_tokens { private: // disallow accessing these members directly, risking out-of-sync // map a **start** position in tokens to the image chunk - std::unordered_map map_pos_to_image; + std::unordered_map map_pos_to_image; // list of tokens // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token @@ -1027,9 +1027,9 @@ struct server_tokens { llama_token operator[](size_t index) { return tokens[index]; } const llama_token& operator[](size_t index) const { return tokens[index]; } - server_tokens(mtmd_input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { - for (auto & c : mtmd_chunks) { - push_back(std::move(c)); + server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { + for (size_t i = 0; i < mtmd_chunks.size(); ++i) { + push_back(mtmd_chunks[i]); } } @@ -1054,7 +1054,7 @@ struct server_tokens { return oss.str(); } - const mtmd_input_chunk & find_chunk(llama_pos pos) const { + const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const { auto it = map_pos_to_image.find(pos); if (it != map_pos_to_image.end()) { return it->second; @@ -1070,35 +1070,31 @@ struct server_tokens { tokens.emplace_back(tok); } - void push_back(mtmd_input_chunk && chunk) { - if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + // will create a copy of the chunk if it contains non-text data + void push_back(const mtmd_input_chunk * chunk) { + auto type = mtmd_input_chunk_get_type(chunk); + if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { GGML_ASSERT(has_mtmd); - GGML_ASSERT(chunk.tokens_image != nullptr); - const int n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk); + const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens); llama_pos start_pos = tokens.size(); + printf("start_pos = %d, 
n_pos = %d\n", start_pos, n_pos); for (int i = 0; i < n_pos; ++i) { tokens.emplace_back(LLAMA_TOKEN_NULL); } - // TODO: use mtmd_input_chunk_copy when the C API is ready - map_pos_to_image[start_pos] = std::move(chunk); - } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - for (auto & tok : chunk.tokens_text) { - push_back(tok); + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_pos_to_image[start_pos] = std::move(new_chunk); + } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + push_back(text_tokens[i]); } } else { GGML_ABORT("Invalid chunk type"); } } - // TODO: use mtmd_input_chunk_copy when the C API is ready - void move_chunk(server_tokens & dst, llama_pos pos) { - auto it = map_pos_to_image.find(pos); - if (it == map_pos_to_image.end()) { - throw std::runtime_error("Chunk not found"); - } - dst.push_back(std::move(it->second)); - } - void insert(llama_tokens & tokens) { tokens.insert(tokens.end(), tokens.begin(), tokens.end()); } @@ -1116,6 +1112,7 @@ struct server_tokens { } void resize(size_t n) { + GGML_ASSERT(n <= tokens.size()); // we throw an error if we try to remove a token in the middle of an image // for ex. with input of 5 text tokens and 2 images: // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] @@ -1164,12 +1161,16 @@ struct server_tokens { GGML_ASSERT(has_mtmd); const auto & a_chunk = find_chunk(i); const auto & b_chunk = b.find_chunk(i); - std::string ai_id = mtmd_image_tokens_get_id(a_chunk.tokens_image.get()); - std::string bi_id = mtmd_image_tokens_get_id(b_chunk.tokens_image.get()); - if (ai_id == bi_id) { - size_t n_pos = mtmd_image_tokens_get_n_pos(a_chunk.tokens_image.get()); - GGML_ASSERT(n_pos > 0 && "Invalid image token"); // should never happen - i += n_pos - 1; // will be +1 by the for loop + GGML_ASSERT(a_chunk && b_chunk); + const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get()); + const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get()); + std::string ai_id = mtmd_image_tokens_get_id(a_img); + std::string bi_id = mtmd_image_tokens_get_id(b_img); + size_t a_pos = mtmd_image_tokens_get_n_pos(a_img); + size_t b_pos = mtmd_image_tokens_get_n_pos(b_img); + if (ai_id == bi_id && a_pos == b_pos) { + GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen + i += a_pos - 1; // will be +1 by the for loop continue; } else { return i; @@ -1190,7 +1191,8 @@ struct server_tokens { if (t == LLAMA_TOKEN_NULL) { try { const auto & chunk = find_chunk(i); - size_t n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get()); + size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens); i += n_pos - 1; // will be +1 by the for loop } catch (const std::exception & e) { return false; @@ -1202,7 +1204,7 @@ struct server_tokens { return true; } - // TODO: (IMPORTANT) this is hacky ; use mtmd helper when C API is ready + // encode and decode the image chunk int32_t process_chunk( llama_context * ctx, mtmd_context * mctx, @@ -1213,34 +1215,24 @@ struct server_tokens { if (it == map_pos_to_image.end()) { throw std::runtime_error("Chunk not found"); } - size_t n_pos = mtmd_image_tokens_get_n_pos(it->second.tokens_image.get()); - mtmd_input_chunks chunks; - { - mtmd_input_chunk chunk0{ - /* type */ MTMD_INPUT_CHUNK_TYPE_IMAGE, - /* tokens_text */ {}, - /* tokens_image */ 
std::move(it->second.tokens_image), // move it back later - }; - mtmd_input_chunk chunk1{ - /* type */ MTMD_INPUT_CHUNK_TYPE_TEXT, - /* tokens_text */ {}, - /* tokens_image */ nullptr, - }; - chunks.emplace_back(std::move(chunk0)); - chunks.emplace_back(std::move(chunk1)); - } SRV_INF("%s\n", "processing image..."); int32_t n_batch = llama_n_batch(ctx); int64_t t0 = ggml_time_ms(); - int32_t result = mtmd_helper_eval(mctx, ctx, chunks, n_past, seq_id, n_batch); + llama_pos new_n_past = n_past; + int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, + it->second.get(), // chunk + n_past, + seq_id, + n_batch, + true, // logits last + &new_n_past); SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0); - it->second.tokens_image = std::move(chunks[0].tokens_image); if (result != 0) { LOG_ERR("mtmd_helper_eval failed with status %d", result); - n_pos_out = 0; + n_pos_out = n_past; return result; } - n_pos_out = n_pos; + n_pos_out = new_n_past; return 0; } }; From 076e3b937c3519b7d2b493928475ad355cebd040 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 4 May 2025 23:36:21 +0200 Subject: [PATCH 41/59] helper: use mtmd_image_tokens_get_n_pos --- tools/llava/mtmd.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/llava/mtmd.cpp b/tools/llava/mtmd.cpp index 9ca4d5d33dcb2..b600e4341375f 100644 --- a/tools/llava/mtmd.cpp +++ b/tools/llava/mtmd.cpp @@ -681,8 +681,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, i_batch++; } - // for mrope, one image is one single **temporal** position - n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens; + n_past += mtmd_image_tokens_get_n_pos(image_tokens); *new_n_past = n_past; if (mtmd_decode_use_non_causal(ctx)) { From 01c623e6e8e393b4902c966709547fc4875f995f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 5 May 2025 00:08:21 +0200 Subject: [PATCH 42/59] fix ctx_shift --- tools/server/server.cpp | 8 ++++---- tools/server/tests/utils.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index fa35ede909924..743d6619d74ae 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2960,6 +2960,7 @@ struct server_context { new_tokens[i - n_discard] = new_tokens[i]; } + new_tokens.resize(slot.cache_tokens.size() - n_discard); slot.cache_tokens.clear(); slot.cache_tokens.insert(new_tokens); } @@ -3095,12 +3096,12 @@ struct server_context { // we should never reach this GGML_ABORT("not supported by multimodal"); } - llama_tokens curr_tokens = slot.prompt_tokens.get_text_tokens(); // copy const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens(); llama_tokens new_tokens( curr_tokens.begin(), curr_tokens.begin() + slot.params.n_keep); @@ -3208,10 +3209,9 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - llama_token cur_tok = slot.prompt_tokens[slot.n_past]; - // check if we should process the image - if (cur_tok == LLAMA_TOKEN_NULL) { + if (slot.n_past < slot.n_prompt_tokens + && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) { // process the image int32_t new_n_past; int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past); diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 
4dc2062a8e5b9..482df0c737082 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -26,7 +26,7 @@ import wget -DEFAULT_HTTP_TIMEOUT = 12 +DEFAULT_HTTP_TIMEOUT = 120 if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ: DEFAULT_HTTP_TIMEOUT = 30 From a0f2562e1d253bcbc133a38609f85ff231a13a59 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 5 May 2025 00:14:14 +0200 Subject: [PATCH 43/59] fix name shadowing --- tools/server/utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 98ae61f39c6f7..3a5c83dfaea4e 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1095,8 +1095,8 @@ struct server_tokens { } } - void insert(llama_tokens & tokens) { - tokens.insert(tokens.end(), tokens.begin(), tokens.end()); + void insert(llama_tokens & inp_tokens) { + tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end()); } size_t size() const { From 3304b44e81ce2665b98cd153375abe6cecf0ee0b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 May 2025 15:28:34 +0200 Subject: [PATCH 44/59] more strict condition --- tools/server/server.cpp | 30 +++++++++++++++--------------- tools/server/tests/utils.py | 2 +- tools/server/utils.hpp | 26 ++++++++++++++------------ 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 743d6619d74ae..5a9dcbeaeca69 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1988,12 +1988,12 @@ struct server_context { if (params_base.ctx_shift) { params_base.ctx_shift = false; - SRV_INF("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); + SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); } if (params_base.n_cache_reuse) { params_base.n_cache_reuse = 0; - SRV_INF("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); + SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } if (!params_base.speculative.model.path.empty()) { @@ -2417,6 +2417,15 @@ struct server_context { queue_results.send(std::move(res)); } + // if multimodal is enabled, send an error and return false + bool ensure_no_mtmd(const int id_task) { + if (mctx) { + send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + return false; + } + return true; + } + void send_partial_response(server_slot & slot, const completion_token_output & tkn) { auto res = std::make_unique(); @@ -2766,12 +2775,9 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_SAVE: { + if (!ensure_no_mtmd(task.id)) break; int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); - if (mctx) { - send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); - break; - } if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; @@ -2807,10 +2813,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { - if (mctx) { - send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); - break; - } + if (!ensure_no_mtmd(task.id)) break; int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { @@ -2857,10 +2860,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_ERASE: { - if (mctx) { - send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); - 
break; - } + if (!ensure_no_mtmd(task.id)) break; int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { @@ -3417,7 +3417,7 @@ struct server_context { } if (mctx) { - // we should never reach this + // we should never reach this, as speculative is automatically disabled if mmproj is loaded GGML_ABORT("not supported by multimodal"); } diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 482df0c737082..4dc2062a8e5b9 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -26,7 +26,7 @@ import wget -DEFAULT_HTTP_TIMEOUT = 120 +DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ: DEFAULT_HTTP_TIMEOUT = 30 diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 3a5c83dfaea4e..4e70772ec1245 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1078,7 +1078,6 @@ struct server_tokens { auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk); const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens); llama_pos start_pos = tokens.size(); - printf("start_pos = %d, n_pos = %d\n", start_pos, n_pos); for (int i = 0; i < n_pos; ++i) { tokens.emplace_back(LLAMA_TOKEN_NULL); } @@ -1095,10 +1094,24 @@ struct server_tokens { } } + // for compatibility with context shift and prompt truncation void insert(llama_tokens & inp_tokens) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end()); } + // for compatibility with speculative decoding, ctx shift, slot save/load + const llama_tokens & get_text_tokens() const { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + return tokens; + } + + // for compatibility with speculative decoding + void set_token(llama_pos pos, llama_token id) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + tokens[pos] = id; + } + size_t size() const { return tokens.size(); } @@ -1129,17 +1142,6 @@ struct server_tokens { tokens.resize(n); } - // for compatibility with speculative decoding, ctx shift, slot save/load - const llama_tokens & get_text_tokens() const { - return tokens; - } - - // for compatibility with speculative decoding - void set_token(llama_pos pos, llama_token id) { - // TODO: may need validation - tokens[pos] = id; - } - std::string detokenize(const llama_context * ctx, bool special) const { llama_tokens text_tokens; text_tokens.reserve(tokens.size()); From 88461f2c3489f416e3bc7fa35b4569b89cf72b52 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 May 2025 16:17:02 +0200 Subject: [PATCH 45/59] support remote image_url --- tools/server/server.cpp | 16 +++++++++++++-- tools/server/utils.hpp | 44 ++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 5a9dcbeaeca69..53ee1145cff14 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4327,7 +4327,13 @@ int main(int argc, char ** argv) { auto body = json::parse(req.body); std::vector files; - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get(), files); + json data = oaicompat_completion_params_parse( + body, + params.use_jinja, + params.reasoning_format, + ctx_server.chat_templates.get(), + ctx_server.mctx, + files); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -4342,7 +4348,13 @@ int main(int argc, char ** argv) { const auto 
handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) { auto body = json::parse(req.body); std::vector files; // dummy, unused - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get(), files); + json data = oaicompat_completion_params_parse( + body, + params.use_jinja, + params.reasoning_format, + ctx_server.chat_templates.get(), + ctx_server.mctx, + files); res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); }; diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 4e70772ec1245..fc9cbe3fd11c8 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -3,6 +3,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "arg.h" // common_remote_get_content #include "base64.hpp" #include "mtmd.h" @@ -584,6 +585,7 @@ static json oaicompat_completion_params_parse( bool use_jinja, common_reasoning_format reasoning_format, const struct common_chat_templates * tmpls, + bool allow_non_text, std::vector & out_files) { json llama_params; @@ -654,21 +656,41 @@ static json oaicompat_completion_params_parse( std::string type = json_value(p, "type", std::string()); json image_url = json_value(p, "image_url", json::object()); if (type == "image_url") { + if (!allow_non_text) { + throw std::runtime_error("image input is not supported by this server"); + } + std::string url = json_value(image_url, "url", std::string()); - std::vector parts = string_split(url, /*separator*/ ','); - if (parts.size() != 2) { - throw std::runtime_error("Invalid image_url.url value"); - } else if (!string_starts_with(parts[0], "data:image/")) { - throw std::runtime_error("Invalid image_url.url format: " + parts[0]); - } else if (!string_ends_with(parts[0], "base64")) { - throw std::runtime_error("image_url.url must be base64 encoded"); + if (string_starts_with(url, "http")) { + // download remote image + // TODO @ngxson : maybe make these params configurable + common_remote_params params; + params.headers.push_back("User-Agent: llama.cpp/" + build_info); + params.max_size = 1024 * 1024 * 10; // 10MB + auto res = common_remote_get_content(url, params); + raw_buffer data; + data.insert(data.end(), res.second.begin(), res.second.end()); + out_files.push_back(data); + } else { - auto base64_data = parts[1]; - auto decoded_data = base64_decode(base64_data); - out_files.push_back(decoded_data); + // try to decode base64 image + std::vector parts = string_split(url, /*separator*/ ','); + if (parts.size() != 2) { + throw std::runtime_error("Invalid image_url.url value"); + } else if (!string_starts_with(parts[0], "data:image/")) { + throw std::runtime_error("Invalid image_url.url format: " + parts[0]); + } else if (!string_ends_with(parts[0], "base64")) { + throw std::runtime_error("image_url.url must be base64 encoded"); + } else { + auto base64_data = parts[1]; + auto decoded_data = base64_decode(base64_data); + out_files.push_back(decoded_data); + } } + + // replace this chunk with a marker p["type"] = "text"; - p["text"] = "<__image__>"; + p["text"] = MTMD_DEFAULT_IMAGE_MARKER; p.erase("image_url"); } } From a9b21f423ab2c80c78ad2019d426acf965f150fa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 May 2025 22:56:16 +0200 Subject: [PATCH 46/59] remote image_url log --- tools/server/utils.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index fc9cbe3fd11c8..e2bae4b728afc 100644 
--- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -667,10 +667,17 @@ static json oaicompat_completion_params_parse( common_remote_params params; params.headers.push_back("User-Agent: llama.cpp/" + build_info); params.max_size = 1024 * 1024 * 10; // 10MB + params.timeout = 10; // seconds + SRV_INF("downloading image from '%s'\n", url.c_str()); auto res = common_remote_get_content(url, params); - raw_buffer data; - data.insert(data.end(), res.second.begin(), res.second.end()); - out_files.push_back(data); + if (200 <= res.first && res.first < 300) { + SRV_INF("downloaded %ld bytes\n", res.second.size()); + raw_buffer data; + data.insert(data.end(), res.second.begin(), res.second.end()); + out_files.push_back(data); + } else { + throw std::runtime_error("Failed to download image"); + } } else { // try to decode base64 image From 2f30530dc118335f153ed79aa3bcb6453d894557 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 May 2025 23:37:56 +0200 Subject: [PATCH 47/59] add CI test --- tools/server/tests/unit/test_vision_api.py | 56 ++++++++++++++++++++++ tools/server/tests/utils.py | 18 +++++++ 2 files changed, 74 insertions(+) create mode 100644 tools/server/tests/unit/test_vision_api.py diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py new file mode 100644 index 0000000000000..921ed9ebf2ff5 --- /dev/null +++ b/tools/server/tests/unit/test_vision_api.py @@ -0,0 +1,56 @@ +import pytest +from utils import * +import base64 +import requests + +server: ServerProcess + +IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png" +IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png" + +response = requests.get(IMG_URL_0) +response.raise_for_status() # Raise an exception for bad status codes +IMG_BASE64_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8") + + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinygemma3() + + +@pytest.mark.parametrize( + "image_url, success, re_content", + [ + # test model is trained on CIFAR-10, but it's quite dumb due to small size + (IMG_URL_0, True, "(cat)+"), + (IMG_BASE64_0, True, "(cat)+"), + (IMG_URL_1, True, "(frog)+"), + ("malformed", False, None), + ("https://google.com/404", False, None), # non-existent image + ("https://ggml.ai", False, None), # non-image data + ] +) +def test_vision_chat_completion(image_url, success, re_content): + global server + server.start(timeout_seconds=60) # vision model may take longer to load due to download size + res = server.make_request("POST", "/chat/completions", data={ + "temperature": 0.0, + "top_k": 1, + "messages": [ + {"role": "user", "content": [ + {"type": "text", "text": "What is this:\n"}, + {"type": "image_url", "image_url": { + "url": image_url, + }}, + ]}, + ], + }) + if success: + assert res.status_code == 200 + choice = res.body["choices"][0] + assert "assistant" == choice["message"]["role"] + assert match_regex(re_content, choice["message"]["content"]) + else: + assert res.status_code != 200 + diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 4dc2062a8e5b9..27a0f0356aae1 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -88,6 +88,7 @@ class ServerProcess: chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None + mmproj_url: str | None = None # session variables process: subprocess.Popen | None = None 
@@ -194,6 +195,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--chat-template", self.chat_template]) if self.chat_template_file: server_args.extend(["--chat-template-file", self.chat_template_file]) + if self.mmproj_url: + server_args.extend(["--mmproj-url", self.mmproj_url]) args = [str(arg) for arg in [server_path, *server_args]] print(f"tests: starting server with: {' '.join(args)}") @@ -379,6 +382,21 @@ def jina_reranker_tiny() -> ServerProcess: server.server_reranking = True return server + @staticmethod + def tinygemma3() -> ServerProcess: + server = ServerProcess() + # mmproj is already provided by HF registry API + server.model_hf_repo = "ggml-org/tinygemma3-GGUF" + server.model_hf_file = "tinygemma3-Q8_0.gguf" + server.mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf" + server.model_alias = "tinygemma3" + server.n_ctx = 1024 + server.n_batch = 32 + server.n_slots = 2 + server.n_predict = 4 + server.seed = 42 + return server + def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]: """ From 5ffde3862fd8e725d72f15068457c2f27976e81e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 May 2025 23:54:43 +0200 Subject: [PATCH 48/59] do not log base64 --- tools/server/tests/unit/test_vision_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index 921ed9ebf2ff5..b228e46b475a0 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -24,7 +24,7 @@ def create_server(): [ # test model is trained on CIFAR-10, but it's quite dumb due to small size (IMG_URL_0, True, "(cat)+"), - (IMG_BASE64_0, True, "(cat)+"), + ("IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log (IMG_URL_1, True, "(frog)+"), ("malformed", False, None), ("https://google.com/404", False, None), # non-existent image @@ -34,6 +34,8 @@ def create_server(): def test_vision_chat_completion(image_url, success, re_content): global server server.start(timeout_seconds=60) # vision model may take longer to load due to download size + if image_url == "IMG_BASE64_0": + image_url = IMG_BASE64_0 res = server.make_request("POST", "/chat/completions", data={ "temperature": 0.0, "top_k": 1, From aaebc3367d3a93a339673c7683f87444d97e21cc Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 8 May 2025 12:52:02 +0200 Subject: [PATCH 49/59] add "has_multimodal" to /props --- tools/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 23e4235d570cb..b4a7c077dce48 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4009,6 +4009,7 @@ int main(int argc, char ** argv) { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, { "model_path", ctx_server.params_base.model.path }, + { "has_multimodal", ctx_server.mctx != nullptr }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, From eeda075ede16418f29c94ea395cead8dcfd025ae Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 8 
May 2025 13:05:53 +0200 Subject: [PATCH 50/59] remove dangling image --- tools/server/tests/unit/test_vision_api.py | 19 +++++++------ tools/server/utils.hpp | 33 ++++++++++++++-------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index b228e46b475a0..7cc4096f19e0c 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -20,18 +20,19 @@ def create_server(): @pytest.mark.parametrize( - "image_url, success, re_content", + "prompt, image_url, success, re_content", [ # test model is trained on CIFAR-10, but it's quite dumb due to small size - (IMG_URL_0, True, "(cat)+"), - ("IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log - (IMG_URL_1, True, "(frog)+"), - ("malformed", False, None), - ("https://google.com/404", False, None), # non-existent image - ("https://ggml.ai", False, None), # non-image data + ("What is this:\n", IMG_URL_0, True, "(cat)+"), + ("What is this:\n", "IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log + ("What is this:\n", IMG_URL_1, True, "(frog)+"), + ("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache + ("What is this:\n", "malformed", False, None), + ("What is this:\n", "https://google.com/404", False, None), # non-existent image + ("What is this:\n", "https://ggml.ai", False, None), # non-image data ] ) -def test_vision_chat_completion(image_url, success, re_content): +def test_vision_chat_completion(prompt, image_url, success, re_content): global server server.start(timeout_seconds=60) # vision model may take longer to load due to download size if image_url == "IMG_BASE64_0": @@ -41,7 +42,7 @@ def test_vision_chat_completion(image_url, success, re_content): "top_k": 1, "messages": [ {"role": "user", "content": [ - {"type": "text", "text": "What is this:\n"}, + {"type": "text", "text": prompt}, {"type": "image_url", "image_url": { "url": image_url, }}, diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index e2bae4b728afc..fad3b0d38b6a5 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1155,17 +1155,28 @@ struct server_tokens { void resize(size_t n) { GGML_ASSERT(n <= tokens.size()); - // we throw an error if we try to remove a token in the middle of an image - // for ex. with input of 5 text tokens and 2 images: - // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] - // n 1 2 3 4 5 6 7 8 9 10 - // allowed to resize ^ ^ - // disallowed to resize ^ ^ ^ - if (n > 0) { - llama_token last_token = tokens[n - 1]; - // make sure we never remove tokens in the middle of an image - if (last_token == LLAMA_TOKEN_NULL) { - find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk + if (has_mtmd) { + // we throw an error if we try to remove a token in the middle of an image + // for ex. 
with input of 5 text tokens and 2 images: + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] + // n 1 2 3 4 5 6 7 8 9 10 + // allowed to resize ^ ^ + // disallowed to resize ^ ^ ^ + if (n > 0) { + llama_token last_token = tokens[n - 1]; + // make sure we never remove tokens in the middle of an image + if (last_token == LLAMA_TOKEN_NULL) { + find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk + } + } + // remove all image chunks that are not used anymore + for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) { + llama_pos pos = it->first; + if (pos >= (llama_pos)n) { + it = map_pos_to_image.erase(it); + } else { + ++it; + } } } tokens.resize(n); From bef122e1a5327fc1d3ede6488cd76bd5176605a1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 8 May 2025 13:22:06 +0200 Subject: [PATCH 51/59] speculative: use slot.cache_tokens.insert --- tools/server/server.cpp | 4 +--- tools/server/utils.hpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b4a7c077dce48..1c783b310c91a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3484,9 +3484,7 @@ struct server_context { slot.n_draft_accepted += ids.size() - 1; slot.cache_tokens.push_back(id); - for (auto & t : ids) { - slot.cache_tokens.push_back(t); - } + slot.cache_tokens.insert(ids); llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index fad3b0d38b6a5..53f72aea15a55 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1124,7 +1124,7 @@ struct server_tokens { } // for compatibility with context shift and prompt truncation - void insert(llama_tokens & inp_tokens) { + void insert(const llama_tokens & inp_tokens) { GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end()); } From 51afc0a23f99c9f2874e090f54c800b5d745ead2 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 9 May 2025 14:26:11 +0200 Subject: [PATCH 52/59] Apply suggestions from code review Co-authored-by: Georgi Gerganov --- tools/server/server.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9c81cb3772ce5..6a1adacafad11 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2777,7 +2777,10 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_SAVE: { - if (!ensure_no_mtmd(task.id)) break; + if (!ensure_no_mtmd(task.id)) { + break; + } + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { @@ -3269,7 +3272,7 @@ struct server_context { // Process all prompt tokens through sampler system for (size_t i = 0; i < slot.cache_tokens.size(); ++i) { - llama_token id = slot.prompt_tokens[i]; + llama_token id = slot.cache_tokens[i]; if (id != LLAMA_TOKEN_NULL) { common_sampler_accept(slot.smpl, id, false); } @@ -3491,7 +3494,7 @@ struct server_context { slot.n_draft_accepted += ids.size() - 1; slot.cache_tokens.push_back(id); - slot.cache_tokens.insert(ids); + slot.cache_tokens.insert({ids.begin(), ids.end() - 1}); llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); @@ -4105,8 +4108,9 @@ int main(int argc, char ** argv) { std::vector inputs; if (oaicompat && !prompt.is_string()) { throw std::runtime_error("prompt must be a string"); + } - } else if (oaicompat && has_mtmd) { + if (oaicompat && has_mtmd) { // multimodal 
std::string prompt_str = prompt.get(); mtmd_input_text inp_txt = { @@ -4124,9 +4128,9 @@ int main(int argc, char ** argv) { if (tokenized != 0) { throw std::runtime_error("Failed to tokenize prompt"); } + server_tokens tmp(chunks, true); inputs.push_back(std::move(tmp)); - } else { // non-multimodal version auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); From f10fc5613d4f9513bef2caea11fc7a722de47b31 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 14:26:36 +0200 Subject: [PATCH 53/59] rm can_be_detokenized --- tools/server/server.cpp | 10 +--------- tools/server/utils.hpp | 8 ++++++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6a1adacafad11..8ac12b41f83ca 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2135,13 +2135,6 @@ struct server_context { return ret; } - bool can_be_detokenized(const struct llama_context * ctx, const server_tokens & inp) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_tokens(vocab); - return inp.validate(n_vocab); - } - bool launch_slot_with_task(server_slot & slot, server_task && task) { slot.reset(); slot.id_task = task.id; @@ -2156,8 +2149,7 @@ struct server_context { slot.lora = slot.params.lora; } - bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); - if (!can_detokenize) { + if (!slot.prompt_tokens.validate(ctx)) { send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); return false; } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 53f72aea15a55..23163f4fe939e 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1227,7 +1227,11 @@ struct server_tokens { } // make sure all text tokens are within the vocab range - bool validate(llama_token max_vocab_id) const { + bool validate(const struct llama_context * ctx) const { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + for (size_t i = 0; i < tokens.size(); ++i) { auto & t = tokens[i]; if (t == LLAMA_TOKEN_NULL) { @@ -1239,7 +1243,7 @@ struct server_tokens { } catch (const std::exception & e) { return false; } - } else if (t < 0 || t >= max_vocab_id) { + } else if (t < 0 || t >= n_vocab) { return false; } } From 689035cc087cec46c9da3df743298584c2da7b9a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 14:27:40 +0200 Subject: [PATCH 54/59] on prmpt processing done, assert cache_tokens.size --- tools/server/server.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 8ac12b41f83ca..2616dd33a0721 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3256,6 +3256,8 @@ struct server_context { // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { + GGML_ASSERT(slot.cache_tokens.size() == slot.prompt_tokens.size()); + slot.state = SLOT_STATE_DONE_PROMPT; GGML_ASSERT(batch.n_tokens > 0); From b2906a98278e7ca532718800b4ac64553a78b16b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 14:33:45 +0200 Subject: [PATCH 55/59] handle_completions_impl returns void --- tools/server/server.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 
2616dd33a0721..6aa8765106438 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4062,7 +4062,7 @@ int main(int argc, char ** argv) { const std::vector & files, const std::function & is_connection_closed, httplib::Response & res, - oaicompat_type oaicompat) { + oaicompat_type oaicompat) -> void { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); if (ctx_server.params_base.embedding) { @@ -4224,7 +4224,7 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); std::vector files; // dummy - return handle_completions_impl( + handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, files, @@ -4236,7 +4236,7 @@ int main(int argc, char ** argv) { const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = oaicompat_completion_params_parse(json::parse(req.body)); std::vector files; // dummy - return handle_completions_impl( + handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, files, @@ -4315,7 +4315,7 @@ int main(int argc, char ** argv) { ); std::vector files; // dummy - return handle_completions_impl( + handle_completions_impl( SERVER_TASK_TYPE_INFILL, data, files, @@ -4341,7 +4341,7 @@ int main(int argc, char ** argv) { ctx_server.mctx, files); - return handle_completions_impl( + handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, files, From f5fbc03e0c8802c024376bc7e1cb9b6391add314 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 14:43:19 +0200 Subject: [PATCH 56/59] adapt the new web ui --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6aa8765106438..fd4613b5f99c8 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4011,7 +4011,7 @@ int main(int argc, char ** argv) { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, { "model_path", ctx_server.params_base.model.path }, - { "has_multimodal", ctx_server.mctx != nullptr }, + { "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, From 5fe8d72c5225d3a0b9bcb8560b010f45e618db6f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 15:04:09 +0200 Subject: [PATCH 57/59] update docs and hot topics --- README.md | 3 +- docs/multimodal.md | 69 ++++++++++++++++++++++++++++++++++++++++++ tools/mtmd/README.md | 33 +------------------- tools/server/README.md | 12 ++++++++ 4 files changed, 84 insertions(+), 33 deletions(-) create mode 100644 docs/multimodal.md diff --git a/README.md b/README.md index e0232478c75a2..0401723ffcf87 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics +- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) - **GGML developer experience survey (organized and reviewed by NVIDIA):** 
[link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
diff --git a/docs/multimodal.md b/docs/multimodal.md
new file mode 100644
index 0000000000000..efed473a3cd07
--- /dev/null
+++ b/docs/multimodal.md
@@ -0,0 +1,69 @@
+# Multimodal
+
+llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools that support this feature:
+- [llama-mtmd-cli](../tools/mtmd/README.md)
+- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
+
+To enable it, use one of the 2 methods below:
+
+- Use the `-hf` option with a [supported model](#pre-quantized-models)
+  - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
+  - To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
+- Use the `-m model.gguf` option with `--mmproj file.gguf` to specify the text model and the multimodal projector respectively
+
+By default, the multimodal projector will be offloaded to the GPU. To disable this, add `--no-mmproj-offload`
+
+For example:
+
+```sh
+# simple usage with CLI
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+
+# simple usage with server
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF
+
+# using local file
+llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
+
+# no GPU offload
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
+```
+
+## Pre-quantized models
+
+These are ready-to-use models; most of them come with `Q4_K_M` quantization by default.
+
+Replace `(tool_name)` with the name of the binary you want to use. 
For example, `llama-mtmd-cli` or `llama-server`
+
+NOTE: some models may require a large context window, for example: `-c 8192`
+
+```sh
+# Gemma 3
+(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
+
+# SmolVLM
+(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
+
+# Pixtral 12B
+(tool_name) -hf ggml-org/pixtral-12b-GGUF
+
+# Qwen 2 VL
+(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
+
+# Qwen 2.5 VL
+(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
+
+# Mistral Small 3.1 24B (IQ2_M quantization)
+(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
+```
diff --git a/tools/mtmd/README.md b/tools/mtmd/README.md
index 20e7696cefd8e..06e1fd097423a 100644
--- a/tools/mtmd/README.md
+++ b/tools/mtmd/README.md
@@ -16,38 +16,7 @@ The naming and structure related to multimodal support have evolved, which might
 
 ## Pre-quantized models
 
-These are ready-to-use models, most of them come with `Q4_K_M` quantization by default:
-
-```sh
-# Gemma 3
-llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
-llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
-llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
-
-# SmolVLM
-llama-mtmd-cli -hf ggml-org/SmolVLM-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/SmolVLM-256M-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/SmolVLM-500M-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
-
-# Pixtral 12B
-llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
-
-# Qwen 2 VL
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
-
-# Qwen 2.5 VL
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
-llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
-
-# Mistral Small 3.1 24B (IQ2_M quantization)
-llama-mtmd-cli -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
-```
+See the list of pre-quantized models [here](../../docs/multimodal.md)
 
 ## How it works and what is `mmproj`?
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 0ec786ea76f7a..972ca384e69a9 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -193,6 +193,12 @@ services:
       LLAMA_ARG_PORT: 8080
 ```
 
+### Multimodal support
+
+Multimodal support was added in [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) and is currently an experimental feature.
+
+For more details, please refer to the [multimodal documentation](../../docs/multimodal.md)
+
 ## Build
 
 `llama-server` is built alongside everything else from the root of the project
@@ -749,6 +755,9 @@ This endpoint is public (no API key check). By default, it is read-only. 
To make "total_slots": 1, "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", "chat_template": "...", + "modalities": { + "vision": false + }, "build_info": "b(build number)-(build commit hash)" } ``` @@ -757,6 +766,7 @@ This endpoint is public (no API key check). By default, it is read-only. To make - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `model_path` - the path to model file (same with `-m` argument) - `chat_template` - the model's original Jinja2 prompt template +- `modalities` - the list of supported modalities ### POST `/props`: Change server global properties. @@ -1069,6 +1079,8 @@ print(completion.choices[0].text) Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more. + *Options:* See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported. From b8000fd5c4fa6092059cb5bbcd50cdece205a585 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 15:25:25 +0200 Subject: [PATCH 58/59] rm assert --- tools/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index fd4613b5f99c8..df12b0dab8588 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3256,7 +3256,8 @@ struct server_context { // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { - GGML_ASSERT(slot.cache_tokens.size() == slot.prompt_tokens.size()); + // TODO @ngxson : this assertion fails sometimes, why? + // GGML_ASSERT(slot.cache_tokens.size() == slot.prompt_tokens.size()); slot.state = SLOT_STATE_DONE_PROMPT; From 9ed430ca267176f73c407302be36aa431a82e352 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 May 2025 15:34:15 +0200 Subject: [PATCH 59/59] small fix (2) --- tools/server/server.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index df12b0dab8588..de8ded71fd6ad 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3256,18 +3256,16 @@ struct server_context { // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { - // TODO @ngxson : this assertion fails sometimes, why? 
- // GGML_ASSERT(slot.cache_tokens.size() == slot.prompt_tokens.size()); - slot.state = SLOT_STATE_DONE_PROMPT; GGML_ASSERT(batch.n_tokens > 0); + GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size()); common_sampler_reset(slot.smpl); // Process all prompt tokens through sampler system - for (size_t i = 0; i < slot.cache_tokens.size(); ++i) { - llama_token id = slot.cache_tokens[i]; + for (int i = 0; i < slot.n_prompt_tokens; ++i) { + llama_token id = slot.prompt_tokens[i]; if (id != LLAMA_TOKEN_NULL) { common_sampler_accept(slot.smpl, id, false); }