From e8291736842bf4e524c4800400baa941384be6da Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 09:10:12 +0200
Subject: [PATCH 1/7] Prefilling assistant message in openai compatible API

---
 examples/server/utils.hpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index aba2f27f9b564..bce1acecfb519 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -642,9 +642,26 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
+    /* Prefill assistant message support */
+    bool prefill_assistant_message = inputs.messages.size() > 0 && inputs.messages[inputs.messages.size()-1].role == "assistant";
+    common_chat_msg last_message;
+    if (prefill_assistant_message)
+    {
+        last_message = inputs.messages.back();
+        inputs.messages.pop_back();
+        inputs.extract_reasoning = false;
+        inputs.add_generation_prompt = true;
+    }
+
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
+    /* Append assistant prefilled message */
+    if (prefill_assistant_message)
+    {
+        chat_params.prompt += last_message.content;
+    }
+
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
     if (!chat_params.grammar.empty()) {

From 9d96e5caf751c81a0ad3637f9633ae722106baba Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 09:28:40 +0200
Subject: [PATCH 2/7] fixed indentation

---
 examples/server/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index bce1acecfb519..16a94c696dd0d 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -647,7 +647,7 @@ static json oaicompat_completion_params_parse(
     common_chat_msg last_message;
     if (prefill_assistant_message)
     {
-        last_message = inputs.messages.back();
+        last_message = inputs.messages.back();
         inputs.messages.pop_back();
         inputs.extract_reasoning = false;
         inputs.add_generation_prompt = true;

From 496f08e56ab8b919268d3de7e6037291b7c0b81a Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 09:46:37 +0200
Subject: [PATCH 3/7] fixed code convention

---
 examples/server/utils.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 16a94c696dd0d..11fad6fe21cfa 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -645,8 +645,7 @@ static json oaicompat_completion_params_parse(
     /* Prefill assistant message support */
     bool prefill_assistant_message = inputs.messages.size() > 0 && inputs.messages[inputs.messages.size()-1].role == "assistant";
     common_chat_msg last_message;
-    if (prefill_assistant_message)
-    {
+    if (prefill_assistant_message) {
         last_message = inputs.messages.back();
         inputs.messages.pop_back();
         inputs.extract_reasoning = false;
@@ -657,8 +656,7 @@ static json oaicompat_completion_params_parse(
     auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
     /* Append assistant prefilled message */
-    if (prefill_assistant_message)
-    {
+    if (prefill_assistant_message) {
         chat_params.prompt += last_message.content;
     }
 

From 79eb82576cdc6c072c68ad51917ef872a8a33439 Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 09:48:37 +0200
Subject: [PATCH 4/7] simplify method usage

---
 examples/server/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 11fad6fe21cfa..aed8d9e74813d 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -643,7 +643,7 @@ static json oaicompat_completion_params_parse(
     }
 
     /* Prefill assistant message support */
-    bool prefill_assistant_message = inputs.messages.size() > 0 && inputs.messages[inputs.messages.size()-1].role == "assistant";
+    bool prefill_assistant_message = inputs.messages.size() > 0 && inputs.messages.back().role == "assistant";
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();

From 0c316cdead517cacb9c5c5aef56c53f04983d119 Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 10:31:15 +0200
Subject: [PATCH 5/7] no more than one assistant message at end of messages

---
 examples/server/utils.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index aed8d9e74813d..257bb8db2f646 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -642,8 +642,14 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
+    /* sanity check, max one assistant message at the end of the list */
+    bool last_message_is_assistant = inputs.messages.size() > 0 && inputs.messages.back().role == "assistant";
+    if (last_message_is_assistant && inputs.messages.size() >= 2 && inputs.messages[inputs.messages.size()-2].role == "assistant") {
+        throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+    }
+
     /* Prefill assistant message support */
-    bool prefill_assistant_message = inputs.messages.size() > 0 && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = last_message_is_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();

From cb7fe049d944b0a42d0a6458f68e83516bdc8246 Mon Sep 17 00:00:00 2001
From: matteo
Date: Tue, 29 Apr 2025 14:41:08 +0200
Subject: [PATCH 6/7] merge checks into prefill code

---
 examples/server/utils.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 257bb8db2f646..1b9e7ca2cdb24 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -642,18 +642,18 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
-    /* sanity check, max one assistant message at the end of the list */
-    bool last_message_is_assistant = inputs.messages.size() > 0 && inputs.messages.back().role == "assistant";
-    if (last_message_is_assistant && inputs.messages.size() >= 2 && inputs.messages[inputs.messages.size()-2].role == "assistant") {
-        throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
-    }
-
     /* Prefill assistant message support */
-    bool prefill_assistant_message = last_message_is_assistant;
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
         inputs.messages.pop_back();
+
+        /* sanity check, max one assistant message at the end of the list */
+        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+        }
+
         inputs.extract_reasoning = false;
         inputs.add_generation_prompt = true;
     }

From 836015d451b51f5a9a125970f3d2d1c2be712d94 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Tue, 29 Apr 2025 16:37:21 +0200
Subject: [PATCH 7/7] Update examples/server/utils.hpp

---
 examples/server/utils.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 1b9e7ca2cdb24..b497959fd8689 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -642,7 +642,8 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
-    /* Prefill assistant message support */
+    // if the assistant message appears at the end of list, we do not add end-of-turn token
+    // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
    common_chat_msg last_message;
     if (prefill_assistant_message) {
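
For reference, a minimal client-side sketch of how the prefill path is exercised, assuming a llama-server instance on the default http://localhost:8080, the OpenAI-compatible /v1/chat/completions endpoint, and libcurl available on the client; the message contents are illustrative only. Because the last message has role "assistant", the server appends its content to the prompt and the model continues that text instead of starting a fresh assistant turn.

// prefill_client.cpp -- illustrative sketch, not part of the patches.
// Build with: g++ prefill_client.cpp -lcurl
#include <curl/curl.h>
#include <string>

int main() {
    // The trailing assistant message is the prefill; the completion should
    // continue from "Sure, here they are: 2," rather than restart the answer.
    const std::string body = R"({
        "messages": [
            {"role": "user",      "content": "List three prime numbers."},
            {"role": "assistant", "content": "Sure, here they are: 2,"}
        ]
    })";

    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/v1/chat/completions");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());

    // With no write callback set, libcurl prints the JSON response to stdout.
    CURLcode res = curl_easy_perform(curl);

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}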