diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index aba2f27f9b564..b497959fd8689 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse( throw std::runtime_error("Cannot use custom grammar constraints with tools."); } + // if the assistant message appears at the end of list, we do not add end-of-turn token + // for ex. this can be useful to modify the reasoning process in reasoning models + bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"; + common_chat_msg last_message; + if (prefill_assistant_message) { + last_message = inputs.messages.back(); + inputs.messages.pop_back(); + + /* sanity check, max one assistant message at the end of the list */ + if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){ + throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list."); + } + + inputs.extract_reasoning = false; + inputs.add_generation_prompt = true; + } + // Apply chat template to the list of messages auto chat_params = common_chat_templates_apply(tmpls, inputs); + /* Append assistant prefilled message */ + if (prefill_assistant_message) { + chat_params.prompt += last_message.content; + } + llama_params["chat_format"] = static_cast(chat_params.format); llama_params["prompt"] = chat_params.prompt; if (!chat_params.grammar.empty()) {