diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index cf42710869191..1e54851ea07a0 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -92,20 +92,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { int N = (int) tokens.size(); - std::vector pos; for (int i = 0; i < N; i += n_batch) { int n_eval = (int) tokens.size() - i; if (n_eval > n_batch) { n_eval = n_batch; } auto batch = llama_batch_get_one(&tokens[i], n_eval); - // TODO: add mrope pos ids somewhere else - pos.resize(batch.n_tokens * 4); - std::fill(pos.begin(), pos.end(), 0); - for (int j = 0; j < batch.n_tokens * 3; j ++) { - pos[j] = *st_pos_id + (j % batch.n_tokens); - } - batch.pos = pos.data(); if (llama_decode(ctx_llama, batch)) { LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b52e3f6203a4b..e6595fb18bc5b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd > 1) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the other dimensions are all 0, they are unused for text tokens + std::vector pos_data(n_tokens*n_pos_per_embd, 0); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[i] = ubatch->pos[i]; + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/src/llama-graph.h b/src/llama-graph.h index d192dc1495787..d0c8d32192784 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -90,29 +90,27 @@ class llm_graph_input_embd : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -419,7 +417,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const;