From 6b90566052611e1134c2debe987869666e59f427 Mon Sep 17 00:00:00 2001
From: Theia Vogel
Date: Sat, 9 Mar 2024 20:22:37 -0800
Subject: [PATCH 1/3] control vector api and implementation

---
 common/common.cpp | 217 ++++++++++++++++++++++++++++++++++++++++++++++
 common/common.h   |  12 +++
 llama.cpp         | 121 ++++++++++++++++++++++++++
 llama.h           |  14 +++
 4 files changed, 364 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index 2f38ac632b45a..6a4ec30dd166c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -562,6 +562,35 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back(std::make_tuple(argv[i], 1.0f));
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * control_vector = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back(std::make_tuple(control_vector, std::stof(argv[i])));
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int32_t start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int32_t end = std::stoi(argv[i]);
+            params.control_vector_layer_range = std::make_tuple(start, end);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1087,6 +1116,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --control-vector FNAME\n");
+    printf("                        add a control vector\n");
+    printf("  --control-vector-scaled FNAME S\n");
+    printf("                        add a control vector with user defined scaling S\n");
+    printf("  --control-vector-layer-range START END\n");
+    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
@@ -1351,6 +1386,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (!params.control_vectors.empty()) {
+        int32_t layer_start, layer_end;
+        std::tie(layer_start, layer_end) = params.control_vector_layer_range;
+
+        if (layer_start == 0) layer_start = 1;
+        if (layer_end == 0)   layer_end = 31;
+
+        std::vector<float> control_vector;
+        int n_embd;
+        std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors);
+        if (n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             control_vector.data(),
+                                             control_vector.size(),
+                                             n_embd,
+                                             layer_start,
+                                             layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1867,3 +1931,156 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }
 
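[Annotation: the three new flags populate the two gpt_params fields directly, so the same setup also works without argv. A minimal sketch against patch 1's tuple-based fields — the GGUF file names and the layer range here are illustrative, not from the patch:

    gpt_params params;
    params.control_vectors.push_back(std::make_tuple("happy.gguf", 0.9f)); // like --control-vector-scaled happy.gguf 0.9
    params.control_vectors.push_back(std::make_tuple("calm.gguf", 1.0f));  // like --control-vector calm.gguf
    params.control_vector_layer_range = std::make_tuple(10, 20);           // like --control-vector-layer-range 10 20

The equivalent command line would be along the lines of `./main -m model.gguf --control-vector-scaled happy.gguf 0.9 --control-vector calm.gguf --control-vector-layer-range 10 20`.]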
+//
+// Control vector utils
+//
+
+static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const std::string & path, float strength) {
+    int n_tensors;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+    int n_embd = -1;
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+            ggml_free(meta_ctx);
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return std::make_tuple(std::vector<float>(), -1);
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return std::make_tuple(std::vector<float>(), -1);
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return std::make_tuple(std::vector<float>(), -1);
+            }
+            if (n_embd == -1) {
+                n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return std::make_tuple(std::vector<float>(), -1);
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str());
+        return std::make_tuple(std::vector<float>(), -1);
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+        ggml_free(ctx);
+        return std::make_tuple(std::vector<float>(), -1);
+    }
+
+    std::vector<float> vector;
+    for (uint32_t i = 1; i < max_direction_layer; i++) {
+        std::string name = "direction." + std::to_string(i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            const float * data = (const float *) tensor->data;
+            for (int i = 0; i < n_embd; i++) {
+                vector.push_back(data[i] * strength);
+            }
+        } else {
+            vector.insert(vector.end(), n_embd, 0.); // as a filler
+        }
+    }
+
+    return std::make_tuple(vector, n_embd);
+}
+
+std::tuple<std::vector<float>, int> llama_control_vector_load(const std::vector<std::tuple<std::string, float>> & vectors) {
+    std::vector<float> vector;
+    int n_embd = -1;
+
+    for (const auto& pair : vectors) {
+        std::string path;
+        float strength;
+        std::tie(path, strength) = pair;
+
+        std::vector<float> v;
+        int v_n_embd;
+        std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength);
+
+        if (v_n_embd == -1) {
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+        if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str());
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+
+        if (n_embd == -1) {
+            vector = std::move(v);
+            n_embd = v_n_embd;
+        } else {
+            for (size_t i = 0; i < vector.size(); i++) {
+                vector[i] += v[i];
+            }
+        }
+    }
+
+    if (n_embd == -1) {
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
+    }
+    return std::make_tuple(vector, n_embd);
+}
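[Annotation: the loader above expects a GGUF file containing one 1-D F32 tensor per steered layer, named direction.1 … direction.N (layer 0 never has one). For reference, a hedged sketch of producing such a file with the ggml/gguf API — the dimensions, fill values, and output name are assumptions, and the exact writer calls may differ between ggml versions:

    // sketch: emit zeroed directions for layers 1..n_layer-1 of a hypothetical 4096-dim model
    const int n_embd = 4096, n_layer = 32;
    struct ggml_init_params ip = {
        /* .mem_size   = */ (size_t) n_layer * (ggml_tensor_overhead() + n_embd * sizeof(float)),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(ip);
    struct gguf_context * gguf = gguf_init_empty();
    for (int il = 1; il < n_layer; il++) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        ggml_set_name(t, ("direction." + std::to_string(il)).c_str());
        // ... fill (float *) t->data with the layer's direction here ...
        gguf_add_tensor(gguf, t);
    }
    gguf_write_to_file(gguf, "control_vector.gguf", /* only_meta = */ false);
    gguf_free(gguf);
    ggml_free(ctx);]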
diff --git a/common/common.h b/common/common.h
index f8d82b8713c87..2ea867553f8f9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -102,6 +102,9 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
+    std::vector<std::tuple<std::string, float>> control_vectors; // control vector with user defined scale
+    std::tuple<int32_t, int32_t> control_vector_layer_range;     // layer range for control vector
+
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                              //                                       (which is more convenient to use for plotting)
@@ -267,3 +270,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 
 void llama_embd_normalize(const float * inp, float * out, int n);
 
+//
+// Control vector utils
+//
+
+// Load control vectors from a tuple of {path, strength}, scale each by strength, and add them together.
+// Returns a tuple of {concatenated vector data (n_embd x n_layer), n_embd}
+// On error, returns a tuple of {empty, -1}
+std::tuple<std::vector<float>, int> llama_control_vector_load(
+    const std::vector<std::tuple<std::string, float>> & vectors);
diff --git a/llama.cpp b/llama.cpp
index ad7b7b7d4bcf2..91e5245184508 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1885,6 +1885,31 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    int32_t layer_start = 0;
+    int32_t layer_end   = 0;
+
+    ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    ~llama_control_vector() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -2093,6 +2118,9 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [kv_size]
     struct ggml_tensor * inp_s_seq;  // I32 [kv_size, n_batch]
 
+    // control vectors
+    struct llama_control_vector cvec;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -5772,6 +5800,12 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -13188,6 +13222,93 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
+
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (auto & it : buft_layer_count) {
+        int n_layers = it.second;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return 1;
+        }
+        ctx_map[it.first] = ctx;
+    }
+
+    // make tensors
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.ctxs.push_back(ctx);
+        cvec.bufs.push_back(buf);
+    }
+
+    return true;
+}
+
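[Annotation on the host-side layout that llama_control_vector_apply (next) consumes: the buffer is dense, layer 0 is simply omitted, and each layer contributes n_embd floats. With an assumed embedding size, the indexing works out as:

    // data layout: [n_embd x (n_layer - 1)] floats, starting at layer 1
    // component j of layer il lives at:  data[n_embd * (il - 1) + j]
    // e.g. n_embd = 4096 (assumed): layer 1 starts at 0, layer 2 at 4096, layer 5 at 16384]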
+int32_t llama_control_vector_apply(struct llama_context * lctx, float * data, size_t len, int n_embd, int32_t il_start, int32_t il_end) {
+    const llama_model & model = lctx->model;
+    llama_control_vector & cvec = lctx->cvec;
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        if (il >= cvec.tensors.size() || cvec.tensors[il] == nullptr) {
+            continue;
+        }
+        size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il],
+                                    data + off,
+                                    0,
+                                    n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
diff --git a/llama.h b/llama.h
index 446899da6e38e..cb946b7523884 100644
--- a/llama.h
+++ b/llama.h
@@ -437,6 +437,20 @@ extern "C" {
             const char * path_base_model,
                int32_t   n_threads);
 
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     float * data,
+                    size_t   len,
+                       int   n_embd,
+                   int32_t   il_start,
+                   int32_t   il_end);
+
     //
     // KV cache
     //

From 0a9bc301acace74fe36944017c647a338c04affa Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 14 Mar 2024 16:43:37 +0200
Subject: [PATCH 2/3] control-vectors : minor code style updates

---
 common/common.cpp | 145 +++++++++++++++++++++++-----------------------
 common/common.h   |  32 +++++++---
 llama.cpp         |  22 +++----
 llama.h           |  13 +++--
 4 files changed, 113 insertions(+), 99 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index f044a374afbf3..4912237e0d0f1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -573,30 +573,29 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.control_vectors.push_back(std::make_tuple(argv[i], 1.0f));
+            params.control_vectors.push_back({ 1.0f, argv[i], });
         } else if (arg == "--control-vector-scaled") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            const char * control_vector = argv[i];
+            const char * fname = argv[i];
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.control_vectors.push_back(std::make_tuple(control_vector, std::stof(argv[i])));
+            params.control_vectors.push_back({ std::stof(argv[i]), fname, });
         } else if (arg == "--control-vector-layer-range") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            int32_t start = std::stoi(argv[i]);
+            params.control_vector_layer_start = std::stoi(argv[i]);
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            int32_t end = std::stoi(argv[i]);
-            params.control_vector_layer_range = std::make_tuple(start, end);
+            params.control_vector_layer_end = std::stoi(argv[i]);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
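[Annotation: with the new llama_control_vector_load_info struct (strength first, then fname), call sites can use aggregate initialization instead of std::make_tuple, as above. A small sketch of building the list by hand — the file names and strengths are placeholders:

    std::vector<llama_control_vector_load_info> infos = {
        {  1.0f, "happy.gguf" }, // as if --control-vector happy.gguf
        { -0.6f, "fear.gguf"  }, // as if --control-vector-scaled fear.gguf -0.6
    };
    const auto cvec = llama_control_vector_load(infos);
    if (cvec.n_embd == -1) {
        // at least one file failed to load, or the dimensions disagreed
    }]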
@@ -1396,27 +1395,22 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (!params.control_vectors.empty()) {
-        int32_t layer_start, layer_end;
-        std::tie(layer_start, layer_end) = params.control_vector_layer_range;
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
 
-        if (layer_start == 0) layer_start = 1;
-        if (layer_end == 0)   layer_end = 31;
-
-        std::vector<float> control_vector;
-        int n_embd;
-        std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors);
-        if (n_embd == -1) {
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
 
         int err = llama_control_vector_apply(lctx,
-                                             control_vector.data(),
-                                             control_vector.size(),
-                                             n_embd,
-                                             layer_start,
-                                             layer_end);
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
@@ -1959,11 +1953,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const std::string & path, float strength) {
-    int n_tensors;
+static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    int32_t n_tensors;
+
     size_t n_bytes = 0;
+
     uint32_t max_direction_layer = 0;
-    int n_embd = -1;
+
+    llama_control_vector_data result = { -1, {} };
 
     // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
     {
@@ -1977,11 +1974,11 @@ static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const s
             /* .no_alloc = */ true,
             /* .ctx      = */ &meta_ctx,
         };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params);
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
         if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
             ggml_free(meta_ctx);
-            return std::make_tuple(std::vector<float>(), -1);
+            return result;
         }
 
         n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
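[Annotation: for a sense of scale of the two ggml contexts involved, the metadata pass above reserves room for at most 128 tensor headers (ggml_tensor_overhead() * 128 + ggml_graph_overhead()) with no_alloc = true, so no tensor data is allocated, while n_bytes accumulates the real payload for the second pass. A worked example with assumed sizes:

    // assumed: n_embd = 4096, directions for layers 1..31, all F32
    // n_tensors = 31, n_bytes = 31 * 4096 * sizeof(float) = 507904 bytes (496 KiB)
    // second-pass context size: ggml_tensor_overhead() * n_tensors + n_bytes,
    // i.e. the tensor headers add only a few KiB on top of the payload]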
@@ -1994,36 +1991,36 @@ static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const s
                 try {
                     uint32_t layer = std::stoi(name.substr(dotpos + 1));
                     if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
                         ggml_free(meta_ctx);
                         gguf_free(meta_ctx_gguf);
-                        return std::make_tuple(std::vector<float>(), -1);
+                        return result;
                     }
                     if (layer > max_direction_layer) {
                         max_direction_layer = layer;
                     }
                 } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
                     ggml_free(meta_ctx);
                     gguf_free(meta_ctx_gguf);
-                    return std::make_tuple(std::vector<float>(), -1);
+                    return result;
                 }
             }
 
             struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
             if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
                 ggml_free(meta_ctx);
                 gguf_free(meta_ctx_gguf);
-                return std::make_tuple(std::vector<float>(), -1);
+                return result;
             }
-            if (n_embd == -1) {
-                n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str());
+            if (result.n_embd == -1) {
+                result.n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
                 ggml_free(meta_ctx);
                 gguf_free(meta_ctx_gguf);
-                return std::make_tuple(std::vector<float>(), -1);
+                return result;
             }
             n_bytes += ggml_nbytes(tensor_meta);
         }
@@ -2032,8 +2029,8 @@ static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const s
     }
 
     if (n_tensors == 0) {
-        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str());
-        return std::make_tuple(std::vector<float>(), -1);
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
     // load and scale tensors into final control vector context
@@ -2048,63 +2045,63 @@ static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const s
         /*.no_alloc = */ false,
         /*.ctx      = */ &ctx,
     };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params);
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
     if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
         ggml_free(ctx);
-        return std::make_tuple(std::vector<float>(), -1);
+        return result;
     }
 
-    std::vector<float> vector;
-    for (uint32_t i = 1; i < max_direction_layer; i++) {
-        std::string name = "direction." + std::to_string(i);
-        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+    // do not store data for layer 0 (it's not used)
+    result.data.resize(result.n_embd * max_direction_layer);
+
+    for (uint32_t il = 1; il <= max_direction_layer; il++) {
+        const std::string name = "direction." + std::to_string(il);
+        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+
+        float * dst = result.data.data() + result.n_embd * (il - 1);
+
         if (tensor) {
-            const float * data = (const float *) tensor->data;
-            for (int i = 0; i < n_embd; i++) {
-                vector.push_back(data[i] * strength);
+            const float * src = (const float *) tensor->data;
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = src[j] * load_info.strength;
             }
         } else {
-            vector.insert(vector.end(), n_embd, 0.); // as a filler
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = 0.0f;
+            }
         }
     }
 
-    return std::make_tuple(vector, n_embd);
+    return result;
 }
 
-std::tuple<std::vector<float>, int> llama_control_vector_load(const std::vector<std::tuple<std::string, float>> & vectors) {
-    std::vector<float> vector;
-    int n_embd = -1;
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+    llama_control_vector_data result = { -1, {} };
 
-    for (const auto& pair : vectors) {
-        std::string path;
-        float strength;
-        std::tie(path, strength) = pair;
+    for (const auto & info : load_infos) {
+        auto cur = llama_control_vector_load_one(info);
 
-        std::vector<float> v;
-        int v_n_embd;
-        std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength);
-
-        if (v_n_embd == -1) {
-            return std::make_tuple(std::vector<float>(), -1);
+        if (cur.n_embd == -1) {
+            return result;
         }
-        if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str());
-            return std::make_tuple(std::vector<float>(), -1);
+        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+            return result;
        }
 
-        if (n_embd == -1) {
-            vector = std::move(v);
-            n_embd = v_n_embd;
+        if (result.n_embd == -1) {
+            result = std::move(cur);
         } else {
-            for (size_t i = 0; i < vector.size(); i++) {
-                vector[i] += v[i];
+            for (size_t i = 0; i < cur.data.size(); i++) {
+                result.data[i] += cur.data[i];
             }
         }
     }
 
-    if (n_embd == -1) {
+    if (result.n_embd == -1) {
         fprintf(stderr, "%s: no vectors passed\n", __func__);
     }
-    return std::make_tuple(vector, n_embd);
+
+    return result;
 }
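[Annotation: note the combining rule implemented above — each file is scaled by its strength during load, and multiple files are then summed elementwise, so order does not matter and repeated loads accumulate. A hedged sketch of the invariant, using a hypothetical v.gguf:

    // result.data[i] == sum over files k of strength_k * direction_k[i]
    const auto a = llama_control_vector_load({ { 0.5f, "v.gguf" }, { 0.5f, "v.gguf" } });
    const auto b = llama_control_vector_load({ { 1.0f, "v.gguf" } });
    // a.data and b.data should agree up to float rounding]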
diff --git a/common/common.h b/common/common.h
index b1f2ea388ab14..687f3425e8544 100644
--- a/common/common.h
+++ b/common/common.h
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;
 
+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -103,8 +106,10 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
-    std::vector<std::tuple<std::string, float>> control_vectors; // control vector with user defined scale
-    std::tuple<int32_t, int32_t> control_vector_layer_range;     // layer range for control vector
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
 
     int ppl_stride = 0;      // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -277,8 +282,19 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-// Load control vectors from a tuple of {path, strength}, scale each by strength, and add them together.
-// Returns a tuple of {concatenated vector data (n_embd x n_layer), n_embd}
-// On error, returns a tuple of {empty, -1}
-std::tuple<std::vector<float>, int> llama_control_vector_load(
-    const std::vector<std::tuple<std::string, float>> & vectors);
+struct llama_control_vector_data {
+    int n_embd;
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info {
+    float strength;
+
+    std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
diff --git a/llama.cpp b/llama.cpp
index 3297785b36d09..09ff01b0bb2fe 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1877,7 +1877,7 @@ struct llama_control_vector {
     std::vector<ggml_backend_buffer_t> bufs;
 
     int32_t layer_start = 0;
-    int32_t layer_end = 0;
+    int32_t layer_end   = 0;
 
     ggml_tensor * tensor_for(int il) const {
         if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
@@ -13183,6 +13183,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
@@ -13335,7 +13339,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     return true;
 }
 
-int32_t llama_control_vector_apply(struct llama_context * lctx, float * data, size_t len, int n_embd, int32_t il_start, int32_t il_end) {
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
     const llama_model & model = lctx->model;
     llama_control_vector & cvec = lctx->cvec;
 
@@ -13351,18 +13355,14 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, float * data, si
     }
 
     cvec.layer_start = il_start;
-    cvec.layer_end = il_end;
+    cvec.layer_end   = il_end;
 
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        if (il >= cvec.tensors.size() || cvec.tensors[il] == nullptr) {
-            continue;
-        }
-        size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il],
-                                    data + off,
-                                    0,
-                                    n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
         }
     }
 
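[Annotation: the new llama_n_layer() getter is what lets common default the layer range to the whole model instead of patch 1's hardcoded 31. A usage sketch of the const-corrected entry point, assuming model, lctx, and a cvec from llama_control_vector_load are already in scope:

    const int32_t err = llama_control_vector_apply(
            lctx,
            cvec.data.data(),
            cvec.data.size(),
            cvec.n_embd,
            /* il_start = */ 1,                    // layer 0 never has a direction
            /* il_end   = */ llama_n_layer(model)); // inclusive upper bound
    if (err) {
        // dimension mismatch or allocation failure
    }]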
diff --git a/llama.h b/llama.h
index 706a6be09da84..6ee513aa80ae3 100644
--- a/llama.h
+++ b/llama.h
@@ -387,6 +387,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -434,10 +435,10 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                         const char * path_lora,
-                              float   scale,
-                         const char * path_base_model,
-                            int32_t   n_threads);
+                      const char * path_lora,
+                           float   scale,
+                      const char * path_base_model,
+                         int32_t   n_threads);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
@@ -447,9 +448,9 @@ extern "C" {
     // See llama_control_vector_load in common to load a control vector.
     LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
-                     float * data,
+               const float * data,
                     size_t   len,
-                       int   n_embd,
+                   int32_t   n_embd,
                    int32_t   il_start,
                    int32_t   il_end);
 

From 838c99c7d570379299b6205650630ad71bb96663 Mon Sep 17 00:00:00 2001
From: Theia Vogel
Date: Fri, 15 Mar 2024 12:59:08 -0700
Subject: [PATCH 3/3] disable control vector when data == nullptr

use -1 for disabled range (also on init) in case we ever support
controlling layer 0 (embeddings)
---
 llama.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 09ff01b0bb2fe..ffac0e4ca35c5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1876,8 +1876,8 @@ struct llama_control_vector {
     std::vector<struct ggml_context *> ctxs;
     std::vector<ggml_backend_buffer_t> bufs;
 
-    int32_t layer_start = 0;
-    int32_t layer_end   = 0;
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
 
     ggml_tensor * tensor_for(int il) const {
         if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
@@ -13343,6 +13343,13 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
     const llama_model & model = lctx->model;
     llama_control_vector & cvec = lctx->cvec;
 
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
     if (n_embd != (int) model.hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
         return 1;
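[Annotation: taken together, the three patches let steering be toggled per request without reallocating — apply once with real data, then pass data == nullptr to disable while the per-layer tensors stay allocated. A closing sketch under the same assumptions as the earlier examples:

    // enable steering over layers [1, n_layer]
    llama_control_vector_apply(lctx, cvec.data.data(), cvec.data.size(),
                               cvec.n_embd, 1, llama_n_layer(model));
    // ... run steered generation ...

    // disable (patch 3 behavior): buffers remain allocated for the next apply
    llama_control_vector_apply(lctx, nullptr, 0, 0, -1, -1);]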