Skip to content

Commit 2d69bf8

Browse files
committed
New simplified llama.h API, and GPU offloading for control vectors
1 parent c82301c commit 2d69bf8

File tree

4 files changed

+282
-250
lines changed

4 files changed

+282
-250
lines changed

common/common.cpp

+171-24
Original file line numberDiff line numberDiff line change
@@ -1393,32 +1393,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
13931393
if (layer_start == 0) layer_start = 1;
13941394
if (layer_end == 0) layer_end = 31;
13951395

1396-
struct llama_control_vector * vector = nullptr;
1397-
1398-
for (const auto& t : params.control_vectors) {
1399-
std::string path;
1400-
float strength;
1401-
std::tie(path, strength) = t;
1402-
1403-
fprintf(stderr, "%s: loading control vector from %s\n", __func__, path.c_str());
1404-
struct llama_control_vector * temp = llama_control_vector_load(path.c_str());
1405-
if (temp == nullptr) {
1406-
fprintf(stderr, "%s: error: failed to load control vector from %s\n", __func__, path.c_str());
1407-
llama_free(lctx);
1408-
llama_free_model(model);
1409-
return std::make_tuple(nullptr, nullptr);
1410-
}
1411-
llama_control_vector_scale(temp, strength);
1412-
1413-
if (vector == nullptr) {
1414-
vector = temp;
1415-
} else {
1416-
llama_control_vector_add(vector, temp);
1417-
llama_control_vector_free(temp);
1418-
}
1396+
std::vector<float> control_vector;
1397+
int n_embd;
1398+
std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors);
1399+
if (n_embd == -1) {
1400+
llama_free(lctx);
1401+
llama_free_model(model);
1402+
return std::make_tuple(nullptr, nullptr);
14191403
}
14201404

1421-
llama_apply_control_vector(lctx, vector, layer_start, layer_end);
1405+
int err = llama_control_vector_apply(lctx,
1406+
control_vector.data(),
1407+
control_vector.size(),
1408+
n_embd,
1409+
layer_start,
1410+
layer_end);
1411+
if (err) {
1412+
llama_free(lctx);
1413+
llama_free_model(model);
1414+
return std::make_tuple(nullptr, nullptr);
1415+
}
14221416
}
14231417

14241418
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
@@ -1937,3 +1931,156 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
19371931
}
19381932
}
19391933

1934+
//
1935+
// Control vector utils
1936+
//
1937+
1938+
static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const std::string & path, float strength) {
1939+
int n_tensors;
1940+
size_t n_bytes = 0;
1941+
uint32_t max_direction_layer = 0;
1942+
int n_embd = -1;
1943+
1944+
// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
1945+
{
1946+
struct ggml_init_params meta_params = {
1947+
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
1948+
/* .mem_buffer = */ nullptr,
1949+
/* .no_alloc = */ true,
1950+
};
1951+
ggml_context * meta_ctx = ggml_init(meta_params);
1952+
struct gguf_init_params meta_gguf_params = {
1953+
/* .no_alloc = */ true,
1954+
/* .ctx = */ &meta_ctx,
1955+
};
1956+
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params);
1957+
if (!meta_ctx_gguf) {
1958+
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
1959+
ggml_free(meta_ctx);
1960+
return std::make_tuple(std::vector<float>(), -1);
1961+
}
1962+
1963+
n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
1964+
for (int i = 0; i < n_tensors; i++) {
1965+
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
1966+
1967+
// split on '.'
1968+
size_t dotpos = name.find('.');
1969+
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
1970+
try {
1971+
uint32_t layer = std::stoi(name.substr(dotpos + 1));
1972+
if (layer == 0) {
1973+
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
1974+
ggml_free(meta_ctx);
1975+
gguf_free(meta_ctx_gguf);
1976+
return std::make_tuple(std::vector<float>(), -1);
1977+
}
1978+
if (layer > max_direction_layer) {
1979+
max_direction_layer = layer;
1980+
}
1981+
} catch (...) {
1982+
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
1983+
ggml_free(meta_ctx);
1984+
gguf_free(meta_ctx_gguf);
1985+
return std::make_tuple(std::vector<float>(), -1);
1986+
}
1987+
}
1988+
1989+
struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
1990+
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
1991+
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
1992+
ggml_free(meta_ctx);
1993+
gguf_free(meta_ctx_gguf);
1994+
return std::make_tuple(std::vector<float>(), -1);
1995+
}
1996+
if (n_embd == -1) {
1997+
n_embd = ggml_nelements(tensor_meta);
1998+
} else if (ggml_nelements(tensor_meta) != n_embd) {
1999+
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str());
2000+
ggml_free(meta_ctx);
2001+
gguf_free(meta_ctx_gguf);
2002+
return std::make_tuple(std::vector<float>(), -1);
2003+
}
2004+
n_bytes += ggml_nbytes(tensor_meta);
2005+
}
2006+
ggml_free(meta_ctx);
2007+
gguf_free(meta_ctx_gguf);
2008+
}
2009+
2010+
if (n_tensors == 0) {
2011+
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str());
2012+
return std::make_tuple(std::vector<float>(), -1);
2013+
}
2014+
2015+
// load and scale tensors into final control vector context
2016+
struct ggml_init_params ggml_params = {
2017+
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
2018+
/* .mem_buffer = */ nullptr,
2019+
/* .no_alloc = */ false,
2020+
};
2021+
struct ggml_context * ctx = ggml_init(ggml_params);
2022+
2023+
struct gguf_init_params params = {
2024+
/*.no_alloc = */ false,
2025+
/*.ctx = */ &ctx,
2026+
};
2027+
struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params);
2028+
if (!ctx_gguf) {
2029+
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
2030+
ggml_free(ctx);
2031+
return std::make_tuple(std::vector<float>(), -1);
2032+
}
2033+
2034+
std::vector<float> vector;
2035+
for (uint32_t i = 1; i < max_direction_layer; i++) {
2036+
std::string name = "direction." + std::to_string(i);
2037+
ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
2038+
if (tensor) {
2039+
const float * data = (const float *) tensor->data;
2040+
for (int i = 0; i < n_embd; i++) {
2041+
vector.push_back(data[i] * strength);
2042+
}
2043+
} else {
2044+
vector.insert(vector.end(), n_embd, 0.); // as a filler
2045+
}
2046+
}
2047+
2048+
return std::make_tuple(vector, n_embd);
2049+
}
2050+
2051+
std::tuple<std::vector<float>, int> llama_control_vector_load(const std::vector<std::tuple<std::string, float>> & vectors) {
2052+
std::vector<float> vector;
2053+
int n_embd = -1;
2054+
2055+
for (const auto& pair : vectors) {
2056+
std::string path;
2057+
float strength;
2058+
std::tie(path, strength) = pair;
2059+
2060+
std::vector<float> v;
2061+
int v_n_embd;
2062+
std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength);
2063+
2064+
if (v_n_embd == -1) {
2065+
return std::make_tuple(std::vector<float>(), -1);
2066+
}
2067+
if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) {
2068+
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str());
2069+
return std::make_tuple(std::vector<float>(), -1);
2070+
}
2071+
2072+
if (n_embd == -1) {
2073+
vector = std::move(v);
2074+
n_embd = v_n_embd;
2075+
} else {
2076+
for (size_t i = 0; i < vector.size(); i++) {
2077+
vector[i] += v[i];
2078+
}
2079+
}
2080+
}
2081+
2082+
if (n_embd == -1) {
2083+
fprintf(stderr, "%s: no vectors passed\n", __func__);
2084+
}
2085+
return std::make_tuple(vector, n_embd);
2086+
}

common/common.h

+9
Original file line numberDiff line numberDiff line change
@@ -270,3 +270,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
270270

271271
void llama_embd_normalize(const float * inp, float * out, int n);
272272

273+
//
274+
// Control vector utils
275+
//
276+
277+
// Load control vectors from a tuple of {path, strength}, scale each by strength, and add them together.
278+
// Returns a tuple of {concatenated vector data (n_embd x n_layer), n_embd}
279+
// On error, returns a tuple of {empty, -1}
280+
std::tuple<std::vector<float>, int> llama_control_vector_load(
281+
const std::vector<std::tuple<std::string, float>> & vectors);

0 commit comments

Comments
 (0)