llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

This commit is contained in:
Jeffrey Morgan
2025-02-26 20:34:44 -08:00
committed by GitHub
parent 2db96c18e7
commit d7d7e99662
149 changed files with 18215 additions and 11009 deletions

2
llama/build-info.cpp generated vendored
View File

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
char const *LLAMA_COMMIT = "d7cfe1ffe0f435d0048a6058d529daf76e072d9c";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

View File

@@ -2,6 +2,9 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml.h"
#include "gguf.h"
#include "common.h"
#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -70,6 +73,22 @@
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
@@ -464,6 +483,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
if (i > 0) {
result << separator;
}
result << values[i];
}
return result.str();
}
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
parts.push_back(str.substr(start));
return parts;
}
std::string string_repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
std::string string_from(bool value) {
return value ? "true" : "false";
}
@@ -846,7 +907,7 @@ struct common_init_result common_init_from_params(common_params & params) {
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
model = llama_model_load_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
@@ -854,26 +915,28 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.reranking) {
bool ok = true;
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
}
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
llama_free_model(model);
llama_model_free(model);
return iparams;
}
@@ -881,10 +944,10 @@ struct common_init_result common_init_from_params(common_params & params) {
auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
llama_model_free(model);
return iparams;
}
@@ -895,25 +958,26 @@ struct common_init_result common_init_from_params(common_params & params) {
if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);
return iparams;
}
int err = llama_control_vector_apply(lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
int err = llama_apply_adapter_cvec(
lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);
return iparams;
}
@@ -921,12 +985,12 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_lora_adapter_ptr lora;
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);
return iparams;
}
@@ -935,17 +999,17 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (!params.lora_init_without_apply) {
common_lora_adapters_apply(lctx, params.lora_adapters);
common_set_adapter_lora(lctx, params.lora_adapters);
}
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
if (llama_token_is_eog(model, i)) {
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
@@ -966,8 +1030,9 @@ struct common_init_result common_init_from_params(common_params & params) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
@@ -982,7 +1047,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = bos;
}
tmp.clear();
@@ -1002,11 +1067,11 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
llama_lora_adapter_clear(ctx);
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.ptr, la.scale);
llama_set_adapter_lora(ctx, la.ptr, la.scale);
}
}
}
@@ -1020,7 +1085,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
@@ -1123,7 +1187,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
@@ -1137,11 +1202,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer ";
auth_header += hf_token.c_str();
struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
#if defined(_WIN32)
@@ -1411,7 +1474,7 @@ struct llama_model * common_load_model_from_url(
}
}
return llama_load_model_from_file(local_path.c_str(), params);
return llama_model_load_from_file(local_path.c_str(), params);
}
struct llama_model * common_load_model_from_hf(
@@ -1437,6 +1500,80 @@ struct llama_model * common_load_model_from_hf(
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
#else
struct llama_model * common_load_model_from_url(
@@ -1458,6 +1595,11 @@ struct llama_model * common_load_model_from_hf(
return nullptr;
}
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL
//
@@ -1556,21 +1698,23 @@ std::vector<llama_token> common_tokenize(
const std::string & text,
bool add_special,
bool parse_special) {
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_tokenize(vocab, text, add_special, parse_special);
}
std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -1579,12 +1723,18 @@ std::vector<llama_token> common_tokenize(
}
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_token_to_piece(vocab, token, special);
}
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
@@ -1594,13 +1744,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
return piece;
}
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_detokenize(vocab, tokens, special);
}
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
@@ -1610,103 +1766,6 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
return text;
}
//
// Chat template utils
//
std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
}
bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported
if (res < 0) {
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
return formatted_chat;
}
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return common_chat_apply_template(model, tmpl, msgs, true);
}
//
// KV cache utils
//

View File

@@ -4,6 +4,7 @@
#include "llama-cpp.h"
#include <set>
#include <string>
#include <vector>
#include <sstream>
@@ -24,11 +25,11 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_lora_adapter_info {
struct common_adapter_lora_info {
std::string path;
float scale;
struct llama_lora_adapter * ptr;
struct llama_adapter_lora * ptr;
};
using llama_tokens = std::vector<llama_token>;
@@ -103,6 +104,17 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
enum common_conversation_mode {
COMMON_CONVERSATION_MODE_DISABLED = 0,
COMMON_CONVERSATION_MODE_ENABLED = 1,
COMMON_CONVERSATION_MODE_AUTO = 2,
};
struct common_grammar_trigger {
std::string word;
bool at_start;
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -128,6 +140,7 @@ struct common_params_sampling {
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float top_n_sigma = -1.00f;// -1.0 = disabled
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool ignore_eos = false;
@@ -148,7 +161,11 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::string grammar; // optional BNF-like grammar to constrain sampling
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@@ -161,15 +178,19 @@ struct common_params_speculative {
int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
};
struct common_params_vocoder {
@@ -178,6 +199,13 @@ struct common_params_vocoder {
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};
struct common_params {
@@ -240,14 +268,13 @@ struct common_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -271,11 +298,11 @@ struct common_params {
bool kl_divergence = false; // compute KL divergence
bool usage = false; // print usage
bool completion = false; // print source-able completion script
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -301,6 +328,8 @@ struct common_params {
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
@@ -322,7 +351,9 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
std::vector<std::string> api_keys;
@@ -401,13 +432,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
//
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# if defined(__MINGW32__) && !defined(__clang__)
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -416,6 +447,10 @@ std::string string_format(const char * fmt, ...);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T>
@@ -454,6 +489,11 @@ static bool string_starts_with(const std::string & str,
return str.rfind(prefix, 0) == 0;
}
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -481,7 +521,7 @@ struct common_init_result {
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_lora_adapter_ptr> lora;
std::vector<llama_adapter_lora_ptr> lora;
};
struct common_init_result common_init_from_params(common_params & params);
@@ -495,6 +535,7 @@ struct llama_model * common_load_model_from_url(
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
@@ -502,8 +543,12 @@ struct llama_model * common_load_model_from_hf(
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
//
// Batch utils
@@ -541,7 +586,7 @@ std::vector<llama_token> common_tokenize(
bool parse_special = false);
std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special = false);
@@ -553,48 +598,23 @@ std::string common_token_to_piece(
llama_token token,
bool special = true);
std::string common_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
bool special = true);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
llama_context * ctx,
const struct llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
//
// Chat template utils
//
// same with llama_chat_message, but uses std::string
struct common_chat_msg {
std::string role;
std::string content;
};
// Get the built-in chat template for the model. Return empty string if not present.
std::string common_get_builtin_chat_template(const struct llama_model * model);
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
std::string common_detokenize(
const struct llama_vocab * vocab,
const std::vector<llama_token> & tokens,
bool special = true);
//
// KV cache utils

View File

@@ -1,4 +1,6 @@
#include "json-schema-to-grammar.h"
#include "common.h"
#include <algorithm>
#include <fstream>
#include <map>
@@ -11,11 +13,6 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
auto sub_zeros = repeat("0", sub_len);
auto sub_nines = repeat("9", sub_len);
auto sub_zeros = string_repeat("0", sub_len);
auto sub_nines = string_repeat("9", sub_len);
auto to_reached = false;
out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, repeat("9", digits));
min_s = "1" + repeat("0", digits);
uniform_range(min_s, string_repeat("9", digits));
min_s = "1" + string_repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
std::ostringstream result;
if (begin != end) {
result << *begin;
for (Iterator it = begin + 1; it != end; ++it) {
result << separator << *it;
}
}
return result.str();
}
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
tokens.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
tokens.push_back(str.substr(start));
return tokens;
}
static std::string repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match;
std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {
class SchemaConverter {
private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::unordered_map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
}
return join(rules.begin(), rules.end(), " | ");
return string_join(rules, " | ");
}
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
for (const auto & item : ret) {
results.push_back(to_rule(item));
}
return std::make_pair(join(results.begin(), results.end(), " "), false);
return std::make_pair(string_join(results, " "), false);
};
while (i < length) {
@@ -539,7 +494,7 @@ private:
}
curly_brackets += '}';
i++;
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = std::numeric_limits<int>::max();
try {
@@ -809,10 +764,11 @@ private:
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall)
bool dotall,
bool compact_spaces)
: _fetch_json(fetch_json), _dotall(dotall)
{
_rules["space"] = SPACE_RULE;
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
}
void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@ public:
return;
}
std::string pointer = ref.substr(ref.find('#') + 1);
std::vector<std::string> tokens = split(pointer, "/");
std::vector<std::string> tokens = string_split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@ public:
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
}
}
@@ -1035,11 +991,35 @@ public:
}
};
std::string json_schema_to_grammar(const json & schema) {
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
auto copy = schema;
converter.resolve_refs(copy, "input");
converter.visit(copy, "");
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
if (!force_gbnf) {
return "%llguidance {}\nstart: %json " + schema.dump();
}
#else
(void)force_gbnf;
#endif // LLAMA_USE_LLGUIDANCE
return build_grammar([&](const common_grammar_builder & callbacks) {
auto copy = schema;
callbacks.resolve_refs(copy);
callbacks.add_schema("", copy);
});
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);
},
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
return converter.visit(schema, name == "root" ? "" : name);
},
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
converter.resolve_refs(schema, "");
}
};
cb(builder);
converter.check_errors();
return converter.format_grammar();
}

View File

@@ -5,4 +5,18 @@
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
std::function<void(nlohmann::ordered_json &)> resolve_refs;
};
struct common_grammar_options {
bool dotall = false;
bool compact_spaces = false;
};
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});

View File

@@ -1,5 +1,6 @@
#include "log.h"
#include <chrono>
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
@@ -14,16 +15,6 @@ void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
@@ -206,6 +197,7 @@ public:
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
va_end(args_copy);
}
entry.level = level;

View File

@@ -2,9 +2,20 @@
#include "ggml.h" // for ggml_log_level
#define LOG_CLR_TO_EOL "\033[K\r"
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
#ifndef __GNUC__
# define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#elif defined(__MINGW32__) && !defined(__clang__)
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))

View File

@@ -113,7 +113,10 @@ struct common_sampler {
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab);
@@ -131,24 +134,47 @@ std::string common_params_sampling::print() const {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau);
return std::string(result);
}
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
struct llama_sampler * grmr;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
}
grmr = params.grammar_lazy
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
trigger_words.data(), trigger_words.size(),
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .grmr = */ grmr,
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
@@ -157,56 +183,62 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_n_vocab(model),
llama_vocab_n_tokens(vocab),
params.logit_bias.size(),
params.logit_bias.data()));
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
if (params.top_n_sigma >= 0) {
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
} else {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));

View File

@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
const char * grammar_kind, const char * grammar_data);

View File

@@ -7,6 +7,7 @@
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@@ -39,6 +40,7 @@
#include <map>
#include <regex>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <sstream>
#include <cinttypes>
@@ -114,6 +116,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
@@ -131,6 +134,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -172,6 +176,15 @@ static std::string format(const char * fmt, ...) {
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
@@ -179,6 +192,7 @@ enum projector_type {
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -188,6 +202,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
};
@@ -275,7 +290,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
@@ -444,8 +459,9 @@ struct clip_hparams {
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
int32_t image_grid_pinpoints[32];
std::vector<int32_t> image_grid_pinpoints;
int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer;
};
struct clip_layer {
@@ -512,6 +528,12 @@ struct clip_vision_model {
struct ggml_tensor * mm_4_w = NULL;
struct ggml_tensor * mm_4_b = NULL;
//GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w;
struct ggml_tensor * mm_model_adapter_conv_b;
struct ggml_tensor * boi_w;
struct ggml_tensor * eoi_w;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
struct ggml_tensor * mm_model_mlp_1_b;
@@ -572,12 +594,14 @@ struct clip_ctx {
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
bool has_glm_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
int32_t max_feature_layer;
float image_mean[3];
float image_std[3];
bool use_gelu = false;
@@ -644,13 +668,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
int n_layer = hparams.n_layer;
const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs->size;
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1);
}
@@ -730,6 +753,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
@@ -742,14 +768,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
std::vector<struct ggml_tensor *> embedding_stack;
const auto & vision_feature_layer = hparams.vision_feature_layer;
// loop over layers
if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
// TODO: figure out why we doing thing in this way ???
n_layer += 1;
}
for (int il = 0; il < n_layer - 1; il++) {
for (int il = 0; il < ctx->max_feature_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// If this is an embedding feature layer, save the output.
// NOTE: 0 index here refers to the input to the encoder.
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
embedding_stack.push_back(embeddings);
}
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
// layernorm1
@@ -837,7 +868,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
@@ -848,6 +878,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// final layer is a vision feature layer
if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
embedding_stack.push_back(embeddings);
}
// If feature layers are explicitly set, stack them (if we have multiple)
if (!embedding_stack.empty()) {
embeddings = embedding_stack[0];
for (size_t i = 1; i < embedding_stack.size(); i++) {
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
}
}
// llava projector
if (ctx->has_llava_projector) {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1065,6 +1108,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
n_head = hidden_size/d_head;
num_query = 64;
}
else if (ctx->minicpmv_version == 4) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1099,7 +1147,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ASSERT(false);
}
}
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
// glm projector
else if (ctx->has_glm_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
//GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
struct ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
embeddings = ggml_silu_inplace(ctx0, embeddings);
embeddings = ggml_mul(ctx0, embeddings,x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
} else {
GGML_ABORT("fatel error");
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1268,6 +1342,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
if (idx != -1) {
new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
if (idx != -1) {
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
@@ -1292,6 +1371,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
@@ -1402,14 +1482,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
int n = gguf_get_arr_n(ctx, idx);
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
hparams.image_grid_pinpoints[i] = pinpoints[i];
for (int i = 0; i < n; ++i) {
hparams.image_grid_pinpoints.push_back(pinpoints[i]);
}
if (n < 32)
hparams.image_grid_pinpoints[n] = 0;
} catch (std::runtime_error & /*e*/) {
hparams.image_grid_pinpoints[0]=0;
}
} catch (std::runtime_error & /*e*/) { }
// Load the vision feature layer indices if they are explicitly provided;
// if multiple vision feature layers are present, the values will be concatenated
// to form the final visual features.
// NOTE: gguf conversions should standardize the values of the vision feature layer to
// be non-negative, since we use -1 to mark values as unset here.
try {
int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
int n = gguf_get_arr_n(ctx, idx);
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
for (int i = 0; i < n; ++i) {
hparams.vision_feature_layer.insert(vision_feature_layer[i]);
}
} catch (std::runtime_error & /*e*/) { }
try {
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
@@ -1435,6 +1527,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->image_std[i] = std_data[i];
}
// Calculate the deepest feature layer based on hparams and projector type
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
if (verbosity >= 2) {
LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_INF("image_size %d\n", hparams.image_size);
@@ -1448,8 +1543,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
for (const auto & pp : hparams.image_grid_pinpoints) {
LOG_INF("%d ", pp);
}
LOG_INF("\n");
LOG_INF("v_vision_feature_layer: ");
for (const auto & feature_layer: hparams.vision_feature_layer) {
LOG_INF("%d ", feature_layer);
}
LOG_INF("\n");
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -1584,6 +1684,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
}
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
@@ -1676,11 +1788,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
memcpy(img->buf.data(), data, img->buf.size());
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
@@ -1690,7 +1802,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}
@@ -1702,7 +1814,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}
@@ -2058,6 +2170,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
images[images.size()-1].push_back(patch);
}
}
clip_image_u8_free(refine_image);
}
return images;
}
@@ -2096,6 +2209,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
clip_image_f32_free(res);
}
}
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
if (imgs[i][j] != nullptr) {
clip_image_u8_free(imgs[i][j]);
}
}
}
return true;
}
else if (ctx->has_qwen2vl_merger) {
@@ -2116,6 +2236,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
return true;
}
if (ctx->has_glm_projector) {
res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size];
clip_image_u8 resized_image;
int32_t sz=ctx->vision_model.hparams.image_size;
bicubic_resize(*img, resized_image,sz,sz);
clip_image_f32 * res = clip_image_f32_init();
//clip_image_save_to_bmp(resized_image, "resized.bmp");
normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
res_imgs->data[0] = *res;
clip_image_f32_free(res);
return true;
}
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n");
@@ -2160,10 +2294,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
}
} else {
if (params.image_grid_pinpoints[0] != 0) {
if (!params.image_grid_pinpoints.empty()) {
// "spatial_unpad" with "anyres" processing for llava-1.6
std::vector<std::pair<int, int>> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2301,7 +2435,8 @@ void clip_free(clip_ctx * ctx) {
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
int extra_tokens = ctx->has_glm_projector ? 2 : 0;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2328,7 +2463,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
}
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints;
if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
return &ctx->vision_model.hparams.image_grid_pinpoints.front();
}
return nullptr;
}
size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints.size();
}
int clip_n_patches(const struct clip_ctx * ctx) {
@@ -2343,7 +2485,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
@@ -2352,6 +2494,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
else if (ctx->minicpmv_version == 4) {
n_patches = 64;
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2473,6 +2618,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (ctx->has_minicpmv_projector) {
GGML_ASSERT(batch_size == 1);
}
if (ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
}
// build the inference graph
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
@@ -2531,8 +2682,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[70];
int bucket_coords_w[70];
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
@@ -2560,6 +2711,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2622,11 +2776,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
{
if (!ctx->has_glm_projector) {
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
// The patches vector is used to get rows to index into the embeds with;
// we should skip dim 0 only if we have CLS to avoid going out of bounds
// when retrieving the rows.
int patch_offset = ctx->has_class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
patches_data[i] = i + patch_offset;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
@@ -2646,14 +2804,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true;
}
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
ggml_type type = GGML_TYPE_Q4_1;
assert(itype < GGML_TYPE_COUNT);
type = static_cast<ggml_type>(itype);
ggml_type type = static_cast<ggml_type>(itype);
auto * ctx_clip = clip_model_load(fname_inp, 2);
@@ -2706,8 +2869,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
}
// quantize only 2D tensors
quantize &= (ggml_n_dims(cur) == 2);
// quantize only 2D tensors and bigger than block size
quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
if (quantize) {
new_type = type;
@@ -2752,7 +2915,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
total_size_org += orig_size;
total_size_new += new_size;
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
fout.write((const char *)new_data, new_size);
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
for (size_t j = 0; j < pad; ++j) {
@@ -2802,6 +2966,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) {
return 3584;
}
else if (ctx->minicpmv_version == 4) {
return 3584;
}
}
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
return ctx->vision_model.mm_model_mlp_3_w->ne[1];
}
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
@@ -2818,10 +2988,35 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
return 0;
}
bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->has_glm_projector;
}
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger;
}
// Determine the number of encoder layers to iterate over
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
// Get the index of the second to last layer; this is the
// default for models that have a llava projector
const auto & hparams = ctx->vision_model.hparams;
int n_layer = hparams.n_layer - 1;
int deepest_feature_layer = -1;
// Handle other projectors; incrementing here indicates that we
// should use the last encoder layer for the vision features.
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
n_layer += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;

View File

@@ -55,6 +55,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -73,6 +74,9 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -89,10 +93,14 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
#ifdef __cplusplus
}
#endif

View File

@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,13 +277,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
if (!encoded) {
@@ -313,6 +307,23 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
}
else if (clip_is_glm(ctx_clip)){
struct clip_image_size * load_image_size = clip_image_size_init();
load_image_size->width = img_res_v.data[0].nx;
load_image_size->height = img_res_v.data[0].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
*n_img_pos = (pos * pos + 2);
if (!encoded){
LOG_ERR("Unable to encode image \n");
return false;
}
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
@@ -342,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
for (size_t i = 0; i < num_gridpoints; i += 2) {
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}
@@ -384,7 +396,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -394,10 +406,14 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
int num_max_patches = 6;
// Granite vision uses up to 10 patches + base patch
int num_max_patches = 11;
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
if (clip_is_glm(ctx_clip)) {
num_max_patches = 1;
}
float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
@@ -457,7 +473,7 @@ struct llava_embd_batch {
};
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;

View File

@@ -9,7 +9,7 @@
#include "llama.h"
struct llama_model_deleter {
void operator()(llama_model * model) { llama_free_model(model); }
void operator()(llama_model * model) { llama_model_free(model); }
};
struct llama_context_deleter {
@@ -20,11 +20,11 @@ struct llama_sampler_deleter {
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
};
struct llama_lora_adapter_deleter {
void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
struct llama_adapter_lora_deleter {
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;

View File

@@ -34,7 +34,6 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
// TODO: use everywhere in the implementation
#define LLAMA_TOKEN_NULL -1
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -57,7 +56,7 @@ extern "C" {
// TODO: show sample usage
//
// struct llama_vocab; // TODO: add in the future
struct llama_vocab;
struct llama_model;
struct llama_context;
struct llama_sampler;
@@ -214,7 +213,7 @@ extern "C" {
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
};
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
@@ -290,9 +289,6 @@ extern "C" {
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
// comma separated list of RPC servers to use for offloading
const char * rpc_servers;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
@@ -312,7 +308,7 @@ extern "C" {
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
// https://github.com/ggml-org/llama.cpp/pull/7544
struct llama_context_params {
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -325,7 +321,7 @@ extern "C" {
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -388,11 +384,10 @@ extern "C" {
} llama_chat_message;
// lora adapter
// TODO: rename to llama_adapter_lora
struct llama_lora_adapter;
struct llama_adapter_lora;
// Helpers for getting default parameters
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
// TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -403,31 +398,53 @@ extern "C" {
// Call once at the start of the program
LLAMA_API void llama_backend_init(void);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");
LLAMA_API struct llama_model * llama_load_model_from_file(
// Load the model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
// TODO: rename to llama_model_free
LLAMA_API void llama_free_model(struct llama_model * model);
// Load the model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,
size_t n_paths,
struct llama_model_params params);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
"use llama_model_free instead");
LLAMA_API void llama_model_free(struct llama_model * model);
LLAMA_API struct llama_context * llama_init_from_model(
struct llama_model * model,
struct llama_context_params params);
DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params),
"use llama_init_from_model instead");
// TODO (jmorganca): this should most likely be passed in as part of a batch
// and not set on the context for all batches.
LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
@@ -449,20 +466,31 @@ extern "C" {
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_n_head (const struct llama_model * model);
DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
@@ -488,6 +516,10 @@ extern "C" {
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Get the default chat template. Returns nullptr if not available
// If name is NULL, returns the default chat template
LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
@@ -515,34 +547,31 @@ extern "C" {
//
// Load a LoRA adapter from file
// TODO: rename to llama_adapter_lora_init
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// The following functions operate on a llama_context, hence the naming: llama_verb_...
// Add a loaded LoRA adapter to given context
// This will not modify model's weight
// TODO: rename to llama_set_adapter_lora
LLAMA_API int32_t llama_lora_adapter_set(
LLAMA_API int32_t llama_set_adapter_lora(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
struct llama_adapter_lora * adapter,
float scale);
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
// TODO: rename to llama_rm_adapter_lora
LLAMA_API int32_t llama_lora_adapter_remove(
LLAMA_API int32_t llama_rm_adapter_lora(
struct llama_context * ctx,
struct llama_lora_adapter * adapter);
struct llama_adapter_lora * adapter);
// Remove all LoRA adapters from given context
// TODO: rename to llama_clear_adapter_lora
LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
// TODO: rename to llama_adapter_lora_free
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
@@ -550,9 +579,8 @@ extern "C" {
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
// TODO: rename to llama_adapter_cvec_apply
LLAMA_API int32_t llama_control_vector_apply(
struct llama_context * lctx,
LLAMA_API int32_t llama_apply_adapter_cvec(
struct llama_context * ctx,
const float * data,
size_t len,
int32_t n_embd,
@@ -908,41 +936,60 @@ extern "C" {
// Vocab
//
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
// Identify if Token Id is a control token or a render-able token
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
// infill tokens
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
// CLS is equivalent to BOS
DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
"use llama_vocab_bos instead");
//
// Tokenization
@@ -958,7 +1005,7 @@ extern "C" {
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize(
const struct llama_model * model,
const struct llama_vocab * vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
@@ -972,7 +1019,7 @@ extern "C" {
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model,
const struct llama_vocab * vocab,
llama_token token,
char * buf,
int32_t length,
@@ -986,7 +1033,7 @@ extern "C" {
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
const struct llama_model * model,
const struct llama_vocab * vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
@@ -1000,7 +1047,7 @@ extern "C" {
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the models default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
@@ -1009,7 +1056,6 @@ extern "C" {
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
@@ -1057,7 +1103,6 @@ extern "C" {
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
//
typedef void * llama_sampler_context_t;
@@ -1076,11 +1121,12 @@ extern "C" {
};
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
// mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1110,7 +1156,7 @@ extern "C" {
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1118,7 +1164,7 @@ extern "C" {
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
/// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1133,6 +1179,9 @@ extern "C" {
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
/// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1157,10 +1206,22 @@ extern "C" {
float eta);
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
const struct llama_model * model,
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root);
/// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
/// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
/// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens);
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
@@ -1169,8 +1230,9 @@ extern "C" {
float penalty_present); // 0.0 = disabled
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
const struct llama_model * model,
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
const struct llama_vocab * vocab,
int32_t n_ctx_train,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
@@ -1204,7 +1266,7 @@ extern "C" {
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

View File

@@ -1,5 +1,7 @@
#include "llama-adapter.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include <algorithm>
@@ -9,7 +11,7 @@
// vec
struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
@@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
return tensors[il];
}
struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
ggml_tensor * layer_dir = tensor_for(il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
return cur;
}
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
bool llama_adapter_cvec::init(const llama_model & model) {
const auto & hparams = model.hparams;
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
GGML_ASSERT(tensors.empty());
GGML_ASSERT(ctxs.empty());
GGML_ASSERT(bufs.empty());
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
}
ctx_map[buft] = ctx;
cvec.ctxs.emplace_back(ctx);
ctxs.emplace_back(ctx);
return ctx;
}
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
};
// make tensors
cvec.tensors.reserve(hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
tensors.reserve(hparams.n_layer);
tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
cvec.tensors.push_back(tensor);
tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
cvec.bufs.reserve(ctx_map.size());
bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.bufs.emplace_back(buf);
bufs.emplace_back(buf);
}
return true;
}
int32_t llama_control_vector_apply(
struct llama_control_vector & cvec,
int32_t llama_adapter_cvec::apply(
const llama_model & model,
const float * data,
size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
layer_start = -1;
layer_end = -1;
return 0;
}
@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
return 1;
}
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
if (tensors.empty()) {
if (!init(model)) {
return 1;
}
}
cvec.layer_start = il_start;
cvec.layer_end = il_end;
layer_start = il_start;
layer_end = il_end;
for (size_t il = 1; il < hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
assert(tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
}
}
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
// lora
llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
const std::string name(w->name);
const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
return nullptr;
}
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
delete adapter;
}
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
};
// bundle lora_a and lora_b into pairs
std::map<std::string, llama_lora_weight> ab_map;
std::map<std::string, llama_adapter_lora_weight> ab_map;
auto str_endswith = [](const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
};
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
if (str_endswith(name, ".lora_a")) {
replace_all(name, ".lora_a", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(cur, nullptr);
ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
} else {
ab_map[name].a = cur;
}
} else if (str_endswith(name, ".lora_b")) {
replace_all(name, ".lora_b", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(nullptr, cur);
ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
@@ -250,25 +251,33 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// add tensors
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
llama_adapter_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");
if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
}
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
const auto * model_tensor = model.get_tensor(name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}
// save tensor to adapter
@@ -276,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
ggml_set_name(tensor_a, w.a->name);
ggml_set_name(tensor_b, w.b->name);
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
}
// allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
struct llama_lora_adapter * adapter = new llama_lora_adapter();
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
struct llama_adapter_lora * adapter = new llama_adapter_lora();
try {
llama_lora_adapter_init_impl(*model, path_lora, *adapter);
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
return nullptr;
}
void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
delete adapter;
}

View File

@@ -1,66 +1,74 @@
#pragma once
#include "llama-impl.h"
#include "llama-hparams.h"
#include "llama.h"
#include "ggml-cpp.h"
#include <string>
#include <unordered_map>
#include <vector>
// TODO: pimpl
//
// llama_adapter_cvec
//
// TODO: rename to llama_adapter_cvec
struct llama_control_vector {
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
struct llama_adapter_cvec {
struct ggml_tensor * tensor_for(int il) const;
std::vector<struct ggml_tensor *> tensors; // per layer
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
int32_t apply(
const llama_model & model,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);
private:
bool init(const llama_model & model);
int32_t layer_start = -1;
int32_t layer_end = -1;
struct ggml_tensor * tensor_for(int il) const;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
std::vector<struct ggml_tensor *> tensors; // per layer
};
int32_t llama_control_vector_apply(
struct llama_control_vector & cvec,
const llama_model & model,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);
//
// llama_adapter_lora
//
// TODO: rename to llama_adapter_lora_weight
struct llama_lora_weight {
struct llama_adapter_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) const {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}
llama_adapter_lora_weight() = default;
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
// TODO: rename to llama_adapter_lora
struct llama_lora_adapter {
struct llama_adapter_lora {
// map tensor name to lora_a_b
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
float alpha;
llama_lora_adapter() = default;
~llama_lora_adapter() = default;
llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_lora_weight * get_weight(struct ggml_tensor * w);
llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
};

View File

@@ -28,6 +28,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
@@ -57,6 +58,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -107,25 +109,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -179,6 +182,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -622,6 +627,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_PHIMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_PLAMO,
{
@@ -1036,6 +1062,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -1182,6 +1211,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
{ LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
{ LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
@@ -1199,6 +1229,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
},
},
{
LLM_ARCH_RWKV6QWEN2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
{ LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
{ LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
{ LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GRANITE,
{
@@ -1253,6 +1309,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
{
LLM_ARCH_SOLAR,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
},
},
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1278,24 +1352,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
{
LLM_ARCH_SOLAR,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@@ -1399,6 +1455,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1455,10 +1512,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
std::string LLM_KV::operator()(llm_kv kv) const {
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
: ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
}
std::string LLM_TN_IMPL::str() const {

View File

@@ -32,6 +32,7 @@ enum llm_arch {
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
@@ -61,6 +62,7 @@ enum llm_arch {
LLM_ARCH_NEMOTRON,
LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6,
LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -111,6 +113,7 @@ enum llm_kv {
LLM_KV_TIME_DECAY_EXTRA_DIM,
LLM_KV_RESIDUAL_SCALE,
LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -177,6 +180,8 @@ enum llm_kv {
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -256,6 +261,7 @@ enum llm_tensor {
LLM_TENSOR_TIME_MIX_LERP_V,
LLM_TENSOR_TIME_MIX_LERP_R,
LLM_TENSOR_TIME_MIX_LERP_G,
LLM_TENSOR_TIME_MIX_LERP_FUSED,
LLM_TENSOR_TIME_MIX_FIRST,
LLM_TENSOR_TIME_MIX_DECAY,
LLM_TENSOR_TIME_MIX_DECAY_W1,
@@ -343,9 +349,10 @@ enum llm_tensor_layer {
};
struct LLM_KV {
LLM_KV(llm_arch arch);
LLM_KV(llm_arch arch, const char * suffix = nullptr);
llm_arch arch;
const char * suffix;
std::string operator()(llm_kv kv) const;
};

View File

@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
@@ -50,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -73,7 +75,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl_contains("<|im_start|>")) {
return LLM_CHAT_TEMPLATE_CHATML;
return tmpl_contains("<|im_sep|>")
? LLM_CHAT_TEMPLATE_PHI_4
: LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -112,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return LLM_CHAT_TEMPLATE_FALCON_3;
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,7 +153,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_MINICPM;
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
} else if (tmpl_contains(LU8("'<Assistant>' + message['content'] + '<end▁of▁sentence>'"))) {
} else if (tmpl_contains(LU8("<Assistant>")) && tmpl_contains(LU8("<User>")) && tmpl_contains(LU8("<end▁of▁sentence>"))) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -269,6 +273,14 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
}
if (add_ass) {
ss << "<|im_start|>assistant<|im_sep|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
// Falcon 3
for (auto message : chat) {
@@ -429,6 +441,14 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {

View File

@@ -15,6 +15,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_PHI_4,
LLM_CHAT_TEMPLATE_FALCON_3,
LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH,
@@ -30,6 +31,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
LLM_CHAT_TEMPLATE_CHATGML_4,
LLM_CHAT_TEMPLATE_GLMEDGE,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_RWKV_WORLD,

View File

@@ -1,5 +1,8 @@
#include "llama-context.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include <cassert>
#include <cmath>
#include <cstring>
@@ -513,7 +516,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
auto * output_dev = lctx.model.dev_output.dev;
auto * output_dev = lctx.model.dev_output();
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) {
buft = output_dev_host_buft;

View File

@@ -22,12 +22,12 @@ struct llama_context {
const struct llama_model & model;
struct llama_cparams cparams;
struct llama_sbatch sbatch; // TODO: revisit if needed
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;
struct llama_cparams cparams;
struct llama_sbatch sbatch; // TODO: revisit if needed
struct llama_kv_cache kv_self;
struct llama_adapter_cvec cvec;
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
std::unordered_map<struct llama_adapter_lora *, float> lora;
std::vector<ggml_backend_ptr> backends;
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

View File

@@ -345,194 +345,194 @@ const char * llama_grammar_parser::parse_sequence(
size_t last_sym_start = rule.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == rule.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
if (last_sym_start == rule.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
if (min_times == 0) {
rule.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
llama_grammar_rule rec_rule(prev_rule);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(prev_rule.size());
uint32_t rec_rule_id = generate_symbol_id( rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule( rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = rule.size();
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = rule.size();
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < rule.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
rule.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = rule.size();
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(rule_name);
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
last_sym_start = rule.size();
// output reference to synthesized rule
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char
last_sym_start = rule.size();
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
if (min_times == 0) {
rule.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
}
}
return pos;
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
llama_grammar_rule rec_rule(prev_rule);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(prev_rule.size());
uint32_t rec_rule_id = generate_symbol_id( rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule( rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = rule.size();
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = rule.size();
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < rule.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
rule.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = rule.size();
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(rule_name);
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
last_sym_start = rule.size();
// output reference to synthesized rule
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char
last_sym_start = rule.size();
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
}
return pos;
}
const char * llama_grammar_parser::parse_rule(const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(src, name_len);
const std::string name(src, name_len);
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
bool llama_grammar_parser::parse(const char * src) {
try {
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
}
}
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
rules.clear();
return false;
}
@@ -960,10 +960,28 @@ struct llama_grammar * llama_grammar_init_impl(
// Important: vec_rules has to be moved here, not copied, because stacks contains
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
return new llama_grammar {
vocab,
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
/* .lazy =*/ false,
/* .awaiting_trigger = */ false,
/* .trigger_buffer = */ "",
/* .trigger_tokens = */ {},
/* .trigger_words = */ {},
};
}
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
bool lazy,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens) {
llama_grammar_parser parser;
// if there is a grammar, parse it
@@ -1035,10 +1053,31 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
}
} while (true);
std::vector<llama_token> vec_trigger_tokens;
std::vector<std::string> vec_trigger_words;
for (size_t i = 0; i < num_trigger_tokens; i++) {
GGML_ASSERT(trigger_tokens != nullptr);
vec_trigger_tokens.push_back(trigger_tokens[i]);
}
for (size_t i = 0; i < num_trigger_words; i++) {
GGML_ASSERT(trigger_words != nullptr);
vec_trigger_words.push_back(trigger_words[i]);
}
// Important: vec_rules has to be moved here, not copied, because stacks contains
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
return new llama_grammar {
vocab,
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
/* .lazy = */ lazy,
/* .awaiting_trigger = */ lazy,
/* .trigger_buffer = */ "",
std::move(vec_trigger_tokens),
std::move(vec_trigger_words),
};
}
void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1055,6 +1094,11 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
grammar.lazy,
grammar.awaiting_trigger,
grammar.trigger_buffer,
grammar.trigger_tokens,
grammar.trigger_words,
};
// redirect elements in stacks to point to new rules
@@ -1076,6 +1120,10 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
GGML_ASSERT(grammar.vocab != nullptr);
if (grammar.awaiting_trigger) {
return;
}
bool allow_eog = false;
for (const auto & stack : grammar.stacks) {
if (stack.empty()) {
@@ -1092,9 +1140,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
const std::string & piece = grammar.vocab->token_to_piece(id);
if (llama_token_is_eog_impl(*grammar.vocab, id)) {
if (grammar.vocab->is_eog(id)) {
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1115,7 +1163,35 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
GGML_ASSERT(grammar.vocab != nullptr);
if (llama_token_is_eog_impl(*grammar.vocab, token)) {
const auto & piece = grammar.vocab->token_to_piece(token);
if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
grammar.awaiting_trigger = false;
grammar.trigger_buffer.clear();
llama_grammar_accept_str(grammar, piece);
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
return;
} else {
// TODO: consider a smarter incremental substring search algorithm (store last position to search from).
grammar.trigger_buffer += piece;
for (const auto & word : grammar.trigger_words) {
auto pos = grammar.trigger_buffer.find(word);
if (pos != std::string::npos) {
grammar.awaiting_trigger = false;
auto constrained_str = grammar.trigger_buffer.substr(pos);
grammar.trigger_buffer.clear();
llama_grammar_accept_str(grammar, constrained_str);
LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
return;
}
}
LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
return;
}
}
if (grammar.vocab->is_eog(token)) {
for (const auto & stack : grammar.stacks) {
if (stack.empty()) {
return;
@@ -1124,8 +1200,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
GGML_ABORT("fatal error");
}
const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
llama_grammar_accept_str(grammar, piece);
}
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
const auto & code_points = decoded.first;
@@ -1135,5 +1213,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
}
grammar.partial_utf8 = decoded.second;
GGML_ASSERT(!grammar.stacks.empty());
if (grammar.stacks.empty()) {
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
}
}

View File

@@ -114,6 +114,15 @@ struct llama_grammar {
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
// lazy grammars wait for trigger words or tokens before constraining the sampling.
// we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
// (useful e.g. for tool_choice=required)
bool lazy = false;
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
std::vector<std::string> trigger_words;
};
//
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
size_t n_rules,
size_t start_rule_index);
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
bool lazy,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens);
void llama_grammar_free_impl(struct llama_grammar * grammar);
@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
void llama_grammar_accept_impl(
struct llama_grammar & grammar,
llama_token token);
void llama_grammar_accept_str(
struct llama_grammar & grammar,
const std::string & piece);

View File

@@ -54,7 +54,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
uint32_t llama_hparams::n_embd_k_s() const {
if (wkv_head_size != 0) {
// for RWKV models
return 2 * n_embd;
return token_shift_count * n_embd;
}
// TODO: maybe support other convolution strides than 1
@@ -82,4 +82,4 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
bool llama_hparams::cross_attention_layers(uint32_t il) const {
return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
}
}

View File

@@ -30,7 +30,6 @@ struct llama_hparams {
bool use_par_res;
bool swin_norm;
uint32_t n_vocab = 0;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_embd_features = 0;
@@ -41,8 +40,8 @@ struct llama_hparams {
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_vocab_type = 0; // for BERT-style token types
uint32_t n_rel_attn_bkts = 0;
uint32_t n_vocab = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -79,6 +78,7 @@ struct llama_hparams {
uint32_t time_mix_extra_dim = 0;
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
uint32_t token_shift_count = 2;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
@@ -141,7 +141,7 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
// cross attention layers
// cross attention layers
bool cross_attention_layers(uint32_t il) const;
};

View File

@@ -1,5 +1,6 @@
#include "llama-impl.h"
#include "gguf.h"
#include "llama.h"
#include <cinttypes>
@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {

View File

@@ -6,13 +6,13 @@
#include <vector>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# if defined(__MINGW32__) && !defined(__clang__)
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
# define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//

View File

@@ -72,39 +72,6 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < n_layer; i++) {
// for cross attention layers
if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const llama_model::buft_list_t * buft_list;
if (offload) {
buft_list = model.dev_layer.at(i).buft_list;
} else {
buft_list = &model.cpu_buft_list;
}
ggml_backend_buffer_type_t buft = select_buft(*buft_list,
[&](ggml_context * ctx) {
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
return k;
}
ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
});
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
return false;
}
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
continue;
}
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -112,7 +79,7 @@ bool llama_kv_cache_init(
ggml_backend_buffer_type_t buft;
if (offload) {
auto * dev = model.dev_layer.at(i).dev;
auto * dev = model.dev_layer(i);
buft = ggml_backend_dev_buffer_type(dev);
} else {
buft = ggml_backend_cpu_buffer_type();
@@ -124,8 +91,17 @@ bool llama_kv_cache_init(
return false;
}
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
ggml_tensor * k, *v;
// for cross attention layers
if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
} else {
k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
}
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
@@ -152,10 +128,10 @@ bool llama_kv_cache_init(
struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
const struct llama_ubatch & batch) {
const uint32_t n_tokens = batch.n_tokens;
const uint32_t n_seqs = batch.n_seqs;
const uint32_t n_seq_tokens = batch.n_seq_tokens;
const struct llama_ubatch & ubatch) {
const uint32_t n_tokens = ubatch.n_tokens;
const uint32_t n_seqs = ubatch.n_seqs;
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
if (cache.recurrent) {
// For recurrent state architectures (like Mamba or RWKV),
@@ -163,16 +139,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// A slot should be always be contiguous.
// can only process batches with an equal number of new tokens in each sequence
GGML_ASSERT(batch.equal_seqs);
GGML_ASSERT(ubatch.equal_seqs);
int32_t min = cache.size - 1;
int32_t max = 0;
// everything should fit if all seq_ids are smaller than the max
for (uint32_t s = 0; s < n_seqs; ++s) {
const uint32_t n_seq_id = batch.n_seq_id[s];
const uint32_t n_seq_id = ubatch.n_seq_id[s];
for (uint32_t j = 0; j < n_seq_id; ++j) {
const llama_seq_id seq_id = batch.seq_id[s][j];
const llama_seq_id seq_id = ubatch.seq_id[s][j];
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
// too big seq_id
@@ -231,7 +207,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// find usable cell range
for (uint32_t s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = batch.seq_id[s][0];
const llama_seq_id seq_id = ubatch.seq_id[s][0];
llama_kv_cell & seq_meta = cache.cells[seq_id];
bool has_cell = false;
if (seq_meta.tail >= 0) {
@@ -270,7 +246,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// gather and re-order
for (uint32_t s = 0; s < n_seqs; ++s) {
int32_t dst_id = s + min;
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
if (dst_id != src_id) {
llama_kv_cell & dst_cell = cache.cells[dst_id];
llama_kv_cell & src_cell = cache.cells[src_id];
@@ -291,7 +267,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// update the pos of the used seqs
for (uint32_t s = 0; s < n_seqs; ++s) {
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
int32_t cell_id = s + min;
llama_kv_cell & cell = cache.cells[cell_id];
@@ -299,12 +275,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// What should happen when the pos backtracks or skips a value?
// Clearing the state mid-batch would require special-casing which isn't done.
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
__func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
}
cell.pos = last_pos;
cell.seq_id.clear();
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
const llama_seq_id seq_id = batch.seq_id[s][j];
for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
const llama_seq_id seq_id = ubatch.seq_id[s][j];
cell.seq_id.insert(seq_id);
cache.cells[seq_id].tail = cell_id;
}
@@ -358,10 +334,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
for (uint32_t s = 0; s < n_seqs; s++) {
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
uint32_t k = s*n_seq_tokens + i;
cache.cells[cache.head + k].pos = batch.pos[k];
cache.cells[cache.head + k].pos = ubatch.pos[k];
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
}
}
}

View File

@@ -37,7 +37,7 @@ struct llama_kv_cache {
bool can_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// for a free KV slot. llama_decode_impl also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;

View File

@@ -7,6 +7,7 @@
#include <cstring>
#include <climits>
#include <stdexcept>
#include <cerrno>
#ifdef __has_include
#if __has_include(<unistd.h>)
@@ -35,7 +36,7 @@
// TODO: consider moving to llama-impl.h if needed in more places
#if defined(_WIN32)
std::string llama_format_win_err(DWORD err) {
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
@@ -241,12 +242,16 @@ llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }
int llama_file::fileno() const {
int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
#else
#if defined(fileno)
return fileno(pimpl->fp);
#else
return ::fileno(pimpl->fp);
#endif
#endif
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
@@ -265,7 +270,7 @@ struct llama_mmap::impl {
impl(struct llama_file * file, size_t prefetch, bool numa) {
size = file->size();
int fd = file->fileno();
int fd = file->file_id();
int flags = MAP_SHARED;
if (numa) { prefetch = 0; }
#ifdef __linux__
@@ -357,7 +362,7 @@ struct llama_mmap::impl {
size = file->size();
HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);

View File

@@ -1,5 +1,6 @@
#pragma once
#include <cstdint>
#include <memory>
#include <vector>
@@ -18,7 +19,7 @@ struct llama_file {
size_t tell() const;
size_t size() const;
int fileno() const;
int file_id() const; // fileno overload
void seek(size_t offset, int whence) const;

View File

@@ -7,6 +7,10 @@
#include <cstring>
#include <future>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;
const char * llama_file_version_name(llama_fver version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,8 +21,78 @@ const char * llama_file_version_name(llama_fver version) {
return "unknown";
}
static std::string llama_model_ftype_name(llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
}
switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
default: return "unknown, may not work";
}
}
// return a list of splits for a given path
// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
std::vector<std::string> paths;
std::string split_prefix;
std::vector<char> buf(llama_path_max(), 0);
{
int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
if (!ret) {
throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
}
split_prefix = std::string(buf.data(), ret);
}
if (split_prefix.empty()) {
throw std::runtime_error(format("invalid split file: %s", path.c_str()));
}
for (int idx = 0; idx < n_split; ++idx) {
int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
paths.push_back(std::string(buf.data(), ret));
}
return paths;
}
namespace GGUFMeta {
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
struct GKV_Base_Type {
static constexpr gguf_type gt = gt_;
@@ -60,10 +134,11 @@ namespace GGUFMeta {
public:
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
static ArrayInfo getter(const gguf_context *ctx, const int k) {
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
return ArrayInfo {
gguf_get_arr_type(ctx, k),
arr_type,
size_t(gguf_get_arr_n(ctx, k)),
gguf_get_arr_data(ctx, k),
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
};
}
};
@@ -368,7 +443,12 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
@@ -380,6 +460,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
}
}
// Load the main GGUF
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
@@ -415,35 +496,54 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
// Load additional GGML contexts
if (n_split > 1) {
// make sure the main file is loaded first
uint16_t idx = 0;
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
get_key(kv_split_no, idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
}
std::vector<char> split_prefix(llama_path_max(), 0);
if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) {
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
// generate list of splits if needed
if (splits.empty()) {
splits = llama_get_list_splits(fname, idx, n_split);
}
// in case user give a custom list of splits, check if it matches the expected number
if (n_split != (uint16_t)splits.size()) {
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
}
if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}
std::vector<char> split_path(llama_path_max(), 0);
// load other splits
for (idx = 1; idx < n_split; idx++) {
llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split);
const char * fname_split = splits[idx].c_str();
struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) };
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data()));
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
}
files.emplace_back(new llama_file(split_path.data(), "rb"));
// check idx
{
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
if (kid < 0) {
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
}
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
if (idx_gguf != idx) {
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
}
}
files.emplace_back(new llama_file(fname_split, "rb"));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
@@ -556,7 +656,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(meta.get(), i);
@@ -722,7 +822,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
for (const auto & file : files) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
mmaps_used.emplace_back(mapping->size(), 0);
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -1011,3 +1111,17 @@ bool llama_model_loader::load_all_data(
return true;
}
std::string llama_model_loader::ftype_name() const {
return llama_model_ftype_name(ftype);
}
void llama_model_loader::print_info() const {
LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
if (n_bytes < GiB) {
LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
} else {
LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
}
}

View File

@@ -90,7 +90,12 @@ struct llama_model_loader {
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool check_tensors,
const struct llama_model_kv_override * param_overrides_p);
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
@@ -155,4 +160,8 @@ struct llama_model_loader {
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data);
std::string ftype_name() const;
void print_info() const;
};

File diff suppressed because it is too large Load Diff

View File

@@ -4,81 +4,83 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include <stdexcept>
struct llama_model_loader;
// available models
// TODO: this enum does not follow the enum naming convention
enum llm_type {
MODEL_UNKNOWN,
MODEL_14M,
MODEL_17M,
MODEL_22M,
MODEL_33M,
MODEL_60M,
MODEL_70M,
MODEL_80M,
MODEL_109M,
MODEL_137M,
MODEL_160M,
MODEL_220M,
MODEL_250M,
MODEL_270M,
MODEL_335M,
MODEL_410M,
MODEL_450M,
MODEL_770M,
MODEL_780M,
MODEL_0_5B,
MODEL_1B,
MODEL_1_3B,
MODEL_1_4B,
MODEL_1_5B,
MODEL_1_6B,
MODEL_2B,
MODEL_2_8B,
MODEL_3B,
MODEL_4B,
MODEL_6B,
MODEL_6_9B,
MODEL_7B,
MODEL_8B,
MODEL_9B,
MODEL_11B,
MODEL_12B,
MODEL_13B,
MODEL_14B,
MODEL_15B,
MODEL_16B,
MODEL_20B,
MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
MODEL_35B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
MODEL_XL,
MODEL_A1_7B,
MODEL_A2_7B,
MODEL_8x7B,
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
MODEL_57B_A14B,
MODEL_27B,
LLM_TYPE_UNKNOWN,
LLM_TYPE_14M,
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_160M,
LLM_TYPE_220M,
LLM_TYPE_250M,
LLM_TYPE_270M,
LLM_TYPE_335M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
LLM_TYPE_3B,
LLM_TYPE_4B,
LLM_TYPE_6B,
LLM_TYPE_6_9B,
LLM_TYPE_7B,
LLM_TYPE_8B,
LLM_TYPE_9B,
LLM_TYPE_11B,
LLM_TYPE_12B,
LLM_TYPE_13B,
LLM_TYPE_14B,
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
LLM_TYPE_35B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
LLM_TYPE_MEDIUM,
LLM_TYPE_LARGE,
LLM_TYPE_XL,
LLM_TYPE_A1_7B,
LLM_TYPE_A2_7B,
LLM_TYPE_8x7B,
LLM_TYPE_8x22B,
LLM_TYPE_16x12B,
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
};
struct llama_layer_posnet {
@@ -243,15 +245,19 @@ struct llama_layer {
struct ggml_tensor * time_mix_lerp_v = nullptr;
struct ggml_tensor * time_mix_lerp_r = nullptr;
struct ggml_tensor * time_mix_lerp_g = nullptr;
struct ggml_tensor * time_mix_lerp_fused = nullptr;
struct ggml_tensor * time_mix_first = nullptr;
struct ggml_tensor * time_mix_decay = nullptr;
struct ggml_tensor * time_mix_decay_w1 = nullptr;
struct ggml_tensor * time_mix_decay_w2 = nullptr;
struct ggml_tensor * time_mix_key = nullptr;
struct ggml_tensor * time_mix_value = nullptr;
struct ggml_tensor * time_mix_receptance = nullptr;
struct ggml_tensor * time_mix_gate = nullptr;
struct ggml_tensor * time_mix_first = nullptr;
struct ggml_tensor * time_mix_decay = nullptr;
struct ggml_tensor * time_mix_decay_w1 = nullptr;
struct ggml_tensor * time_mix_decay_w2 = nullptr;
struct ggml_tensor * time_mix_key = nullptr;
struct ggml_tensor * time_mix_key_b = nullptr;
struct ggml_tensor * time_mix_value = nullptr;
struct ggml_tensor * time_mix_value_b = nullptr;
struct ggml_tensor * time_mix_receptance = nullptr;
struct ggml_tensor * time_mix_receptance_b = nullptr;
struct ggml_tensor * time_mix_gate = nullptr;
struct ggml_tensor * time_mix_ln = nullptr;
struct ggml_tensor * time_mix_ln_b = nullptr;
@@ -280,7 +286,7 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
// cross attention
// cross attention
struct ggml_tensor * cross_attn_k_norm = nullptr;
struct ggml_tensor * cross_attn_k_proj = nullptr;
struct ggml_tensor * cross_attn_o_proj = nullptr;
@@ -296,11 +302,9 @@ struct llama_layer {
};
struct llama_model {
llm_type type = MODEL_UNKNOWN;
llm_type type = LLM_TYPE_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
@@ -329,117 +333,53 @@ struct llama_model {
std::vector<llama_layer> layers;
llama_model_params params;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
llama_split_mode split_mode;
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
struct layer_dev {
ggml_backend_dev_t dev;
buft_list_t * buft_list;
};
layer_dev dev_input = {};
layer_dev dev_output = {};
std::vector<layer_dev> dev_layer;
// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
// total number of parameters in the model
uint64_t n_elements = 0;
explicit llama_model(const struct llama_model_params & params);
~llama_model();
// total size of all the tensors in the model in bytes
size_t n_bytes = 0;
void load_stats (llama_model_loader & ml);
void load_arch (llama_model_loader & ml);
void load_hparams(llama_model_loader & ml);
void load_vocab (llama_model_loader & ml);
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
std::string arch_name() const;
std::string type_name() const;
std::string desc() const;
size_t size() const;
size_t max_nodes() const;
size_t n_devices() const;
// total number of parameters in the model
uint64_t n_elements() const;
void print_info() const;
ggml_backend_dev_t dev_layer(int il) const;
ggml_backend_dev_t dev_output() const;
ggml_backend_buffer_type_t select_buft(int il) const;
const struct ggml_tensor * get_tensor(const char * name) const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
const char * llm_type_name(llm_type type);
std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
template<typename F>
bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx { ggml_init(params) };
if (!ctx) {
throw std::runtime_error("failed to create ggml context");
}
ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
ggml_tensor * op_tensor = fn(ctx.get());
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op_tensor->src[i] != nullptr) {
op_tensor->src[i]->buffer = buf.get();
}
}
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
return op_supported;
}
template<typename F>
ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
if (buft_supported(cur_buft, cur_dev, fn)) {
return cur_buft;
}
}
throw std::runtime_error("no suitable buffer type found");
}
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
// used by llama_adapter_lora
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
size_t llama_model_max_nodes(const llama_model & model);
struct llama_model_loader;
// TODO: become llama_model methods
void llm_load_stats (llama_model_loader & ml, llama_model & model);
void llm_load_arch (llama_model_loader & ml, llama_model & model);
void llm_load_hparams (llama_model_loader & ml, llama_model & model);
void llm_load_vocab (llama_model_loader & ml, llama_model & model);
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);

View File

@@ -7,14 +7,12 @@
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>
// TODO: replace with ggml API call
#define QK_K 256
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
@@ -22,7 +20,7 @@ static void zeros(std::ofstream & file, size_t n) {
}
}
struct quantize_state_internal {
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
@@ -43,13 +41,13 @@ struct quantize_state_internal {
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
static void llama_tensor_dequantize_internal(
static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
@@ -121,7 +119,7 @@ static void llama_tensor_dequantize_internal(
workers.clear();
}
static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
const int64_t nx = tensor->ne[0];
const int64_t qk_k = ggml_blck_size(new_type);
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
if (qs.model.type == MODEL_70B) {
if (qs.model.type == LLM_TYPE_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
new_type == GGML_TYPE_IQ1_M) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
{
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
@@ -410,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
return new_type;
}
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +463,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
return new_size;
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
@@ -526,18 +525,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
ml.init_mappings(false); // no prefetching
llama_model model;
llm_load_arch (ml, model);
llm_load_hparams(ml, model);
llm_load_stats (ml, model);
llama_model model(llama_model_default_params());
struct quantize_state_internal qs(model, params);
model.load_arch (ml);
model.load_hparams(ml);
model.load_stats (ml);
struct quantize_state_impl qs(model, params);
if (params->only_copy) {
ftype = model.ftype;
ftype = ml.ftype;
}
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
@@ -621,7 +623,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks
// sanity checks for models that have attention layers
if (qs.n_attention_wv != 0)
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
@@ -761,6 +764,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -839,7 +843,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
@@ -868,7 +872,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
@@ -877,7 +881,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// update the gguf meta data as we go
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
// write tensor data + padding
fout.write((const char *) new_data, new_size);
@@ -921,7 +926,7 @@ uint32_t llama_model_quantize(
const char * fname_out,
const llama_model_quantize_params * params) {
try {
llama_model_quantize_internal(fname_inp, fname_out, params);
llama_model_quantize_impl(fname_inp, fname_out, params);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
return 1;

View File

@@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets-1, ib));
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
@@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets-1; j > ib; --j) {
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
// llama_sampler API
struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
return new llama_sampler {
/* .iface = */ iface,
/* .ctx = */ ctx,
};
}
const char * llama_sampler_name(const struct llama_sampler * smpl) {
if (!smpl->iface) {
return "(null)";
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
}
if (smpl->ctx == nullptr) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ smpl->iface,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
GGML_ABORT("the sampler does not support cloning");
@@ -371,7 +378,10 @@ void llama_sampler_free(struct llama_sampler * smpl) {
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
// TODO: do not allocate each time
std::vector<llama_token_data> cur;
@@ -469,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
};
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_chain_i,
/* .ctx = */ new llama_sampler_chain {
/* .params = */ params,
/* .samplers = */ {},
/* .t_sample_us = */ 0,
/* .n_sample = */ 0,
},
};
}
);
}
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -543,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
};
struct llama_sampler * llama_sampler_init_greedy() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_greedy_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// dist
@@ -605,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// softmax
@@ -635,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
};
struct llama_sampler * llama_sampler_init_softmax() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_softmax_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// top-k
@@ -675,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
};
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_k_i,
/* .ctx = */ new llama_sampler_top_k {
/* .k = */ k,
},
};
}
);
}
// top-p
@@ -741,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
};
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_p_i,
/* .ctx = */ new llama_sampler_top_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// min-p
@@ -837,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
};
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_min_p_i,
/* .ctx = */ new llama_sampler_min_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// typical
@@ -936,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
};
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_typical_i,
/* .ctx = */ new llama_sampler_typical {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// temp
@@ -980,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
};
struct llama_sampler * llama_sampler_init_temp(float temp) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_i,
/* .ctx = */ new llama_sampler_temp {
/*.temp = */ temp,
},
};
}
);
}
// temp-ext
@@ -1090,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
};
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_ext_i,
/* .ctx = */ new llama_sampler_temp_ext {
/* .temp = */ temp,
/* .delta = */ delta,
/* .exponent = */ exponent,
},
};
}
);
}
// xtc
@@ -1182,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
@@ -1191,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat
@@ -1289,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab,
@@ -1300,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat v2
@@ -1388,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed,
@@ -1397,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// grammar
@@ -1430,13 +1440,30 @@ static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token
}
}
// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
static struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
bool lazy,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens);
static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_grammar *) smpl->ctx;
if (!ctx->grammar) {
return;
}
auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
std::vector<const char *> trigger_words;
for (auto & word : ctx->grammar->trigger_words) {
trigger_words.push_back(word.c_str());
}
auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
llama_grammar_free_impl(ctx->grammar);
ctx->grammar = grammar_new;
@@ -1445,7 +1472,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
// copy the state
{
@@ -1481,29 +1508,55 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
/* .free = */ llama_sampler_grammar_free,
};
struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
static struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
bool lazy,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens) {
auto * ctx = new llama_sampler_grammar;
if (grammar_str != nullptr && grammar_str[0] != '\0') {
*ctx = {
/* .vocab = */ &vocab,
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root),
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
};
} else {
*ctx = {
/* .vocab = */ &vocab,
/* .vocab = */ vocab,
/* .grammar_str = */ {},
/* .grammar_root = */ {},
/* .grammar = */ nullptr,
};
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_grammar_i,
/* .ctx = */ ctx,
};
/* .ctx = */ ctx
);
}
struct llama_sampler * llama_sampler_init_grammar(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root) {
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
}
struct llama_sampler * llama_sampler_init_grammar_lazy(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
const char ** trigger_words,
size_t num_trigger_words,
const llama_token * trigger_tokens,
size_t num_trigger_tokens) {
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
}
// penalties
@@ -1632,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
float penalty_present) {
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
/* .penalty_last_n = */ penalty_last_n,
@@ -1641,8 +1694,75 @@ struct llama_sampler * llama_sampler_init_penalties(
/* .penalty_present = */ penalty_present,
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
/* .token_count = */ {},
},
};
}
);
}
// top-n-sigma
struct llama_sampler_top_n_sigma {
const float n;
};
static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
return "top-n-sigma";
}
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
// find max logit and calculate mean
float max = cur_p->data[0].logit;
float logits_sum = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].logit > max) {
max = cur_p->data[i].logit;
}
logits_sum += cur_p->data[i].logit;
}
float mean = logits_sum/cur_p->size;
// calculate standard deviation
float acc = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
acc += pow(cur_p->data[i].logit - mean, 2);
}
float std = sqrt(acc/cur_p->size);
//apply mask
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].logit < max - (ctx->n * std)) {
cur_p->data[i].logit = -INFINITY;
}
}
llama_sampler_softmax_impl(cur_p);
}
static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
return llama_sampler_init_top_n_sigma(ctx->n);
}
static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
delete (llama_sampler_top_n_sigma *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
/* .name = */ llama_sampler_top_n_sigma_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_top_n_sigma_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_top_n_sigma_clone,
/* .free = */ llama_sampler_top_n_sigma_free,
};
struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_n_sigma_i,
/* .ctx = */ new llama_sampler_top_n_sigma {
/* .n = */ n,
}
);
}
// DRY
@@ -1663,8 +1783,8 @@ struct llama_sampler_dry {
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
std::string word = llama_detokenize(vocab, {token_id}, true);
for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
std::string word = vocab.detokenize({token_id}, true);
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
@@ -1681,7 +1801,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
}
}
if (match) {
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
tokenization.resize(max_tail_len);
}
@@ -1832,7 +1952,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
if (n > 0) {
lt = k;
rt = k+n-1;
rt = k + n - 1;
}
} else {
// If k is inside the current Z-box, consider two cases.
@@ -1937,7 +2057,7 @@ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler
llama_vocab dummy_vocab;
// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
// Copy the state, including the processed breakers
{
@@ -1964,7 +2084,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
/* .free = */ llama_sampler_dry_free,
};
struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
const int MAX_CHAR_LEN = 40;
@@ -1991,11 +2111,11 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
sequence_break.resize(MAX_CHAR_LEN);
}
get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
}
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size,
@@ -2007,14 +2127,14 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
/* .dry_max_token_repeat = */ {},
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
},
};
}
);
}
// wrapper for test-sampling.cpp
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
llama_vocab dummy_vocab;
auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
auto * ctx = (llama_sampler_dry *) result->ctx;
// Process the token-based sequence breakers
@@ -2109,14 +2229,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_logit_bias_i,
/* .ctx = */ new llama_sampler_logit_bias {
/* .n_vocab = */ n_vocab,
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
/* .to_search = */ {},
},
};
}
);
}
// infill
@@ -2153,7 +2273,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2295,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2323,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
continue;
}
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
@@ -2248,7 +2368,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
@@ -2269,7 +2389,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].id = ctx->vocab->token_eot();
cur_p->data[0].logit = 1.0f;
return;
@@ -2291,7 +2411,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
@@ -2314,7 +2434,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
return llama_sampler_init_infill_impl(*ctx->vocab);
return llama_sampler_init_infill(ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
@@ -2330,16 +2450,15 @@ static struct llama_sampler_i llama_sampler_infill_i = {
/* .free = */ llama_sampler_infill_free,
};
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab) {
return new llama_sampler {
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
return llama_sampler_init(
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ &vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
/* .vocab = */ vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
}
);
}
// utils

View File

@@ -2,7 +2,9 @@
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
#include "llama-grammar.h"
#include "llama.h"
#include <vector>
struct llama_vocab;
struct llama_grammar;
@@ -21,24 +23,6 @@ struct llama_sampler_chain {
mutable int32_t n_sample;
};
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);
struct llama_sampler * llama_sampler_init_dry_impl(
const struct llama_vocab & vocab,
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,

File diff suppressed because it is too large Load Diff

View File

@@ -4,179 +4,122 @@
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
#include <memory>
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
struct llm_tokenizer;
struct LLM_KV;
struct llama_model_loader;
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
std::string text;
float score;
llama_token_attr attr;
};
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
llama_vocab();
~llama_vocab();
void load(llama_model_loader & ml, const LLM_KV & kv);
enum llama_vocab_type get_type() const;
enum llama_vocab_pre_type get_pre_type() const;
uint32_t n_tokens() const;
uint32_t n_token_types() const;
std::string type_name() const;
bool is_normal (llama_token id) const;
bool is_unknown (llama_token id) const;
bool is_control (llama_token id) const;
bool is_byte (llama_token id) const;
bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
uint8_t token_to_byte(llama_token id) const;
llama_token byte_to_token(uint8_t ch) const;
llama_token text_to_token(const std::string & text) const;
const token_data & get_token_data(llama_token id) const;
const char * token_get_text (llama_token id) const;
float token_get_score(llama_token id) const;
llama_token_attr token_get_attr (llama_token id) const;
llama_token token_bos() const;
llama_token token_eos() const;
llama_token token_eot() const;
llama_token token_eom() const;
llama_token token_unk() const;
llama_token token_sep() const;
llama_token token_nl () const;
llama_token token_pad() const;
llama_token token_prefix() const;
llama_token token_middle() const;
llama_token token_suffix() const;
llama_token token_fim_pre() const;
llama_token token_fim_suf() const;
llama_token token_fim_mid() const;
llama_token token_fim_pad() const;
llama_token token_fim_rep() const;
llama_token token_fim_sep() const;
bool get_add_space_prefix () const;
bool get_add_bos () const;
bool get_add_eos () const;
bool get_ignore_merges () const;
bool get_clean_spaces () const;
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
int max_token_len() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
int32_t tokenize(
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const;
std::vector<llama_token> tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special = false) const;
// does not write null-terminator to buf
int32_t token_to_piece(
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special) const;
// use cached data
const std::string & token_to_piece(llama_token token) const;
int32_t detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const;
std::string detokenize(
const std::vector<llama_token> & tokens,
bool special) const;
void print_info() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);

File diff suppressed because it is too large Load Diff

View File

@@ -12,18 +12,17 @@
#include <algorithm>
#include <cassert>
#include <codecvt>
#include <cstddef>
#include <cstdint>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
#include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
@@ -641,7 +640,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result;
}
@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
const auto cpts = unicode_cpts_from_utf8(text);
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
// ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
std::string text_collapsed;
if (need_collapse) {
// collapse all unicode categories

View File

@@ -14,6 +14,7 @@ package llama
#include "llama.h"
#include "clip.h"
#include "llava.h"
#include "gguf.h"
#include "mllama.h"
#include "sampling_ext.h"
@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
}
func (m *Model) NumVocab() int {
return int(C.llama_n_vocab(m.c))
return int(C.llama_n_vocab(m.Vocab()))
}
func (m *Model) TokenIsEog(token int) bool {
return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
return bool(C.llama_token_is_eog(m.Vocab(), C.llama_token(token)))
}
func (m *Model) AddBOSToken() bool {
return bool(C.llama_add_bos_token(m.c))
return bool(C.llama_add_bos_token(m.Vocab()))
}
func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error {
cLoraPath := C.CString(loraPath)
defer C.free(unsafe.Pointer(cLoraPath))
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
loraAdapter := C.llama_adapter_lora_init(m.c, cLoraPath)
if loraAdapter == nil {
return errors.New("unable to load lora")
}
err := -1
if loraAdapter != nil {
err = int(C.llama_lora_adapter_set(context.c, loraAdapter, C.float(scale)))
err = int(C.llama_set_adapter_lora(context.c, loraAdapter, C.float(scale)))
}
if err != 0 {
return errors.New("error applying lora from file")
@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
return nil
}
func (m *Model) Vocab() *C.struct_llama_vocab {
return C.llama_model_get_vocab(m.c)
}
type Batch struct {
c C.struct_llama_batch
batchSize int
@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string {
tokenLen := 12
buf := make([]byte, tokenLen)
tokenLen = int(C.llama_token_to_piece(
m.c,
m.Vocab(),
C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen),
@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string {
buf = make([]byte, tokenLen)
C.llama_token_to_piece(
m.c,
m.Vocab(),
C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen),
@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
defer C.free(unsafe.Pointer(cText))
result := C.llama_tokenize(
m.c,
m.Vocab(),
cText,
C.int32_t(len(text)),
&cTokens[0],
@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
maxTokens = int(-result)
cTokens = make([]C.llama_token, maxTokens)
result = C.llama_tokenize(
m.c,
m.Vocab(),
cText,
C.int32_t(len(text)),
&cTokens[0],

1
llama/mllama.cpp vendored
View File

@@ -5,6 +5,7 @@
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"

View File

@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
3 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index e2d6c405..a12172dc 100644
index dba7be33..1ca40b2c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 0b06be72..be29e979 100644
index ebb2ccae..b094929b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index a85502ee..cd8ef741 100644
index c550142a..fd9a4e77 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);

View File

@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer
---
src/llama-model.cpp | 14 +++-----------
src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

View File

@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..b9c4a5bf 100644
index 671d2a81..47e79ed4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index ea78ea48..4eb3f6b9 100644
index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..b3c1829f 100644
index 76d4a785..205af1eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
@@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

View File

@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 236 insertions(+), 1 deletion(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
LLM_ARCH_UNKNOWN,
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {
@@ -133,6 +135,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 64: type = LLM_TYPE_22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}
@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index be29e979..aaa79ea4 100644
index b094929b..36165840 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;

View File

@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.cpp | 44 ++++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 19 ++-
src/llama-context.cpp | 28 ++--
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 8 +-
src/llama-hparams.h | 4 +
src/llama-kv-cache.cpp | 33 ++++
src/llama-hparams.cpp | 6 +
src/llama-hparams.h | 5 +
src/llama-kv-cache.cpp | 13 +-
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 59 ++-----
src/llama-model.h | 51 ++++++
src/llama-model.cpp | 65 ++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
17 files changed, 508 insertions(+), 56 deletions(-)
src/llama.cpp | 262 +++++++++++++++++++++++++++++++++-
17 files changed, 452 insertions(+), 22 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 16f30c56..0f0f3f62 100644
index 518aad3f..f0e484a1 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -429,7 +429,7 @@ struct llava_embd_batch {
@@ -445,7 +445,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -441,6 +441,7 @@ struct llava_embd_batch {
@@ -457,6 +457,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 7ddd178b..899d16f2 100644
index 955ed505..95036ef8 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index a0d5ba5d..9f411960 100644
index 47919602..cc948005 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -250,6 +250,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -347,6 +348,7 @@ extern "C" {
@@ -343,6 +344,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -426,6 +428,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
@@ -443,6 +445,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5b376c5e..b35aeb31 100644
index a1e0ebcc..b6f20286 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -121,15 +121,15 @@ index 5b376c5e..b35aeb31 100644
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644
{
LLM_ARCH_DECI,
{
@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index eac7055b..e8235ae0 100644
index 77919578..ec742224 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@
@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -128,6 +129,7 @@ enum llm_kv {
@@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -308,6 +310,14 @@ enum llm_tensor {
@@ -314,6 +316,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b9c4a5bf..9d0e7ca3 100644
index 47e79ed4..7b22fe13 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}
if (ubatch.embd) {
@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644
}
if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab;
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd;
const int32_t n_outputs = ctx.n_outputs;
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());
@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}
- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}
void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
write(&logits_size, sizeof(logits_size));
diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c47..4980a60e 100644
index a9268b29..cf12c9d7 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 450738da..42f8a58f 100644
index f3955de9..0b841028 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@
@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644
}
GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fd898e27..f826cd9a 100644
index 1bdcdfd5..05383046 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -53,6 +53,7 @@ struct llama_hparams {
@@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -347,65 +395,45 @@ index fd898e27..f826cd9a 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -139,6 +140,9 @@ struct llama_hparams {
@@ -138,6 +140,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253..cf814dbe 100644
index feffdf0d..b541c5a3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
@@ -91,8 +91,17 @@ bool llama_kv_cache_init(
return false;
}
for (int i = 0; i < n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue;
+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ } else {
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ }
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 422524a8..b12d6566 100644
index 1252aca1..45d08721 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -240,6 +240,8 @@ namespace GGUFMeta {
@@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}
@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 306c557d..4f9bbf90 100644
index ad1315c6..21819080 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
return llama_model_ftype_name(model.ftype);
}
@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
} break;
@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_11B; break;
+ case 100: model.type = e_model::MODEL_90B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_11B; break;
+ case 100: type = LLM_TYPE_90B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"
#include <vector>
+#include <stdexcept>
// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644
+ } break;
case LLM_ARCH_DECI:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
+ LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size());
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>
struct llama_model_loader;
@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
if (params.vocab_only) {
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -3593,6 +3654,7 @@ struct llm_build_context {
@@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644
}
void free() {
@@ -4074,6 +4136,240 @@ struct llm_build_context {
@@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644
+ }
+
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644
case LLM_ARCH_DECI:
{
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* simple_split */ !lctx.kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -11282,7 +11582,7 @@ static int llama_encode_internal(
@@ -8749,7 +9003,6 @@ static int llama_decode_impl(
const llama_batch & batch = batch_allocr.batch;
const auto & model = lctx.model;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);
const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl(
const int64_t n_embd = hparams.n_embd;
@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,

View File

@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c714fc8c..1bc50fca 100644
index dd0c6a96..8d269a9c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1735,6 +1736,15 @@ extern "C" {
@@ -1743,6 +1744,15 @@ extern "C" {
int p0,
int p1);
@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b7fefb9d..b307d554 100644
index 72325349..2f606d82 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}
@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aaa79ea4..9286f866 100644
index 36165840..1adf08fa 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..39fd4b16 100644
index aba539e8..b4b87409 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cd8ef741..318addec 100644
index fd9a4e77..e4c093f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type {
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8ba43904..204c93e6 100644
index d092a169..f38909d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2bbe5f48..7ffcd907 100644
index 7fc06724..635aa299 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}

View File

@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3fcfcaa3..8f44705a 100644
index a4eee9b8..1ca827eb 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7aca6544..6155da80 100644
index e63bb4ab..9dd53b9a 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644
#include "unicode.h"
#include "unicode-data.h"
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif
return conv.from_bytes(s);

View File

@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dadc18c8..2a8dbd22 100644
index 3ebcc3d9..30c28808 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -391,7 +391,7 @@ class SchemaConverter {
private:
@@ -346,7 +346,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
- std::map<std::string, std::string> _rules;

View File

@@ -1,22 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows
---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;

View File

@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index bac66c24..c95da45d 100644
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -3748,31 +3743,29 @@ struct llm_build_context {
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -10856,7 +10849,7 @@ struct llm_build_context {
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};
@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
@@ -136,21 +136,21 @@ index bac66c24..c95da45d 100644
llm.free();
@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644
cont = false;
continue;
}
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644
}
nf++;
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}
@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644
#if 0
// CPU defrag
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag

View File

@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip
1 file changed, 27 insertions(+), 47 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..86b91d5c 100644
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..135f7df0 100644
index 95036ef8..98d5e14d 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
index 0002ac18..0a8d1092 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index 84101c32..72b488dd 100644
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] try/catch backend load
1 file changed, 23 insertions(+), 22 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 135f7df0..84b21dd8 100644
index 98d5e14d..1c19129a 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,

View File

@@ -1,55 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 9 Feb 2025 17:22:15 -0800
Subject: [PATCH] remove sgemm global variables
removes the 'iq4nlt' global variable in sgemm.cpp that causes
a runtime crash when calling dlopen on ggml-cpu libraries as
its initialization depends on AVX instructions the host machine
may not have
---
ggml/src/ggml-cpu/llamafile/sgemm.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 8fce576c..3f260ce5 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -279,14 +279,6 @@ template <> inline __m256bh load(const float *p) {
}
#endif
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
@@ -613,6 +605,14 @@ class tinyBLAS_Q0_AVX {
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+ const int8_t kvalues_iq4nl[16] = {
+ -127, -104, -83, -65,
+ -49, -35, -22, -10,
+ 1, 13, 25, 38,
+ 53, 69, 89, 113
+ };
+
+ iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}
void matmul(int64_t m, int64_t n) {
@@ -1037,6 +1037,7 @@ class tinyBLAS_Q0_AVX {
const int64_t ldc;
const int ith;
const int nth;
+ __m128i iq4nlt;
};
#endif // __AVX__

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] use std::filesystem::path instead of wstring
1 file changed, 58 insertions(+), 86 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 84b21dd8..e35a6936 100644
index 1c19129a..c854e6bb 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -66,26 +66,6 @@

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] remove amx
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
index 0a8d1092..4564df91 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
@@ -19,6 +19,6 @@ index 72b488dd..50828717 100644
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
else ()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()

View File

@@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 25 Feb 2025 19:14:51 -0800
Subject: [PATCH] fix-clip-compiler-error
---
examples/llava/clip.cpp | 2 +-
examples/llava/clip.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);