mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-22 06:43:57 +00:00
llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
This commit is contained in:
101
llama/llama.cpp/src/llama-adapter.cpp
vendored
101
llama/llama.cpp/src/llama-adapter.cpp
vendored
@@ -1,5 +1,7 @@
|
||||
#include "llama-adapter.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-model.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -9,7 +11,7 @@
|
||||
|
||||
// vec
|
||||
|
||||
struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
|
||||
struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
ggml_tensor * layer_dir = tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx, cur, layer_dir);
|
||||
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
|
||||
return cur;
|
||||
}
|
||||
|
||||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
||||
bool llama_adapter_cvec::init(const llama_model & model) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
GGML_ASSERT(cvec.tensors.empty());
|
||||
GGML_ASSERT(cvec.ctxs.empty());
|
||||
GGML_ASSERT(cvec.bufs.empty());
|
||||
GGML_ASSERT(tensors.empty());
|
||||
GGML_ASSERT(ctxs.empty());
|
||||
GGML_ASSERT(bufs.empty());
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
}
|
||||
|
||||
ctx_map[buft] = ctx;
|
||||
cvec.ctxs.emplace_back(ctx);
|
||||
ctxs.emplace_back(ctx);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
};
|
||||
|
||||
// make tensors
|
||||
cvec.tensors.reserve(hparams.n_layer);
|
||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
tensors.reserve(hparams.n_layer);
|
||||
tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
|
||||
ggml_backend_buffer_type_t buft = model.select_buft(il);
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
|
||||
cvec.tensors.push_back(tensor);
|
||||
tensors.push_back(tensor);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
cvec.bufs.reserve(ctx_map.size());
|
||||
bufs.reserve(ctx_map.size());
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
cvec.bufs.emplace_back(buf);
|
||||
bufs.emplace_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t llama_control_vector_apply(
|
||||
struct llama_control_vector & cvec,
|
||||
int32_t llama_adapter_cvec::apply(
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
|
||||
|
||||
if (data == nullptr) {
|
||||
// disable the current control vector (but leave allocated for later)
|
||||
cvec.layer_start = -1;
|
||||
cvec.layer_end = -1;
|
||||
layer_start = -1;
|
||||
layer_end = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cvec.tensors.empty()) {
|
||||
if (!llama_control_vector_init(cvec, model)) {
|
||||
if (tensors.empty()) {
|
||||
if (!init(model)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
cvec.layer_start = il_start;
|
||||
cvec.layer_end = il_end;
|
||||
layer_start = il_start;
|
||||
layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
assert(cvec.tensors[il] != nullptr);
|
||||
assert(tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
if (off + n_embd <= len) {
|
||||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
||||
ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
|
||||
|
||||
// lora
|
||||
|
||||
llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
|
||||
llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
|
||||
const std::string name(w->name);
|
||||
|
||||
const auto pos = ab_map.find(name);
|
||||
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
|
||||
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
|
||||
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
||||
|
||||
ggml_context * ctx_init;
|
||||
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
};
|
||||
|
||||
// bundle lora_a and lora_b into pairs
|
||||
std::map<std::string, llama_lora_weight> ab_map;
|
||||
std::map<std::string, llama_adapter_lora_weight> ab_map;
|
||||
auto str_endswith = [](const std::string & str, const std::string & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
};
|
||||
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
if (str_endswith(name, ".lora_a")) {
|
||||
replace_all(name, ".lora_a", "");
|
||||
if (ab_map.find(name) == ab_map.end()) {
|
||||
ab_map[name] = llama_lora_weight(cur, nullptr);
|
||||
ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
|
||||
} else {
|
||||
ab_map[name].a = cur;
|
||||
}
|
||||
} else if (str_endswith(name, ".lora_b")) {
|
||||
replace_all(name, ".lora_b", "");
|
||||
if (ab_map.find(name) == ab_map.end()) {
|
||||
ab_map[name] = llama_lora_weight(nullptr, cur);
|
||||
ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
|
||||
} else {
|
||||
ab_map[name].b = cur;
|
||||
}
|
||||
} else if (str_endswith(name, "_norm.weight")) {
|
||||
// TODO: add support for norm vector
|
||||
// for now, we don't really care because most adapters still work fine without it
|
||||
continue;
|
||||
} else {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
|
||||
}
|
||||
@@ -250,25 +251,33 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
// add tensors
|
||||
for (auto & it : ab_map) {
|
||||
const std::string & name = it.first;
|
||||
llama_lora_weight & w = it.second;
|
||||
llama_adapter_lora_weight & w = it.second;
|
||||
bool is_token_embd = str_endswith(name, "token_embd.weight");
|
||||
|
||||
if (!w.a || !w.b) {
|
||||
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
|
||||
}
|
||||
|
||||
// device buft and device ctx
|
||||
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
|
||||
const auto * model_tensor = model.get_tensor(name.c_str());
|
||||
if (!model_tensor) {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
|
||||
}
|
||||
|
||||
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
|
||||
// validate tensor shape
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
if (is_token_embd) {
|
||||
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
|
||||
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
} else {
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
}
|
||||
}
|
||||
|
||||
// save tensor to adapter
|
||||
@@ -276,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
|
||||
ggml_set_name(tensor_a, w.a->name);
|
||||
ggml_set_name(tensor_b, w.b->name);
|
||||
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
|
||||
adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
|
||||
}
|
||||
|
||||
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
|
||||
struct llama_lora_adapter * adapter = new llama_lora_adapter();
|
||||
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
|
||||
struct llama_adapter_lora * adapter = new llama_adapter_lora();
|
||||
|
||||
try {
|
||||
llama_lora_adapter_init_impl(*model, path_lora, *adapter);
|
||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||
return adapter;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
64
llama/llama.cpp/src/llama-adapter.h
vendored
64
llama/llama.cpp/src/llama-adapter.h
vendored
@@ -1,66 +1,74 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// TODO: pimpl
|
||||
|
||||
//
|
||||
// llama_adapter_cvec
|
||||
//
|
||||
|
||||
// TODO: rename to llama_adapter_cvec
|
||||
struct llama_control_vector {
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
struct llama_adapter_cvec {
|
||||
struct ggml_tensor * tensor_for(int il) const;
|
||||
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
|
||||
|
||||
int32_t apply(
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
private:
|
||||
bool init(const llama_model & model);
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
struct ggml_tensor * tensor_for(int il) const;
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
};
|
||||
|
||||
int32_t llama_control_vector_apply(
|
||||
struct llama_control_vector & cvec,
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
//
|
||||
// llama_adapter_lora
|
||||
//
|
||||
|
||||
// TODO: rename to llama_adapter_lora_weight
|
||||
struct llama_lora_weight {
|
||||
struct llama_adapter_lora_weight {
|
||||
struct ggml_tensor * a = nullptr;
|
||||
struct ggml_tensor * b = nullptr;
|
||||
|
||||
llama_lora_weight() = default;
|
||||
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
|
||||
// get actual scale based on rank and alpha
|
||||
float get_scale(float alpha, float adapter_scale) const {
|
||||
const float rank = (float) b->ne[0];
|
||||
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
|
||||
return scale;
|
||||
}
|
||||
|
||||
llama_adapter_lora_weight() = default;
|
||||
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
|
||||
};
|
||||
|
||||
// TODO: rename to llama_adapter_lora
|
||||
struct llama_lora_adapter {
|
||||
struct llama_adapter_lora {
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
|
||||
std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
float alpha;
|
||||
|
||||
llama_lora_adapter() = default;
|
||||
~llama_lora_adapter() = default;
|
||||
llama_adapter_lora() = default;
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
llama_lora_weight * get_weight(struct ggml_tensor * w);
|
||||
llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
|
||||
};
|
||||
|
||||
134
llama/llama.cpp/src/llama-arch.cpp
vendored
134
llama/llama.cpp/src/llama-arch.cpp
vendored
@@ -28,6 +28,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
@@ -57,6 +58,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
|
||||
{ LLM_ARCH_GRANITE, "granite" },
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@@ -107,25 +109,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
|
||||
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
|
||||
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
||||
{ LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
|
||||
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
||||
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
|
||||
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
|
||||
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
||||
{ LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
|
||||
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
||||
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
|
||||
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
||||
@@ -179,6 +182,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||
@@ -622,6 +627,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHIMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PLAMO,
|
||||
{
|
||||
@@ -1036,6 +1062,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
@@ -1182,6 +1211,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
|
||||
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
|
||||
@@ -1199,6 +1229,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RWKV6QWEN2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
|
||||
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
|
||||
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
|
||||
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
|
||||
{ LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
|
||||
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GRANITE,
|
||||
{
|
||||
@@ -1253,6 +1309,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_SOLAR,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -1278,24 +1352,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_SOLAR,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -1399,6 +1455,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
|
||||
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
@@ -1455,10 +1512,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
||||
std::string LLM_KV::operator()(llm_kv kv) const {
|
||||
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
|
||||
return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
|
||||
: ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
|
||||
}
|
||||
|
||||
std::string LLM_TN_IMPL::str() const {
|
||||
|
||||
9
llama/llama.cpp/src/llama-arch.h
vendored
9
llama/llama.cpp/src/llama-arch.h
vendored
@@ -32,6 +32,7 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN2VL,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
@@ -61,6 +62,7 @@ enum llm_arch {
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_RWKV6,
|
||||
LLM_ARCH_RWKV6QWEN2,
|
||||
LLM_ARCH_GRANITE,
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@@ -111,6 +113,7 @@ enum llm_kv {
|
||||
LLM_KV_TIME_DECAY_EXTRA_DIM,
|
||||
LLM_KV_RESIDUAL_SCALE,
|
||||
LLM_KV_EMBEDDING_SCALE,
|
||||
LLM_KV_TOKEN_SHIFT_COUNT,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -177,6 +180,8 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
|
||||
LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
|
||||
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
||||
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
||||
LLM_KV_TOKENIZER_FIM_MID_ID,
|
||||
@@ -256,6 +261,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_TIME_MIX_LERP_V,
|
||||
LLM_TENSOR_TIME_MIX_LERP_R,
|
||||
LLM_TENSOR_TIME_MIX_LERP_G,
|
||||
LLM_TENSOR_TIME_MIX_LERP_FUSED,
|
||||
LLM_TENSOR_TIME_MIX_FIRST,
|
||||
LLM_TENSOR_TIME_MIX_DECAY,
|
||||
LLM_TENSOR_TIME_MIX_DECAY_W1,
|
||||
@@ -343,9 +349,10 @@ enum llm_tensor_layer {
|
||||
};
|
||||
|
||||
struct LLM_KV {
|
||||
LLM_KV(llm_arch arch);
|
||||
LLM_KV(llm_arch arch, const char * suffix = nullptr);
|
||||
|
||||
llm_arch arch;
|
||||
const char * suffix;
|
||||
|
||||
std::string operator()(llm_kv kv) const;
|
||||
};
|
||||
|
||||
26
llama/llama.cpp/src/llama-chat.cpp
vendored
26
llama/llama.cpp/src/llama-chat.cpp
vendored
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
|
||||
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
||||
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
||||
@@ -50,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
@@ -73,7 +75,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return tmpl.find(haystack) != std::string::npos;
|
||||
};
|
||||
if (tmpl_contains("<|im_start|>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATML;
|
||||
return tmpl_contains("<|im_sep|>")
|
||||
? LLM_CHAT_TEMPLATE_PHI_4
|
||||
: LLM_CHAT_TEMPLATE_CHATML;
|
||||
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
||||
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
||||
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
||||
@@ -112,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||
return LLM_CHAT_TEMPLATE_FALCON_3;
|
||||
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
||||
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
||||
} else if (tmpl_contains("bos_token + message['role']")) {
|
||||
@@ -149,7 +153,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
|
||||
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
|
||||
} else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
|
||||
} else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
|
||||
return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
|
||||
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
@@ -269,6 +273,14 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
|
||||
// chatml template
|
||||
for (auto message : chat) {
|
||||
ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|im_start|>assistant<|im_sep|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
|
||||
// Falcon 3
|
||||
for (auto message : chat) {
|
||||
@@ -429,6 +441,14 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
|
||||
2
llama/llama.cpp/src/llama-chat.h
vendored
2
llama/llama.cpp/src/llama-chat.h
vendored
@@ -15,6 +15,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
||||
LLM_CHAT_TEMPLATE_PHI_3,
|
||||
LLM_CHAT_TEMPLATE_PHI_4,
|
||||
LLM_CHAT_TEMPLATE_FALCON_3,
|
||||
LLM_CHAT_TEMPLATE_ZEPHYR,
|
||||
LLM_CHAT_TEMPLATE_MONARCH,
|
||||
@@ -30,6 +31,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_4,
|
||||
LLM_CHAT_TEMPLATE_GLMEDGE,
|
||||
LLM_CHAT_TEMPLATE_MINICPM,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
|
||||
5
llama/llama.cpp/src/llama-context.cpp
vendored
5
llama/llama.cpp/src/llama-context.cpp
vendored
@@ -1,5 +1,8 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@@ -513,7 +516,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
|
||||
auto * output_dev = lctx.model.dev_output.dev;
|
||||
auto * output_dev = lctx.model.dev_output();
|
||||
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
|
||||
if (output_dev_host_buft) {
|
||||
buft = output_dev_host_buft;
|
||||
|
||||
10
llama/llama.cpp/src/llama-context.h
vendored
10
llama/llama.cpp/src/llama-context.h
vendored
@@ -22,12 +22,12 @@ struct llama_context {
|
||||
|
||||
const struct llama_model & model;
|
||||
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sbatch sbatch; // TODO: revisit if needed
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_control_vector cvec;
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sbatch sbatch; // TODO: revisit if needed
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_adapter_cvec cvec;
|
||||
|
||||
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
|
||||
std::unordered_map<struct llama_adapter_lora *, float> lora;
|
||||
|
||||
std::vector<ggml_backend_ptr> backends;
|
||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||
|
||||
454
llama/llama.cpp/src/llama-grammar.cpp
vendored
454
llama/llama.cpp/src/llama-grammar.cpp
vendored
@@ -345,194 +345,194 @@ const char * llama_grammar_parser::parse_sequence(
|
||||
size_t last_sym_start = rule.size();
|
||||
const char * pos = src;
|
||||
|
||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||
|
||||
if (last_sym_start == rule.size()) {
|
||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||
}
|
||||
if (last_sym_start == rule.size()) {
|
||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||
}
|
||||
|
||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||
// the following rewrite rules:
|
||||
// S{m,n} --> S S S (m times) S'(n-m)
|
||||
// S'(x) ::= S S'(x-1) |
|
||||
// (... n-m definitions of these S' rules ...)
|
||||
// S'(1) ::= S |
|
||||
// S{m,} --> S S S (m times) S'
|
||||
// S' ::= S S' |
|
||||
// S* --> S{0,}
|
||||
// --> S' ::= S S' |
|
||||
// S+ --> S{1,}
|
||||
// --> S S'
|
||||
// S' ::= S S' |
|
||||
// S? --> S{0,1}
|
||||
// --> S'
|
||||
// S' ::= S |
|
||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||
// the following rewrite rules:
|
||||
// S{m,n} --> S S S (m times) S'(n-m)
|
||||
// S'(x) ::= S S'(x-1) |
|
||||
// (... n-m definitions of these S' rules ...)
|
||||
// S'(1) ::= S |
|
||||
// S{m,} --> S S S (m times) S'
|
||||
// S' ::= S S' |
|
||||
// S* --> S{0,}
|
||||
// --> S' ::= S S' |
|
||||
// S+ --> S{1,}
|
||||
// --> S S'
|
||||
// S' ::= S S' |
|
||||
// S? --> S{0,1}
|
||||
// --> S'
|
||||
// S' ::= S |
|
||||
|
||||
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
|
||||
if (min_times == 0) {
|
||||
rule.resize(last_sym_start);
|
||||
} else {
|
||||
// Repeat the previous elements (min_times - 1) times
|
||||
for (int i = 1; i < min_times; i++) {
|
||||
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t last_rec_rule_id = 0;
|
||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||
|
||||
llama_grammar_rule rec_rule(prev_rule);
|
||||
for (int i = 0; i < n_opt; i++) {
|
||||
rec_rule.resize(prev_rule.size());
|
||||
uint32_t rec_rule_id = generate_symbol_id( rule_name);
|
||||
if (i > 0 || max_times < 0) {
|
||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||
}
|
||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule( rec_rule_id, rec_rule);
|
||||
last_rec_rule_id = rec_rule_id;
|
||||
}
|
||||
if (n_opt > 0) {
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||
}
|
||||
};
|
||||
|
||||
while (*pos) {
|
||||
if (*pos == '"') { // literal string
|
||||
pos++;
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != '"') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '[') { // char range(s)
|
||||
pos++;
|
||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||
if (*pos == '^') {
|
||||
pos++;
|
||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||
}
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != ']') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
enum llama_gretype type = last_sym_start < rule.size()
|
||||
? LLAMA_GRETYPE_CHAR_ALT
|
||||
: start_type;
|
||||
|
||||
rule.push_back({type, char_pair.first});
|
||||
if (pos[0] == '-' && pos[1] != ']') {
|
||||
if (!pos[1]) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto endchar_pair = parse_char(pos + 1);
|
||||
pos = endchar_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
pos = parse_space(name_end, is_nested);
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||
} else if (*pos == '(') { // grouping
|
||||
// parse nested alternates into synthesized rule
|
||||
pos = parse_space(pos + 1, true);
|
||||
uint32_t sub_rule_id = generate_symbol_id(rule_name);
|
||||
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
|
||||
last_sym_start = rule.size();
|
||||
// output reference to synthesized rule
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||
if (*pos != ')') {
|
||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '.') { // any char
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '*') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, -1);
|
||||
} else if (*pos == '+') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(1, -1);
|
||||
} else if (*pos == '?') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, 1);
|
||||
} else if (*pos == '{') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (!is_digit_char(*pos)) {
|
||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
int max_times = -1;
|
||||
|
||||
if (*pos == '}') {
|
||||
max_times = min_times;
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == ',') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
if (*pos != '}') {
|
||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else {
|
||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||
}
|
||||
handle_repetitions(min_times, max_times);
|
||||
} else {
|
||||
break;
|
||||
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
|
||||
if (min_times == 0) {
|
||||
rule.resize(last_sym_start);
|
||||
} else {
|
||||
// Repeat the previous elements (min_times - 1) times
|
||||
for (int i = 1; i < min_times; i++) {
|
||||
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
|
||||
uint32_t last_rec_rule_id = 0;
|
||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||
|
||||
llama_grammar_rule rec_rule(prev_rule);
|
||||
for (int i = 0; i < n_opt; i++) {
|
||||
rec_rule.resize(prev_rule.size());
|
||||
uint32_t rec_rule_id = generate_symbol_id( rule_name);
|
||||
if (i > 0 || max_times < 0) {
|
||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||
}
|
||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule( rec_rule_id, rec_rule);
|
||||
last_rec_rule_id = rec_rule_id;
|
||||
}
|
||||
if (n_opt > 0) {
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||
}
|
||||
};
|
||||
|
||||
while (*pos) {
|
||||
if (*pos == '"') { // literal string
|
||||
pos++;
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != '"') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '[') { // char range(s)
|
||||
pos++;
|
||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||
if (*pos == '^') {
|
||||
pos++;
|
||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||
}
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != ']') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
enum llama_gretype type = last_sym_start < rule.size()
|
||||
? LLAMA_GRETYPE_CHAR_ALT
|
||||
: start_type;
|
||||
|
||||
rule.push_back({type, char_pair.first});
|
||||
if (pos[0] == '-' && pos[1] != ']') {
|
||||
if (!pos[1]) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto endchar_pair = parse_char(pos + 1);
|
||||
pos = endchar_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
pos = parse_space(name_end, is_nested);
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||
} else if (*pos == '(') { // grouping
|
||||
// parse nested alternates into synthesized rule
|
||||
pos = parse_space(pos + 1, true);
|
||||
uint32_t sub_rule_id = generate_symbol_id(rule_name);
|
||||
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
|
||||
last_sym_start = rule.size();
|
||||
// output reference to synthesized rule
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||
if (*pos != ')') {
|
||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '.') { // any char
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '*') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, -1);
|
||||
} else if (*pos == '+') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(1, -1);
|
||||
} else if (*pos == '?') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, 1);
|
||||
} else if (*pos == '{') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (!is_digit_char(*pos)) {
|
||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
int max_times = -1;
|
||||
|
||||
if (*pos == '}') {
|
||||
max_times = min_times;
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == ',') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
if (*pos != '}') {
|
||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else {
|
||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||
}
|
||||
handle_repetitions(min_times, max_times);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * llama_grammar_parser::parse_rule(const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
uint32_t rule_id = get_symbol_id(src, name_len);
|
||||
const std::string name(src, name_len);
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
uint32_t rule_id = get_symbol_id(src, name_len);
|
||||
const std::string name(src, name_len);
|
||||
|
||||
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 3, true);
|
||||
|
||||
pos = parse_alternates(pos, name, rule_id, false);
|
||||
|
||||
if (*pos == '\r') {
|
||||
pos += pos[1] == '\n' ? 2 : 1;
|
||||
} else if (*pos == '\n') {
|
||||
pos++;
|
||||
} else if (*pos) {
|
||||
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||
}
|
||||
return parse_space(pos, true);
|
||||
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 3, true);
|
||||
|
||||
pos = parse_alternates(pos, name, rule_id, false);
|
||||
|
||||
if (*pos == '\r') {
|
||||
pos += pos[1] == '\n' ? 2 : 1;
|
||||
} else if (*pos == '\n') {
|
||||
pos++;
|
||||
} else if (*pos) {
|
||||
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||
}
|
||||
return parse_space(pos, true);
|
||||
}
|
||||
|
||||
bool llama_grammar_parser::parse(const char * src) {
|
||||
try {
|
||||
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
||||
fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
|
||||
rules.clear();
|
||||
return false;
|
||||
}
|
||||
@@ -960,10 +960,28 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy =*/ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_words = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
const char ** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
llama_grammar_parser parser;
|
||||
|
||||
// if there is a grammar, parse it
|
||||
@@ -1035,10 +1053,31 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
|
||||
}
|
||||
} while (true);
|
||||
|
||||
std::vector<llama_token> vec_trigger_tokens;
|
||||
std::vector<std::string> vec_trigger_words;
|
||||
for (size_t i = 0; i < num_trigger_tokens; i++) {
|
||||
GGML_ASSERT(trigger_tokens != nullptr);
|
||||
vec_trigger_tokens.push_back(trigger_tokens[i]);
|
||||
}
|
||||
for (size_t i = 0; i < num_trigger_words; i++) {
|
||||
GGML_ASSERT(trigger_words != nullptr);
|
||||
vec_trigger_words.push_back(trigger_words[i]);
|
||||
}
|
||||
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
std::move(vec_trigger_tokens),
|
||||
std::move(vec_trigger_words),
|
||||
};
|
||||
}
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
@@ -1055,6 +1094,11 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
grammar.rules,
|
||||
grammar.stacks,
|
||||
grammar.partial_utf8,
|
||||
grammar.lazy,
|
||||
grammar.awaiting_trigger,
|
||||
grammar.trigger_buffer,
|
||||
grammar.trigger_tokens,
|
||||
grammar.trigger_words,
|
||||
};
|
||||
|
||||
// redirect elements in stacks to point to new rules
|
||||
@@ -1076,6 +1120,10 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
|
||||
GGML_ASSERT(grammar.vocab != nullptr);
|
||||
|
||||
if (grammar.awaiting_trigger) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool allow_eog = false;
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
@@ -1092,9 +1140,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const llama_token id = cur_p->data[i].id;
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
|
||||
const std::string & piece = grammar.vocab->token_to_piece(id);
|
||||
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, id)) {
|
||||
if (grammar.vocab->is_eog(id)) {
|
||||
if (!allow_eog) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
@@ -1115,7 +1163,35 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
|
||||
GGML_ASSERT(grammar.vocab != nullptr);
|
||||
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, token)) {
|
||||
const auto & piece = grammar.vocab->token_to_piece(token);
|
||||
|
||||
if (grammar.awaiting_trigger) {
|
||||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
grammar.awaiting_trigger = false;
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
||||
return;
|
||||
} else {
|
||||
// TODO: consider a smarter incremental substring search algorithm (store last position to search from).
|
||||
grammar.trigger_buffer += piece;
|
||||
for (const auto & word : grammar.trigger_words) {
|
||||
auto pos = grammar.trigger_buffer.find(word);
|
||||
if (pos != std::string::npos) {
|
||||
grammar.awaiting_trigger = false;
|
||||
auto constrained_str = grammar.trigger_buffer.substr(pos);
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, constrained_str);
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar.vocab->is_eog(token)) {
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
@@ -1124,8 +1200,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
}
|
||||
|
||||
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
@@ -1135,5 +1213,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
}
|
||||
|
||||
grammar.partial_utf8 = decoded.second;
|
||||
GGML_ASSERT(!grammar.stacks.empty());
|
||||
if (grammar.stacks.empty()) {
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
|
||||
}
|
||||
}
|
||||
|
||||
23
llama/llama.cpp/src/llama-grammar.h
vendored
23
llama/llama.cpp/src/llama-grammar.h
vendored
@@ -114,6 +114,15 @@ struct llama_grammar {
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
llama_partial_utf8 partial_utf8;
|
||||
|
||||
// lazy grammars wait for trigger words or tokens before constraining the sampling.
|
||||
// we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
|
||||
// (useful e.g. for tool_choice=required)
|
||||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||
std::vector<std::string> trigger_words;
|
||||
};
|
||||
|
||||
//
|
||||
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
const char ** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens);
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar);
|
||||
|
||||
@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
|
||||
void llama_grammar_accept_impl(
|
||||
struct llama_grammar & grammar,
|
||||
llama_token token);
|
||||
|
||||
void llama_grammar_accept_str(
|
||||
struct llama_grammar & grammar,
|
||||
const std::string & piece);
|
||||
|
||||
4
llama/llama.cpp/src/llama-hparams.cpp
vendored
4
llama/llama.cpp/src/llama-hparams.cpp
vendored
@@ -54,7 +54,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
uint32_t llama_hparams::n_embd_k_s() const {
|
||||
if (wkv_head_size != 0) {
|
||||
// for RWKV models
|
||||
return 2 * n_embd;
|
||||
return token_shift_count * n_embd;
|
||||
}
|
||||
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
@@ -82,4 +82,4 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
|
||||
|
||||
bool llama_hparams::cross_attention_layers(uint32_t il) const {
|
||||
return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
|
||||
}
|
||||
}
|
||||
6
llama/llama.cpp/src/llama-hparams.h
vendored
6
llama/llama.cpp/src/llama-hparams.h
vendored
@@ -30,7 +30,6 @@ struct llama_hparams {
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
|
||||
uint32_t n_vocab = 0;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_embd_features = 0;
|
||||
@@ -41,8 +40,8 @@ struct llama_hparams {
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_vocab_type = 0; // for BERT-style token types
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
uint32_t n_vocab = 0;
|
||||
|
||||
// for WavTokenizer
|
||||
struct llama_hparams_posnet posnet;
|
||||
@@ -79,6 +78,7 @@ struct llama_hparams {
|
||||
uint32_t time_mix_extra_dim = 0;
|
||||
uint32_t time_decay_extra_dim = 0;
|
||||
uint32_t wkv_head_size = 0;
|
||||
uint32_t token_shift_count = 2;
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
@@ -141,7 +141,7 @@ struct llama_hparams {
|
||||
// Block skip connection
|
||||
bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||
|
||||
// cross attention layers
|
||||
// cross attention layers
|
||||
bool cross_attention_layers(uint32_t il) const;
|
||||
};
|
||||
|
||||
|
||||
3
llama/llama.cpp/src/llama-impl.cpp
vendored
3
llama/llama.cpp/src/llama-impl.cpp
vendored
@@ -1,5 +1,6 @@
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
|
||||
12
llama/llama.cpp/src/llama-impl.h
vendored
12
llama/llama.cpp/src/llama-impl.h
vendored
@@ -6,13 +6,13 @@
|
||||
#include <vector>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# if defined(__MINGW32__) && !defined(__clang__)
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# else
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
# endif
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
80
llama/llama.cpp/src/llama-kv-cache.cpp
vendored
80
llama/llama.cpp/src/llama-kv-cache.cpp
vendored
@@ -72,39 +72,6 @@ bool llama_kv_cache_init(
|
||||
cache.v_l.reserve(n_layer);
|
||||
|
||||
for (int i = 0; i < n_layer; i++) {
|
||||
// for cross attention layers
|
||||
if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const llama_model::buft_list_t * buft_list;
|
||||
if (offload) {
|
||||
buft_list = model.dev_layer.at(i).buft_list;
|
||||
} else {
|
||||
buft_list = &model.cpu_buft_list;
|
||||
}
|
||||
ggml_backend_buffer_type_t buft = select_buft(*buft_list,
|
||||
[&](ggml_context * ctx) {
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
||||
return k;
|
||||
}
|
||||
ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
||||
return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
|
||||
});
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
||||
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
cache.v_l.push_back(v);
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
|
||||
@@ -112,7 +79,7 @@ bool llama_kv_cache_init(
|
||||
|
||||
ggml_backend_buffer_type_t buft;
|
||||
if (offload) {
|
||||
auto * dev = model.dev_layer.at(i).dev;
|
||||
auto * dev = model.dev_layer(i);
|
||||
buft = ggml_backend_dev_buffer_type(dev);
|
||||
} else {
|
||||
buft = ggml_backend_cpu_buffer_type();
|
||||
@@ -124,8 +91,17 @@ bool llama_kv_cache_init(
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
|
||||
ggml_tensor * k, *v;
|
||||
|
||||
// for cross attention layers
|
||||
if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
|
||||
k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
||||
v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
||||
} else {
|
||||
k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
|
||||
}
|
||||
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
@@ -152,10 +128,10 @@ bool llama_kv_cache_init(
|
||||
|
||||
struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
struct llama_kv_cache & cache,
|
||||
const struct llama_ubatch & batch) {
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
const uint32_t n_seqs = batch.n_seqs;
|
||||
const uint32_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const struct llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
@@ -163,16 +139,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// A slot should be always be contiguous.
|
||||
|
||||
// can only process batches with an equal number of new tokens in each sequence
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
|
||||
int32_t min = cache.size - 1;
|
||||
int32_t max = 0;
|
||||
|
||||
// everything should fit if all seq_ids are smaller than the max
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const uint32_t n_seq_id = batch.n_seq_id[s];
|
||||
const uint32_t n_seq_id = ubatch.n_seq_id[s];
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
|
||||
// too big seq_id
|
||||
@@ -231,7 +207,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
|
||||
// find usable cell range
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][0];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
llama_kv_cell & seq_meta = cache.cells[seq_id];
|
||||
bool has_cell = false;
|
||||
if (seq_meta.tail >= 0) {
|
||||
@@ -270,7 +246,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
|
||||
int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
llama_kv_cell & dst_cell = cache.cells[dst_id];
|
||||
llama_kv_cell & src_cell = cache.cells[src_id];
|
||||
@@ -291,7 +267,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
llama_kv_cell & cell = cache.cells[cell_id];
|
||||
|
||||
@@ -299,12 +275,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// What should happen when the pos backtracks or skips a value?
|
||||
// Clearing the state mid-batch would require special-casing which isn't done.
|
||||
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
|
||||
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
|
||||
__func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
|
||||
}
|
||||
cell.pos = last_pos;
|
||||
cell.seq_id.clear();
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
cell.seq_id.insert(seq_id);
|
||||
cache.cells[seq_id].tail = cell_id;
|
||||
}
|
||||
@@ -358,10 +334,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
for (uint32_t s = 0; s < n_seqs; s++) {
|
||||
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
|
||||
uint32_t k = s*n_seq_tokens + i;
|
||||
cache.cells[cache.head + k].pos = batch.pos[k];
|
||||
cache.cells[cache.head + k].pos = ubatch.pos[k];
|
||||
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
2
llama/llama.cpp/src/llama-kv-cache.h
vendored
2
llama/llama.cpp/src/llama-kv-cache.h
vendored
@@ -37,7 +37,7 @@ struct llama_kv_cache {
|
||||
bool can_shift = false;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_internal also uses it, so it
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
|
||||
13
llama/llama.cpp/src/llama-mmap.cpp
vendored
13
llama/llama.cpp/src/llama-mmap.cpp
vendored
@@ -7,6 +7,7 @@
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
#include <stdexcept>
|
||||
#include <cerrno>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
@@ -35,7 +36,7 @@
|
||||
|
||||
// TODO: consider moving to llama-impl.h if needed in more places
|
||||
#if defined(_WIN32)
|
||||
std::string llama_format_win_err(DWORD err) {
|
||||
static std::string llama_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
@@ -241,12 +242,16 @@ llama_file::~llama_file() = default;
|
||||
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
size_t llama_file::size() const { return pimpl->size; }
|
||||
|
||||
int llama_file::fileno() const {
|
||||
int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
return _fileno(pimpl->fp);
|
||||
#else
|
||||
#if defined(fileno)
|
||||
return fileno(pimpl->fp);
|
||||
#else
|
||||
return ::fileno(pimpl->fp);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||
@@ -265,7 +270,7 @@ struct llama_mmap::impl {
|
||||
|
||||
impl(struct llama_file * file, size_t prefetch, bool numa) {
|
||||
size = file->size();
|
||||
int fd = file->fileno();
|
||||
int fd = file->file_id();
|
||||
int flags = MAP_SHARED;
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
@@ -357,7 +362,7 @@ struct llama_mmap::impl {
|
||||
|
||||
size = file->size();
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
|
||||
|
||||
3
llama/llama.cpp/src/llama-mmap.h
vendored
3
llama/llama.cpp/src/llama-mmap.h
vendored
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
@@ -18,7 +19,7 @@ struct llama_file {
|
||||
size_t tell() const;
|
||||
size_t size() const;
|
||||
|
||||
int fileno() const;
|
||||
int file_id() const; // fileno overload
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
|
||||
146
llama/llama.cpp/src/llama-model-loader.cpp
vendored
146
llama/llama.cpp/src/llama-model-loader.cpp
vendored
@@ -7,6 +7,10 @@
|
||||
#include <cstring>
|
||||
#include <future>
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
static const size_t GiB = 1024*MiB;
|
||||
|
||||
const char * llama_file_version_name(llama_fver version) {
|
||||
switch (version) {
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||
@@ -17,8 +21,78 @@ const char * llama_file_version_name(llama_fver version) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
}
|
||||
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
}
|
||||
|
||||
// return a list of splits for a given path
|
||||
// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
|
||||
static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
|
||||
std::vector<std::string> paths;
|
||||
std::string split_prefix;
|
||||
std::vector<char> buf(llama_path_max(), 0);
|
||||
|
||||
{
|
||||
int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
|
||||
if (!ret) {
|
||||
throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
|
||||
}
|
||||
split_prefix = std::string(buf.data(), ret);
|
||||
}
|
||||
|
||||
if (split_prefix.empty()) {
|
||||
throw std::runtime_error(format("invalid split file: %s", path.c_str()));
|
||||
}
|
||||
|
||||
for (int idx = 0; idx < n_split; ++idx) {
|
||||
int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
|
||||
paths.push_back(std::string(buf.data(), ret));
|
||||
}
|
||||
|
||||
return paths;
|
||||
}
|
||||
|
||||
namespace GGUFMeta {
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
|
||||
struct GKV_Base_Type {
|
||||
static constexpr gguf_type gt = gt_;
|
||||
|
||||
@@ -60,10 +134,11 @@ namespace GGUFMeta {
|
||||
public:
|
||||
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
|
||||
static ArrayInfo getter(const gguf_context *ctx, const int k) {
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
|
||||
return ArrayInfo {
|
||||
gguf_get_arr_type(ctx, k),
|
||||
arr_type,
|
||||
size_t(gguf_get_arr_n(ctx, k)),
|
||||
gguf_get_arr_data(ctx, k),
|
||||
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -368,7 +443,12 @@ namespace GGUFMeta {
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
|
||||
|
||||
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
const struct llama_model_kv_override * param_overrides_p) {
|
||||
int trace = 0;
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
trace = atoi(getenv("LLAMA_TRACE"));
|
||||
@@ -380,6 +460,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
}
|
||||
}
|
||||
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
@@ -415,35 +496,54 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
|
||||
// Load additional GGML contexts
|
||||
if (n_split > 1) {
|
||||
// make sure the main file is loaded first
|
||||
uint16_t idx = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
|
||||
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
||||
get_key(kv_split_no, idx);
|
||||
if (idx != 0) {
|
||||
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
|
||||
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
||||
}
|
||||
|
||||
std::vector<char> split_prefix(llama_path_max(), 0);
|
||||
if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) {
|
||||
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
|
||||
// generate list of splits if needed
|
||||
if (splits.empty()) {
|
||||
splits = llama_get_list_splits(fname, idx, n_split);
|
||||
}
|
||||
|
||||
// in case user give a custom list of splits, check if it matches the expected number
|
||||
if (n_split != (uint16_t)splits.size()) {
|
||||
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
||||
}
|
||||
|
||||
if (trace > 0) {
|
||||
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
||||
}
|
||||
|
||||
std::vector<char> split_path(llama_path_max(), 0);
|
||||
// load other splits
|
||||
for (idx = 1; idx < n_split; idx++) {
|
||||
llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split);
|
||||
const char * fname_split = splits[idx].c_str();
|
||||
|
||||
struct gguf_init_params split_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) };
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data()));
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(split_path.data(), "rb"));
|
||||
// check idx
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
||||
if (kid < 0) {
|
||||
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
||||
}
|
||||
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
||||
if (idx_gguf != idx) {
|
||||
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb"));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
@@ -556,7 +656,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(meta.get(), i);
|
||||
@@ -722,7 +822,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
|
||||
for (const auto & file : files) {
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
|
||||
std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
|
||||
mmaps_used.emplace_back(mapping->size(), 0);
|
||||
if (mlock_mmaps) {
|
||||
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
||||
@@ -1011,3 +1111,17 @@ bool llama_model_loader::load_all_data(
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string llama_model_loader::ftype_name() const {
|
||||
return llama_model_ftype_name(ftype);
|
||||
}
|
||||
|
||||
void llama_model_loader::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
|
||||
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
|
||||
if (n_bytes < GiB) {
|
||||
LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
|
||||
}
|
||||
}
|
||||
|
||||
11
llama/llama.cpp/src/llama-model-loader.h
vendored
11
llama/llama.cpp/src/llama-model-loader.h
vendored
@@ -90,7 +90,12 @@ struct llama_model_loader {
|
||||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
|
||||
llama_model_loader(
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
const struct llama_model_kv_override * param_overrides_p);
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
||||
@@ -155,4 +160,8 @@ struct llama_model_loader {
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data);
|
||||
|
||||
std::string ftype_name() const;
|
||||
|
||||
void print_info() const;
|
||||
};
|
||||
|
||||
4287
llama/llama.cpp/src/llama-model.cpp
vendored
4287
llama/llama.cpp/src/llama-model.cpp
vendored
File diff suppressed because it is too large
Load Diff
298
llama/llama.cpp/src/llama-model.h
vendored
298
llama/llama.cpp/src/llama-model.h
vendored
@@ -4,81 +4,83 @@
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
struct llama_model_loader;
|
||||
|
||||
// available models
|
||||
// TODO: this enum does not follow the enum naming convention
|
||||
enum llm_type {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_14M,
|
||||
MODEL_17M,
|
||||
MODEL_22M,
|
||||
MODEL_33M,
|
||||
MODEL_60M,
|
||||
MODEL_70M,
|
||||
MODEL_80M,
|
||||
MODEL_109M,
|
||||
MODEL_137M,
|
||||
MODEL_160M,
|
||||
MODEL_220M,
|
||||
MODEL_250M,
|
||||
MODEL_270M,
|
||||
MODEL_335M,
|
||||
MODEL_410M,
|
||||
MODEL_450M,
|
||||
MODEL_770M,
|
||||
MODEL_780M,
|
||||
MODEL_0_5B,
|
||||
MODEL_1B,
|
||||
MODEL_1_3B,
|
||||
MODEL_1_4B,
|
||||
MODEL_1_5B,
|
||||
MODEL_1_6B,
|
||||
MODEL_2B,
|
||||
MODEL_2_8B,
|
||||
MODEL_3B,
|
||||
MODEL_4B,
|
||||
MODEL_6B,
|
||||
MODEL_6_9B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_9B,
|
||||
MODEL_11B,
|
||||
MODEL_12B,
|
||||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_16B,
|
||||
MODEL_20B,
|
||||
MODEL_22B,
|
||||
MODEL_30B,
|
||||
MODEL_32B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_90B,
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_671B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
MODEL_A1_7B,
|
||||
MODEL_A2_7B,
|
||||
MODEL_8x7B,
|
||||
MODEL_8x22B,
|
||||
MODEL_16x12B,
|
||||
MODEL_10B_128x3_66B,
|
||||
MODEL_57B_A14B,
|
||||
MODEL_27B,
|
||||
LLM_TYPE_UNKNOWN,
|
||||
LLM_TYPE_14M,
|
||||
LLM_TYPE_17M,
|
||||
LLM_TYPE_22M,
|
||||
LLM_TYPE_33M,
|
||||
LLM_TYPE_60M,
|
||||
LLM_TYPE_70M,
|
||||
LLM_TYPE_80M,
|
||||
LLM_TYPE_109M,
|
||||
LLM_TYPE_137M,
|
||||
LLM_TYPE_160M,
|
||||
LLM_TYPE_220M,
|
||||
LLM_TYPE_250M,
|
||||
LLM_TYPE_270M,
|
||||
LLM_TYPE_335M,
|
||||
LLM_TYPE_410M,
|
||||
LLM_TYPE_450M,
|
||||
LLM_TYPE_770M,
|
||||
LLM_TYPE_780M,
|
||||
LLM_TYPE_0_5B,
|
||||
LLM_TYPE_1B,
|
||||
LLM_TYPE_1_3B,
|
||||
LLM_TYPE_1_4B,
|
||||
LLM_TYPE_1_5B,
|
||||
LLM_TYPE_1_6B,
|
||||
LLM_TYPE_2B,
|
||||
LLM_TYPE_2_8B,
|
||||
LLM_TYPE_3B,
|
||||
LLM_TYPE_4B,
|
||||
LLM_TYPE_6B,
|
||||
LLM_TYPE_6_9B,
|
||||
LLM_TYPE_7B,
|
||||
LLM_TYPE_8B,
|
||||
LLM_TYPE_9B,
|
||||
LLM_TYPE_11B,
|
||||
LLM_TYPE_12B,
|
||||
LLM_TYPE_13B,
|
||||
LLM_TYPE_14B,
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
LLM_TYPE_22B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
LLM_TYPE_34B,
|
||||
LLM_TYPE_35B,
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
LLM_TYPE_90B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_671B,
|
||||
LLM_TYPE_SMALL,
|
||||
LLM_TYPE_MEDIUM,
|
||||
LLM_TYPE_LARGE,
|
||||
LLM_TYPE_XL,
|
||||
LLM_TYPE_A1_7B,
|
||||
LLM_TYPE_A2_7B,
|
||||
LLM_TYPE_8x7B,
|
||||
LLM_TYPE_8x22B,
|
||||
LLM_TYPE_16x12B,
|
||||
LLM_TYPE_16x3_8B,
|
||||
LLM_TYPE_10B_128x3_66B,
|
||||
LLM_TYPE_57B_A14B,
|
||||
LLM_TYPE_27B,
|
||||
};
|
||||
|
||||
struct llama_layer_posnet {
|
||||
@@ -243,15 +245,19 @@ struct llama_layer {
|
||||
struct ggml_tensor * time_mix_lerp_v = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_r = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_g = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_fused = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_first = nullptr;
|
||||
struct ggml_tensor * time_mix_decay = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_key = nullptr;
|
||||
struct ggml_tensor * time_mix_value = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance = nullptr;
|
||||
struct ggml_tensor * time_mix_gate = nullptr;
|
||||
struct ggml_tensor * time_mix_first = nullptr;
|
||||
struct ggml_tensor * time_mix_decay = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_key = nullptr;
|
||||
struct ggml_tensor * time_mix_key_b = nullptr;
|
||||
struct ggml_tensor * time_mix_value = nullptr;
|
||||
struct ggml_tensor * time_mix_value_b = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance_b = nullptr;
|
||||
struct ggml_tensor * time_mix_gate = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_ln = nullptr;
|
||||
struct ggml_tensor * time_mix_ln_b = nullptr;
|
||||
@@ -280,7 +286,7 @@ struct llama_layer {
|
||||
|
||||
struct ggml_tensor * bskcn_tv = nullptr;
|
||||
|
||||
// cross attention
|
||||
// cross attention
|
||||
struct ggml_tensor * cross_attn_k_norm = nullptr;
|
||||
struct ggml_tensor * cross_attn_k_proj = nullptr;
|
||||
struct ggml_tensor * cross_attn_o_proj = nullptr;
|
||||
@@ -296,11 +302,9 @@ struct llama_layer {
|
||||
};
|
||||
|
||||
struct llama_model {
|
||||
llm_type type = MODEL_UNKNOWN;
|
||||
llm_type type = LLM_TYPE_UNKNOWN;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
|
||||
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
std::string name = "n/a";
|
||||
|
||||
llama_hparams hparams = {};
|
||||
@@ -329,117 +333,53 @@ struct llama_model {
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
llama_model_params params;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
int n_gpu_layers;
|
||||
|
||||
std::vector<std::string> rpc_servers;
|
||||
|
||||
// list of devices used in this model
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
buft_list_t cpu_buft_list;
|
||||
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
|
||||
|
||||
struct layer_dev {
|
||||
ggml_backend_dev_t dev;
|
||||
buft_list_t * buft_list;
|
||||
};
|
||||
|
||||
layer_dev dev_input = {};
|
||||
layer_dev dev_output = {};
|
||||
std::vector<layer_dev> dev_layer;
|
||||
|
||||
// contexts where the model tensors metadata is stored
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
// the model memory buffers for the tensor data
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// model memory mapped files
|
||||
llama_mmaps mappings;
|
||||
|
||||
// objects representing data potentially being locked in memory
|
||||
llama_mlocks mlock_bufs;
|
||||
llama_mlocks mlock_mmaps;
|
||||
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements = 0;
|
||||
explicit llama_model(const struct llama_model_params & params);
|
||||
~llama_model();
|
||||
|
||||
// total size of all the tensors in the model in bytes
|
||||
size_t n_bytes = 0;
|
||||
void load_stats (llama_model_loader & ml);
|
||||
void load_arch (llama_model_loader & ml);
|
||||
void load_hparams(llama_model_loader & ml);
|
||||
void load_vocab (llama_model_loader & ml);
|
||||
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
|
||||
|
||||
std::string arch_name() const;
|
||||
std::string type_name() const;
|
||||
|
||||
std::string desc() const;
|
||||
|
||||
size_t size() const;
|
||||
size_t max_nodes() const;
|
||||
size_t n_devices() const;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements() const;
|
||||
|
||||
void print_info() const;
|
||||
|
||||
ggml_backend_dev_t dev_layer(int il) const;
|
||||
ggml_backend_dev_t dev_output() const;
|
||||
|
||||
ggml_backend_buffer_type_t select_buft(int il) const;
|
||||
|
||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
const char * llm_type_name(llm_type type);
|
||||
|
||||
std::string llama_model_arch_name (const llama_model & model);
|
||||
std::string llama_model_type_name (const llama_model & model);
|
||||
std::string llama_model_ftype_name(const llama_model & model);
|
||||
|
||||
template<typename F>
|
||||
bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ggml_context_ptr ctx { ggml_init(params) };
|
||||
if (!ctx) {
|
||||
throw std::runtime_error("failed to create ggml context");
|
||||
}
|
||||
|
||||
ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
|
||||
ggml_tensor * op_tensor = fn(ctx.get());
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op_tensor->src[i] != nullptr) {
|
||||
op_tensor->src[i]->buffer = buf.get();
|
||||
}
|
||||
}
|
||||
|
||||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||||
|
||||
return op_supported;
|
||||
}
|
||||
|
||||
template<typename F>
|
||||
ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
|
||||
for (const auto & cur : buft_list) {
|
||||
ggml_backend_dev_t cur_dev = cur.first;
|
||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||
if (buft_supported(cur_buft, cur_dev, fn)) {
|
||||
return cur_buft;
|
||||
}
|
||||
}
|
||||
|
||||
throw std::runtime_error("no suitable buffer type found");
|
||||
}
|
||||
|
||||
// used by llama_adapter_cvec
|
||||
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
|
||||
|
||||
// used by llama_adapter_lora
|
||||
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
|
||||
|
||||
size_t llama_model_max_nodes(const llama_model & model);
|
||||
|
||||
struct llama_model_loader;
|
||||
|
||||
// TODO: become llama_model methods
|
||||
void llm_load_stats (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_arch (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_hparams (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_vocab (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
|
||||
|
||||
71
llama/llama.cpp/src/llama-quant.cpp
vendored
71
llama/llama.cpp/src/llama-quant.cpp
vendored
@@ -7,14 +7,12 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
// TODO: replace with ggml API call
|
||||
#define QK_K 256
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
@@ -22,7 +20,7 @@ static void zeros(std::ofstream & file, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
struct quantize_state_internal {
|
||||
struct quantize_state_impl {
|
||||
const llama_model & model;
|
||||
const llama_model_quantize_params * params;
|
||||
|
||||
@@ -43,13 +41,13 @@ struct quantize_state_internal {
|
||||
// used to figure out if a model shares tok_embd with the output weight
|
||||
bool has_output = false;
|
||||
|
||||
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
||||
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
|
||||
: model(model)
|
||||
, params(params)
|
||||
{}
|
||||
};
|
||||
|
||||
static void llama_tensor_dequantize_internal(
|
||||
static void llama_tensor_dequantize_impl(
|
||||
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
||||
const size_t nelements, const int nthread
|
||||
) {
|
||||
@@ -121,7 +119,7 @@ static void llama_tensor_dequantize_internal(
|
||||
workers.clear();
|
||||
}
|
||||
|
||||
static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->output_tensor_type;
|
||||
} else {
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
if (qs.model.type == MODEL_70B) {
|
||||
if (qs.model.type == LLM_TYPE_70B) {
|
||||
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||
//}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
||||
new_type == GGML_TYPE_IQ1_M) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
|
||||
{
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t ny = tensor->ne[1];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (nx % qk_k != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
|
||||
convert_incompatible_tensor = true;
|
||||
} else {
|
||||
++qs.n_k_quantized;
|
||||
}
|
||||
}
|
||||
|
||||
if (convert_incompatible_tensor) {
|
||||
switch (new_type) {
|
||||
case GGML_TYPE_TQ1_0:
|
||||
@@ -410,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
return new_type;
|
||||
}
|
||||
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
if (nthread < 2) {
|
||||
// single-thread
|
||||
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
||||
@@ -464,7 +463,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
||||
return new_size;
|
||||
}
|
||||
|
||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
|
||||
@@ -526,18 +525,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model;
|
||||
llm_load_arch (ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
llm_load_stats (ml, model);
|
||||
llama_model model(llama_model_default_params());
|
||||
|
||||
struct quantize_state_internal qs(model, params);
|
||||
model.load_arch (ml);
|
||||
model.load_hparams(ml);
|
||||
model.load_stats (ml);
|
||||
|
||||
struct quantize_state_impl qs(model, params);
|
||||
|
||||
if (params->only_copy) {
|
||||
ftype = model.ftype;
|
||||
ftype = ml.ftype;
|
||||
}
|
||||
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
||||
if (params->imatrix) {
|
||||
@@ -621,7 +623,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks
|
||||
// sanity checks for models that have attention layers
|
||||
if (qs.n_attention_wv != 0)
|
||||
{
|
||||
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
|
||||
// attention layers have a non-zero number of kv heads
|
||||
@@ -761,6 +764,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
||||
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||
@@ -839,7 +843,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
@@ -868,7 +872,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
@@ -877,7 +881,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
@@ -921,7 +926,7 @@ uint32_t llama_model_quantize(
|
||||
const char * fname_out,
|
||||
const llama_model_quantize_params * params) {
|
||||
try {
|
||||
llama_model_quantize_internal(fname_inp, fname_out, params);
|
||||
llama_model_quantize_impl(fname_inp, fname_out, params);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
|
||||
return 1;
|
||||
|
||||
299
llama/llama.cpp/src/llama-sampling.cpp
vendored
299
llama/llama.cpp/src/llama-sampling.cpp
vendored
@@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
const float val = cur_p->data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||
ib = std::max(0, std::min(nbuckets - 1, ib));
|
||||
bucket_idx[i] = ib;
|
||||
++histo[ib];
|
||||
}
|
||||
@@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
|
||||
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = tmp_tokens.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets-1; j > ib; --j) {
|
||||
for (int j = nbuckets - 1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
|
||||
|
||||
// llama_sampler API
|
||||
|
||||
struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
|
||||
return new llama_sampler {
|
||||
/* .iface = */ iface,
|
||||
/* .ctx = */ ctx,
|
||||
};
|
||||
}
|
||||
|
||||
const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
||||
if (!smpl->iface) {
|
||||
return "(null)";
|
||||
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
|
||||
}
|
||||
|
||||
if (smpl->ctx == nullptr) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ smpl->iface,
|
||||
/* .ctx = */ nullptr,
|
||||
};
|
||||
/* .ctx = */ nullptr
|
||||
);
|
||||
}
|
||||
|
||||
GGML_ABORT("the sampler does not support cloning");
|
||||
@@ -371,7 +378,10 @@ void llama_sampler_free(struct llama_sampler * smpl) {
|
||||
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// TODO: do not allocate each time
|
||||
std::vector<llama_token_data> cur;
|
||||
@@ -469,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_chain_i,
|
||||
/* .ctx = */ new llama_sampler_chain {
|
||||
/* .params = */ params,
|
||||
/* .samplers = */ {},
|
||||
/* .t_sample_us = */ 0,
|
||||
/* .n_sample = */ 0,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
|
||||
@@ -543,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_greedy() {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_greedy_i,
|
||||
/* .ctx = */ nullptr,
|
||||
};
|
||||
/* .ctx = */ nullptr
|
||||
);
|
||||
}
|
||||
|
||||
// dist
|
||||
@@ -605,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_dist_i,
|
||||
/* .ctx = */ new llama_sampler_dist {
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// softmax
|
||||
@@ -635,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_softmax() {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_softmax_i,
|
||||
/* .ctx = */ nullptr,
|
||||
};
|
||||
/* .ctx = */ nullptr
|
||||
);
|
||||
}
|
||||
|
||||
// top-k
|
||||
@@ -675,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_top_k_i,
|
||||
/* .ctx = */ new llama_sampler_top_k {
|
||||
/* .k = */ k,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// top-p
|
||||
@@ -741,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_top_p_i,
|
||||
/* .ctx = */ new llama_sampler_top_p {
|
||||
/* .p = */ p,
|
||||
/* .min_keep = */ min_keep,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// min-p
|
||||
@@ -837,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_min_p_i,
|
||||
/* .ctx = */ new llama_sampler_min_p {
|
||||
/* .p = */ p,
|
||||
/* .min_keep = */ min_keep,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// typical
|
||||
@@ -936,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_typical_i,
|
||||
/* .ctx = */ new llama_sampler_typical {
|
||||
/* .p = */ p,
|
||||
/* .min_keep = */ min_keep,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// temp
|
||||
@@ -980,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_temp(float temp) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_temp_i,
|
||||
/* .ctx = */ new llama_sampler_temp {
|
||||
/*.temp = */ temp,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// temp-ext
|
||||
@@ -1090,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_temp_ext_i,
|
||||
/* .ctx = */ new llama_sampler_temp_ext {
|
||||
/* .temp = */ temp,
|
||||
/* .delta = */ delta,
|
||||
/* .exponent = */ exponent,
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// xtc
|
||||
@@ -1182,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
|
||||
|
||||
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_xtc_i,
|
||||
/* .ctx = */ new llama_sampler_xtc {
|
||||
/* .probability = */ p,
|
||||
@@ -1191,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// mirostat
|
||||
@@ -1289,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
|
||||
|
||||
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_mirostat_i,
|
||||
/* .ctx = */ new llama_sampler_mirostat {
|
||||
/* .n_vocab = */ n_vocab,
|
||||
@@ -1300,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
|
||||
/* .m = */ m,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// mirostat v2
|
||||
@@ -1388,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
|
||||
|
||||
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_mirostat_v2_i,
|
||||
/* .ctx = */ new llama_sampler_mirostat_v2 {
|
||||
/* .seed = */ seed,
|
||||
@@ -1397,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
|
||||
/* .eta = */ eta,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// grammar
|
||||
@@ -1430,13 +1440,30 @@ static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token
|
||||
}
|
||||
}
|
||||
|
||||
// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
|
||||
static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
const char ** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens);
|
||||
|
||||
static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_grammar *) smpl->ctx;
|
||||
if (!ctx->grammar) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
|
||||
std::vector<const char *> trigger_words;
|
||||
for (auto & word : ctx->grammar->trigger_words) {
|
||||
trigger_words.push_back(word.c_str());
|
||||
}
|
||||
auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
|
||||
ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
llama_grammar_free_impl(ctx->grammar);
|
||||
ctx->grammar = grammar_new;
|
||||
@@ -1445,7 +1472,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
|
||||
|
||||
auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
|
||||
auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
|
||||
|
||||
// copy the state
|
||||
{
|
||||
@@ -1481,29 +1508,55 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
|
||||
/* .free = */ llama_sampler_grammar_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
|
||||
static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
const char ** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
auto * ctx = new llama_sampler_grammar;
|
||||
|
||||
if (grammar_str != nullptr && grammar_str[0] != '\0') {
|
||||
*ctx = {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
/* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root),
|
||||
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
|
||||
};
|
||||
} else {
|
||||
*ctx = {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ {},
|
||||
/* .grammar_root = */ {},
|
||||
/* .grammar = */ nullptr,
|
||||
};
|
||||
}
|
||||
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_grammar_i,
|
||||
/* .ctx = */ ctx,
|
||||
};
|
||||
/* .ctx = */ ctx
|
||||
);
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root) {
|
||||
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_lazy(
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
const char ** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
|
||||
}
|
||||
|
||||
// penalties
|
||||
@@ -1632,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
|
||||
float penalty_present) {
|
||||
penalty_last_n = std::max(penalty_last_n, 0);
|
||||
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_penalties_i,
|
||||
/* .ctx = */ new llama_sampler_penalties {
|
||||
/* .penalty_last_n = */ penalty_last_n,
|
||||
@@ -1641,8 +1694,75 @@ struct llama_sampler * llama_sampler_init_penalties(
|
||||
/* .penalty_present = */ penalty_present,
|
||||
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
|
||||
/* .token_count = */ {},
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// top-n-sigma
|
||||
|
||||
struct llama_sampler_top_n_sigma {
|
||||
const float n;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
|
||||
return "top-n-sigma";
|
||||
}
|
||||
|
||||
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
|
||||
// find max logit and calculate mean
|
||||
float max = cur_p->data[0].logit;
|
||||
float logits_sum = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].logit > max) {
|
||||
max = cur_p->data[i].logit;
|
||||
}
|
||||
logits_sum += cur_p->data[i].logit;
|
||||
}
|
||||
float mean = logits_sum/cur_p->size;
|
||||
|
||||
// calculate standard deviation
|
||||
float acc = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||
}
|
||||
float std = sqrt(acc/cur_p->size);
|
||||
|
||||
//apply mask
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].logit < max - (ctx->n * std)) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
return llama_sampler_init_top_n_sigma(ctx->n);
|
||||
}
|
||||
|
||||
static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
|
||||
delete (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
}
|
||||
|
||||
static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
|
||||
/* .name = */ llama_sampler_top_n_sigma_name,
|
||||
/* .accept = */ nullptr,
|
||||
/* .apply = */ llama_sampler_top_n_sigma_apply,
|
||||
/* .reset = */ nullptr,
|
||||
/* .clone = */ llama_sampler_top_n_sigma_clone,
|
||||
/* .free = */ llama_sampler_top_n_sigma_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_top_n_sigma_i,
|
||||
/* .ctx = */ new llama_sampler_top_n_sigma {
|
||||
/* .n = */ n,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// DRY
|
||||
@@ -1663,8 +1783,8 @@ struct llama_sampler_dry {
|
||||
|
||||
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
|
||||
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
|
||||
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
|
||||
std::string word = llama_detokenize(vocab, {token_id}, true);
|
||||
for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
|
||||
std::string word = vocab.detokenize({token_id}, true);
|
||||
if (word.find(str) != std::string::npos) {
|
||||
token_sequences.emplace(token_id, std::vector<llama_token>());
|
||||
} else {
|
||||
@@ -1681,7 +1801,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
|
||||
}
|
||||
}
|
||||
if (match) {
|
||||
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
|
||||
std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
|
||||
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
|
||||
tokenization.resize(max_tail_len);
|
||||
}
|
||||
@@ -1832,7 +1952,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
|
||||
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
|
||||
if (n > 0) {
|
||||
lt = k;
|
||||
rt = k+n-1;
|
||||
rt = k + n - 1;
|
||||
}
|
||||
} else {
|
||||
// If k is inside the current Z-box, consider two cases.
|
||||
@@ -1937,7 +2057,7 @@ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler
|
||||
llama_vocab dummy_vocab;
|
||||
|
||||
// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
|
||||
auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
|
||||
auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
|
||||
|
||||
// Copy the state, including the processed breakers
|
||||
{
|
||||
@@ -1964,7 +2084,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
|
||||
/* .free = */ llama_sampler_dry_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
|
||||
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
|
||||
const int MAX_CHAR_LEN = 40;
|
||||
@@ -1991,11 +2111,11 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
|
||||
sequence_break.resize(MAX_CHAR_LEN);
|
||||
}
|
||||
|
||||
get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
|
||||
get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
|
||||
}
|
||||
}
|
||||
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_dry_i,
|
||||
/* .ctx = */ new llama_sampler_dry {
|
||||
/* .total_context_size = */ context_size,
|
||||
@@ -2007,14 +2127,14 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
|
||||
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
|
||||
/* .dry_max_token_repeat = */ {},
|
||||
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// wrapper for test-sampling.cpp
|
||||
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
|
||||
llama_vocab dummy_vocab;
|
||||
auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
|
||||
auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
|
||||
auto * ctx = (llama_sampler_dry *) result->ctx;
|
||||
|
||||
// Process the token-based sequence breakers
|
||||
@@ -2109,14 +2229,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
int32_t n_vocab,
|
||||
int32_t n_logit_bias,
|
||||
const llama_logit_bias * logit_bias) {
|
||||
return new llama_sampler {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_logit_bias_i,
|
||||
/* .ctx = */ new llama_sampler_logit_bias {
|
||||
/* .n_vocab = */ n_vocab,
|
||||
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
|
||||
/* .to_search = */ {},
|
||||
},
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// infill
|
||||
@@ -2153,7 +2273,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
float p_eog_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
|
||||
p_eog_sum += cur_p->data[i].p;
|
||||
} else {
|
||||
p_txt_sum += cur_p->data[i].p;
|
||||
@@ -2175,7 +2295,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
float p_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
|
||||
p_sum += cur_p->data[i].p;
|
||||
|
||||
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||
@@ -2203,17 +2323,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
continue;
|
||||
}
|
||||
|
||||
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
if (len0 < 0) {
|
||||
ctx->buf0.resize(len0);
|
||||
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
assert(len0 > 0);
|
||||
}
|
||||
|
||||
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
if (len1 < 0) {
|
||||
ctx->buf1.resize(len1);
|
||||
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
assert(len1 > 0);
|
||||
}
|
||||
|
||||
@@ -2248,7 +2368,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
@@ -2269,7 +2389,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
|
||||
if (n_non_eog == 0) {
|
||||
cur_p->size = 1;
|
||||
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
|
||||
cur_p->data[0].id = ctx->vocab->token_eot();
|
||||
cur_p->data[0].logit = 1.0f;
|
||||
|
||||
return;
|
||||
@@ -2291,7 +2411,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
@@ -2314,7 +2434,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
|
||||
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
|
||||
return llama_sampler_init_infill_impl(*ctx->vocab);
|
||||
return llama_sampler_init_infill(ctx->vocab);
|
||||
}
|
||||
|
||||
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
|
||||
@@ -2330,16 +2450,15 @@ static struct llama_sampler_i llama_sampler_infill_i = {
|
||||
/* .free = */ llama_sampler_infill_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab) {
|
||||
return new llama_sampler {
|
||||
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_infill_i,
|
||||
/* .ctx = */ new llama_sampler_infill {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .buf0 = */ std::vector<char>(512),
|
||||
/* .buf1 = */ std::vector<char>(512),
|
||||
},
|
||||
};
|
||||
/* .vocab = */ vocab,
|
||||
/* .buf0 = */ std::vector<char>(512),
|
||||
/* .buf1 = */ std::vector<char>(512),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// utils
|
||||
|
||||
22
llama/llama.cpp/src/llama-sampling.h
vendored
22
llama/llama.cpp/src/llama-sampling.h
vendored
@@ -2,7 +2,9 @@
|
||||
|
||||
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
|
||||
|
||||
#include "llama-grammar.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_grammar;
|
||||
@@ -21,24 +23,6 @@ struct llama_sampler_chain {
|
||||
mutable int32_t n_sample;
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
int32_t context_size,
|
||||
float dry_multiplier,
|
||||
float dry_base,
|
||||
int32_t dry_allowed_length,
|
||||
int32_t dry_penalty_last_n,
|
||||
const char ** seq_breakers,
|
||||
size_t num_breakers);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_testing(
|
||||
int32_t context_size,
|
||||
float dry_multiplier,
|
||||
|
||||
2362
llama/llama.cpp/src/llama-vocab.cpp
vendored
2362
llama/llama.cpp/src/llama-vocab.cpp
vendored
File diff suppressed because it is too large
Load Diff
273
llama/llama.cpp/src/llama-vocab.h
vendored
273
llama/llama.cpp/src/llama-vocab.h
vendored
@@ -4,179 +4,122 @@
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
struct llm_tokenizer;
|
||||
struct LLM_KV;
|
||||
struct llama_model_loader;
|
||||
|
||||
struct llama_vocab {
|
||||
using id = llama_token;
|
||||
using token = std::string;
|
||||
using tattr = llama_token_attr;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
tattr attr;
|
||||
std::string text;
|
||||
float score;
|
||||
llama_token_attr attr;
|
||||
};
|
||||
|
||||
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
|
||||
int max_token_len = 0; // used for optimizing longest token search
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
|
||||
std::vector<id> cache_special_tokens;
|
||||
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
||||
|
||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||
|
||||
// default LLaMA special tokens
|
||||
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_eot_id = LLAMA_TOKEN_NULL;
|
||||
id special_eom_id = LLAMA_TOKEN_NULL;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
|
||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
id linefeed_id = 13;
|
||||
|
||||
// fim tokens
|
||||
id special_fim_pre_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_suf_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_mid_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_pad_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
|
||||
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
|
||||
|
||||
// set of all tokens that cause "end of generation"
|
||||
std::set<id> special_eog_ids;
|
||||
|
||||
// tokenizer flags
|
||||
bool tokenizer_add_space_prefix = false;
|
||||
bool tokenizer_add_bos = false;
|
||||
bool tokenizer_add_eos = false;
|
||||
bool tokenizer_ignore_merges = false;
|
||||
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
|
||||
bool tokenizer_remove_extra_whitespaces = false;
|
||||
bool tokenizer_escape_whitespaces = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix = false;
|
||||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
llm_tokenizer * tokenizer = nullptr;
|
||||
|
||||
llama_vocab() = default;
|
||||
llama_vocab();
|
||||
~llama_vocab();
|
||||
|
||||
void load(llama_model_loader & ml, const LLM_KV & kv);
|
||||
|
||||
enum llama_vocab_type get_type() const;
|
||||
enum llama_vocab_pre_type get_pre_type() const;
|
||||
|
||||
uint32_t n_tokens() const;
|
||||
uint32_t n_token_types() const;
|
||||
|
||||
std::string type_name() const;
|
||||
|
||||
bool is_normal (llama_token id) const;
|
||||
bool is_unknown (llama_token id) const;
|
||||
bool is_control (llama_token id) const;
|
||||
bool is_byte (llama_token id) const;
|
||||
bool is_user_defined(llama_token id) const;
|
||||
bool is_unused (llama_token id) const;
|
||||
bool is_eog (llama_token id) const;
|
||||
|
||||
uint8_t token_to_byte(llama_token id) const;
|
||||
llama_token byte_to_token(uint8_t ch) const;
|
||||
|
||||
llama_token text_to_token(const std::string & text) const;
|
||||
|
||||
const token_data & get_token_data(llama_token id) const;
|
||||
|
||||
const char * token_get_text (llama_token id) const;
|
||||
float token_get_score(llama_token id) const;
|
||||
llama_token_attr token_get_attr (llama_token id) const;
|
||||
|
||||
llama_token token_bos() const;
|
||||
llama_token token_eos() const;
|
||||
llama_token token_eot() const;
|
||||
llama_token token_eom() const;
|
||||
llama_token token_unk() const;
|
||||
llama_token token_sep() const;
|
||||
llama_token token_nl () const;
|
||||
llama_token token_pad() const;
|
||||
|
||||
llama_token token_prefix() const;
|
||||
llama_token token_middle() const;
|
||||
llama_token token_suffix() const;
|
||||
|
||||
llama_token token_fim_pre() const;
|
||||
llama_token token_fim_suf() const;
|
||||
llama_token token_fim_mid() const;
|
||||
llama_token token_fim_pad() const;
|
||||
llama_token token_fim_rep() const;
|
||||
llama_token token_fim_sep() const;
|
||||
|
||||
bool get_add_space_prefix () const;
|
||||
bool get_add_bos () const;
|
||||
bool get_add_eos () const;
|
||||
bool get_ignore_merges () const;
|
||||
bool get_clean_spaces () const;
|
||||
bool get_remove_extra_whitespaces () const;
|
||||
bool get_escape_whitespaces () const;
|
||||
bool get_treat_whitespace_as_suffix() const;
|
||||
|
||||
int max_token_len() const;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||
|
||||
void init_tokenizer();
|
||||
int32_t tokenize(
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_special,
|
||||
bool parse_special) const;
|
||||
|
||||
std::vector<llama_token> tokenize(
|
||||
const std::string & raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false) const;
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t token_to_piece(
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
int32_t lstrip,
|
||||
bool special) const;
|
||||
|
||||
// use cached data
|
||||
const std::string & token_to_piece(llama_token token) const;
|
||||
|
||||
int32_t detokenize(
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special) const;
|
||||
|
||||
std::string detokenize(
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special) const;
|
||||
|
||||
void print_info() const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
|
||||
// TODO: rename to llama_tokenize_impl
|
||||
// TODO: This should probably be in llama.h
|
||||
std::vector<llama_vocab::id> llama_tokenize_internal(
|
||||
const llama_vocab & vocab,
|
||||
std::string raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
// TODO: move the API below as member functions of llama_vocab
|
||||
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
|
||||
|
||||
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||
|
||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
|
||||
|
||||
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
|
||||
|
||||
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||
|
||||
int32_t llama_tokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_special,
|
||||
bool parse_special);
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t llama_token_to_piece_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
int32_t lstrip,
|
||||
bool special);
|
||||
|
||||
// check if token0 is contained as a prefix in token1
|
||||
bool llama_token_is_prefix_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token0,
|
||||
llama_token token1);
|
||||
|
||||
int32_t llama_detokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special);
|
||||
|
||||
std::string llama_detokenize(
|
||||
const struct llama_vocab & vocab,
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special);
|
||||
|
||||
3697
llama/llama.cpp/src/llama.cpp
vendored
3697
llama/llama.cpp/src/llama.cpp
vendored
File diff suppressed because it is too large
Load Diff
16
llama/llama.cpp/src/unicode.cpp
vendored
16
llama/llama.cpp/src/unicode.cpp
vendored
@@ -12,18 +12,17 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <codecvt>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
#include <codecvt>
|
||||
|
||||
size_t unicode_len_utf8(char src) {
|
||||
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
@@ -641,7 +640,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
result.reserve(utf8.size());
|
||||
size_t offset = 0;
|
||||
while (offset < utf8.size()) {
|
||||
result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
||||
try {
|
||||
result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
||||
}
|
||||
catch (const std::invalid_argument & /*ex*/) {
|
||||
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
|
||||
++offset;
|
||||
result.emplace_back(0xFFFD); // replacement character
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
|
||||
std::string text_collapsed;
|
||||
if (need_collapse) {
|
||||
// collapse all unicode categories
|
||||
|
||||
Reference in New Issue
Block a user