mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-23 23:18:26 +00:00
Merge branch 'ollama:main' into main
This commit is contained in:
2
llm/ext_server/CMakeLists.txt
vendored
2
llm/ext_server/CMakeLists.txt
vendored
@@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
|
||||
add_executable(${TARGET} server.cpp utils.hpp httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
|
||||
24596
llm/ext_server/json.hpp
vendored
24596
llm/ext_server/json.hpp
vendored
File diff suppressed because it is too large
Load Diff
25
llm/ext_server/server.cpp
vendored
25
llm/ext_server/server.cpp
vendored
@@ -262,7 +262,7 @@ struct server_slot {
|
||||
char buffer[512];
|
||||
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
||||
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||
snprintf(buffer, sizeof(buffer), "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_prompt_processing, n_prompt_tokens_processed,
|
||||
t_token, n_tokens_second);
|
||||
LOG_DEBUG(buffer, {
|
||||
@@ -276,7 +276,7 @@ struct server_slot {
|
||||
|
||||
t_token = t_token_generation / n_decoded;
|
||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||
snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_token_generation, n_decoded,
|
||||
t_token, n_tokens_second);
|
||||
LOG_DEBUG(buffer, {
|
||||
@@ -288,7 +288,7 @@ struct server_slot {
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||
snprintf(buffer, sizeof(buffer), " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||
LOG_DEBUG(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
@@ -425,7 +425,7 @@ struct llama_server_context
|
||||
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
add_bos_token = llama_add_bos_token(model);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1031,7 +1031,7 @@ struct llama_server_context
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG_TEE("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
@@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
params.cpuparams.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--grp-attn-n" || arg == "-gan")
|
||||
{
|
||||
@@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-http")
|
||||
{
|
||||
@@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
||||
|
||||
if (invalid_param)
|
||||
{
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
@@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
|
||||
{"commit", LLAMA_COMMIT}});
|
||||
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
{"n_threads_batch", params.n_threads_batch},
|
||||
{"n_threads", params.cpuparams.n_threads},
|
||||
{"n_threads_batch", params.cpuparams_batch.n_threads},
|
||||
{"total_threads", std::thread::hardware_concurrency()},
|
||||
{"system_info", llama_print_system_info()},
|
||||
});
|
||||
|
||||
@@ -19,7 +19,7 @@ sign() {
|
||||
fi
|
||||
}
|
||||
|
||||
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
|
||||
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
|
||||
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
|
||||
@@ -360,11 +360,13 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
|
||||
switch llm.KV().Architecture() {
|
||||
case "llama":
|
||||
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
|
||||
fullOffload = max(
|
||||
4*batch*(1+4*embedding+context*(1+heads)),
|
||||
4*batch*(embedding+vocab),
|
||||
)
|
||||
|
||||
partialOffload = 4 * batch * embedding
|
||||
partialOffload += max(
|
||||
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
|
||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
|
||||
Submodule llm/llama.cpp updated: 1e6f6554aa...8962422b1c
@@ -7,6 +7,7 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
// Overflow that didn't fit into the GPU
|
||||
var overflow uint64
|
||||
|
||||
overhead := envconfig.GpuOverhead()
|
||||
availableList := make([]string, len(gpus))
|
||||
for i, gpu := range gpus {
|
||||
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
|
||||
@@ -164,8 +166,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
gzo = gpuZeroOverhead
|
||||
}
|
||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
||||
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
||||
if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||
slog.Debug("gpu has too little memory to allocate any layers",
|
||||
"id", gpus[i].ID,
|
||||
"library", gpus[i].Library,
|
||||
"variant", gpus[i].Variant,
|
||||
"compute", gpus[i].Compute,
|
||||
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
|
||||
"name", gpus[i].Name,
|
||||
"total", format.HumanBytes2(gpus[i].TotalMemory),
|
||||
"available", format.HumanBytes2(gpus[i].FreeMemory),
|
||||
"minimum_memory", gpus[i].MinimumMemory,
|
||||
"layer_size", format.HumanBytes2(layerSize),
|
||||
"gpu_zer_overhead", format.HumanBytes2(gzo),
|
||||
"partial_offload", format.HumanBytes2(graphPartialOffload),
|
||||
"full_offload", format.HumanBytes2(graphFullOffload),
|
||||
)
|
||||
continue
|
||||
}
|
||||
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
||||
@@ -196,7 +212,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[i%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > used+layerSize {
|
||||
if (g.g.FreeMemory - overhead) > used+layerSize {
|
||||
gpuAllocations[g.i] += layerSize
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
@@ -219,7 +235,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[layerCount%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > used+memoryLayerOutput {
|
||||
if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
|
||||
gpuAllocations[g.i] += memoryLayerOutput
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
@@ -306,6 +322,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
}
|
||||
|
||||
func (m MemoryEstimate) log() {
|
||||
overhead := envconfig.GpuOverhead()
|
||||
slog.Info(
|
||||
"offload to "+m.inferenceLibrary,
|
||||
slog.Group(
|
||||
@@ -323,6 +340,7 @@ func (m MemoryEstimate) log() {
|
||||
"memory",
|
||||
// memory available by GPU for offloading
|
||||
"available", m.availableList,
|
||||
"gpu_overhead", format.HumanBytes2(overhead),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index a207451f..2ddf431d 100644
|
||||
index 88355971..dd7d41ed 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
|
||||
@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = true;
|
||||
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "codeshell") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||||
@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "exaone") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
|
||||
@@ -1,37 +1,36 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 1fe2b9f7..a43312a7 100644
|
||||
index 88355971..d7db689b 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||
@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
- const bool has_logits = !cparams.embeddings;
|
||||
+ const bool has_logits = cparams.causal_attn;
|
||||
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
|
||||
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
||||
|
||||
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
|
||||
@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
|
||||
// no output
|
||||
res = nullptr;
|
||||
embd = nullptr;
|
||||
- } else if (cparams.embeddings) {
|
||||
- res = nullptr; // do not extract logits for embedding case
|
||||
- embd = gf->nodes[gf->n_nodes - 1];
|
||||
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
|
||||
- embd = gf->nodes[gf->n_nodes - 2];
|
||||
- res = nullptr; // do not extract logits for embedding case
|
||||
- embd = nullptr;
|
||||
+ }
|
||||
+
|
||||
+ if (cparams.embeddings) {
|
||||
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
|
||||
- embd = gf->nodes[i];
|
||||
+ embd = gf->nodes[i];
|
||||
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
|
||||
+ break;
|
||||
+ }
|
||||
break;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
|
||||
- } else {
|
||||
+ } else {
|
||||
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
|
||||
} else {
|
||||
embd = nullptr; // do not extract embeddings when not needed
|
||||
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
||||
}
|
||||
@@ -39,7 +38,6 @@ index 1fe2b9f7..a43312a7 100644
|
||||
+ if (!cparams.causal_attn) {
|
||||
+ res = nullptr; // do not extract logits when not needed
|
||||
+ }
|
||||
+
|
||||
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||
|
||||
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
||||
|
||||
@@ -1,350 +0,0 @@
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index 2e8374d5..70d0afde 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
- llama_free(lctx);
|
||||
- llama_free_model(model);
|
||||
- return iparams;
|
||||
+
|
||||
+ // if that fails, try loading as ggla for compatibility
|
||||
+ int err = llama_model_apply_lora_from_file(model,
|
||||
+ la.path.c_str(),
|
||||
+ la.scale,
|
||||
+ nullptr,
|
||||
+ params.n_threads);
|
||||
+ if (err != 0) {
|
||||
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
+ llama_free(lctx);
|
||||
+ llama_free_model(model);
|
||||
+ return iparams;
|
||||
+ } else {
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||
}
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 93fd77ca..b0fb37a6 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -1160,6 +1160,20 @@ extern "C" {
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
+ // Apply a LoRA adapter to a loaded model
|
||||
+ // path_base_model is the path to a higher quality model to use as a base for
|
||||
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
+ // will be applied on top of the previous one
|
||||
+ // Returns 0 on success
|
||||
+ LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
+ const struct llama_model * model,
|
||||
+ const char * path_lora,
|
||||
+ float scale,
|
||||
+ const char * path_base_model,
|
||||
+ int32_t n_threads);
|
||||
+
|
||||
+
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 80a0dd0f..9d7b0e17 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
|
||||
fputs(text, stderr);
|
||||
fflush(stderr);
|
||||
}
|
||||
+
|
||||
+static int llama_apply_lora_from_file_internal(
|
||||
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
||||
+) {
|
||||
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
+
|
||||
+ const int64_t t_start_lora_us = ggml_time_us();
|
||||
+
|
||||
+ llama_file fin(path_lora, "rb");
|
||||
+
|
||||
+ // verify magic and version
|
||||
+ {
|
||||
+ uint32_t magic = fin.read_u32();
|
||||
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ uint32_t format_version = fin.read_u32();
|
||||
+ if (format_version != 1) {
|
||||
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
||||
+ return 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ int32_t lora_r = fin.read_u32();
|
||||
+ int32_t lora_alpha = fin.read_u32();
|
||||
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
|
||||
+
|
||||
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
+
|
||||
+ // load base model
|
||||
+ std::unique_ptr<llama_model_loader> ml;
|
||||
+ if (path_base_model) {
|
||||
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
||||
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||
+ }
|
||||
+
|
||||
+ struct tensor_meta {
|
||||
+ std::string name;
|
||||
+ ggml_type type;
|
||||
+ int32_t ne[2];
|
||||
+ size_t offset;
|
||||
+ };
|
||||
+ std::map<std::string, tensor_meta> tensor_meta_map;
|
||||
+
|
||||
+ // load all tensor meta
|
||||
+ while (true) {
|
||||
+ if (fin.tell() == fin.size) {
|
||||
+ // eof
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ int32_t n_dims;
|
||||
+ int32_t name_len;
|
||||
+ int32_t ftype;
|
||||
+
|
||||
+ fin.read_raw(&n_dims, sizeof(n_dims));
|
||||
+ fin.read_raw(&name_len, sizeof(name_len));
|
||||
+ fin.read_raw(&ftype, sizeof(ftype));
|
||||
+
|
||||
+ if (n_dims != 1 && n_dims != 2) {
|
||||
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ int32_t ne[2] = { 1, 1 };
|
||||
+ for (int i = 0; i < n_dims; ++i) {
|
||||
+ fin.read_raw(&ne[i], sizeof(ne[i]));
|
||||
+ }
|
||||
+
|
||||
+ std::string name;
|
||||
+ {
|
||||
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
|
||||
+ char buf[GGML_MAX_NAME];
|
||||
+ fin.read_raw(buf, name_len);
|
||||
+ name = std::string(buf, name_len);
|
||||
+ }
|
||||
+
|
||||
+ // check for lora suffix
|
||||
+ std::string lora_suffix;
|
||||
+ if (name.length() > 6) {
|
||||
+ lora_suffix = name.substr(name.length() - 6);
|
||||
+ }
|
||||
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
||||
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // tensor type
|
||||
+ ggml_type wtype;
|
||||
+ switch (ftype) {
|
||||
+ case 0: wtype = GGML_TYPE_F32; break;
|
||||
+ case 1: wtype = GGML_TYPE_F16; break;
|
||||
+ default:
|
||||
+ {
|
||||
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
||||
+ __func__, ftype);
|
||||
+ return 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // data offset
|
||||
+ size_t offset = fin.tell();
|
||||
+ offset = (offset + 31) & -32;
|
||||
+
|
||||
+ // skip tensor data
|
||||
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
||||
+
|
||||
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
||||
+ }
|
||||
+
|
||||
+ bool warned = false;
|
||||
+ int n_tensors = 0;
|
||||
+
|
||||
+ // apply
|
||||
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
+ if (backend_cpu == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
||||
+
|
||||
+ std::vector<no_init<uint8_t>> read_buf;
|
||||
+ for (const auto & it : model.tensors_by_name) {
|
||||
+ const std::string & base_name = it.first;
|
||||
+ ggml_tensor * model_t = it.second;
|
||||
+
|
||||
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
||||
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
||||
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
||||
+
|
||||
+ ggml_init_params lora_init_params = {
|
||||
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||
+ /* .mem_buffer */ nullptr,
|
||||
+ /* .no_alloc */ true,
|
||||
+ };
|
||||
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
|
||||
+ if (lora_ctx == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // create tensors
|
||||
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
||||
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
||||
+ ggml_set_name(loraA, metaA.name.c_str());
|
||||
+ ggml_set_name(loraB, metaB.name.c_str());
|
||||
+
|
||||
+ ggml_tensor * base_t;
|
||||
+ if (ml) {
|
||||
+ if (!ml->get_tensor_meta(base_name.c_str())) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
+ return 1;
|
||||
+ }
|
||||
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
||||
+ } else {
|
||||
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
|
||||
+ }
|
||||
+ ggml_set_name(base_t, base_name.c_str());
|
||||
+
|
||||
+ // allocate in backend buffer
|
||||
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
+ if (lora_buf == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // load tensor data
|
||||
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
||||
+ read_buf.resize(ggml_nbytes(tensor));
|
||||
+ fin.seek(tensor_meta.offset, SEEK_SET);
|
||||
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
||||
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
||||
+ };
|
||||
+ load_tensor(metaA, loraA);
|
||||
+ load_tensor(metaB, loraB);
|
||||
+
|
||||
+ // load base model tensor data
|
||||
+ if (ml) {
|
||||
+ ml->load_data_for(base_t);
|
||||
+ } else {
|
||||
+ ggml_backend_tensor_copy(model_t, base_t);
|
||||
+ }
|
||||
+
|
||||
+ if (ggml_is_quantized(base_t->type) && !warned) {
|
||||
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
+ "use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
+ warned = true;
|
||||
+ }
|
||||
+
|
||||
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
+ ggml_free(lora_ctx);
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ auto build_lora_graph = [&]() {
|
||||
+ // w = w + BA*s
|
||||
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
+ ggml_set_name(BA, "BA");
|
||||
+
|
||||
+ if (scaling != 1.0f) {
|
||||
+ BA = ggml_scale(lora_ctx, BA, scaling);
|
||||
+ ggml_set_name(BA, "BA_scaled");
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * r;
|
||||
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
|
||||
+ ggml_set_name(r, "r_add");
|
||||
+
|
||||
+ if (base_t->type != model_t->type) {
|
||||
+ // convert the result to the model type
|
||||
+ r = ggml_cast(lora_ctx, r, model_t->type);
|
||||
+ ggml_set_name(r, "r_cast");
|
||||
+ }
|
||||
+
|
||||
+ return r;
|
||||
+ };
|
||||
+
|
||||
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||
+ ggml_tensor * r = build_lora_graph();
|
||||
+ ggml_build_forward_expand(gf, r);
|
||||
+
|
||||
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
+ if (graph_buf == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
||||
+ ggml_free(lora_ctx);
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ ggml_backend_graph_compute(backend_cpu, gf);
|
||||
+
|
||||
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
||||
+
|
||||
+#if 0
|
||||
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
||||
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
||||
+
|
||||
+ // sched compute
|
||||
+ ggml_build_forward_expand(gf, build_graph());
|
||||
+ ggml_backend_sched_init_measure(sched, gf);
|
||||
+
|
||||
+ // create the graph again, since the previous one was destroyed by the measure
|
||||
+ ggml_graph_clear(gf);
|
||||
+ ggml_build_forward_expand(gf, build_graph());
|
||||
+ ggml_backend_sched_graph_compute(sched, gf);
|
||||
+ ggml_backend_sched_free(sched);
|
||||
+#endif
|
||||
+
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_buffer_free(graph_buf);
|
||||
+ ggml_free(lora_ctx);
|
||||
+
|
||||
+ n_tensors++;
|
||||
+ if (n_tensors % 4 == 0) {
|
||||
+ LLAMA_LOG_INFO(".");
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+
|
||||
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
||||
+ try {
|
||||
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
||||
+ } catch (const std::exception & err) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
+ return 1;
|
||||
+ }
|
||||
+}
|
||||
\ No newline at end of file
|
||||
@@ -1,43 +0,0 @@
|
||||
From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Wed, 31 Jul 2024 14:57:04 -0700
|
||||
Subject: [PATCH] phi3 sliding window
|
||||
|
||||
---
|
||||
src/llama.cpp | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index a207451f..f2872d4e 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
|
||||
} break;
|
||||
case LLM_ARCH_PHI3:
|
||||
{
|
||||
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
@@ -10762,7 +10762,7 @@ struct llm_build_context {
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||
+ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto residual = inpL;
|
||||
@@ -10820,7 +10820,7 @@ struct llm_build_context {
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
--
|
||||
2.45.2
|
||||
|
||||
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
systemTotalMemory = systemMemInfo.TotalMemory
|
||||
systemFreeMemory = systemMemInfo.FreeMemory
|
||||
systemSwapFreeMemory = systemMemInfo.FreeSwap
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
||||
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
||||
}
|
||||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
@@ -584,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
|
||||
|
||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
stallDuration := 5 * time.Minute // If no progress happens
|
||||
finalLoadDuration := 5 * time.Minute // After we hit 100%, give the runner more time to come online
|
||||
stallDuration := envconfig.LoadTimeout() // If no progress happens
|
||||
stallTimer := time.Now().Add(stallDuration) // give up if we stall
|
||||
|
||||
slog.Info("waiting for llama runner to start responding")
|
||||
@@ -637,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
stallTimer = time.Now().Add(stallDuration)
|
||||
} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
|
||||
slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
|
||||
stallTimer = time.Now().Add(finalLoadDuration)
|
||||
stallTimer = time.Now().Add(stallDuration)
|
||||
fullyLoaded = true
|
||||
}
|
||||
time.Sleep(time.Millisecond * 250)
|
||||
|
||||
Reference in New Issue
Block a user