diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..f0c6db5d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 90fef6e5..5e002a22 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: | case ${{ matrix.arch }} in diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 diff --git a/cmd/cmd.go b/cmd/cmd.go index 2252a905..b761d018 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } diff --git a/docs/api.md b/docs/api.md index c577bb1a..4381c376 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST 
/api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1034,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1047,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" }' ``` @@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1132,45 @@ A single JSON object will be returned. ] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." 
+}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` \ No newline at end of file diff --git a/envconfig/config.go b/envconfig/config.go index 62d661eb..0abc6968 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -43,8 +43,6 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MAX_VRAM in the environment - MaxVRAM uint64 // Set via OLLAMA_MODELS in the environment ModelsDir string // Set via OLLAMA_NOHISTORY in the environment @@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, @@ -194,16 +191,6 @@ func LoadConfig() { TmpDir = clean("OLLAMA_TMPDIR") - userLimit := clean("OLLAMA_MAX_VRAM") - if userLimit != "" { - avail, err := strconv.ParseUint(userLimit, 10, 64) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err) - } else { - MaxVRAM = avail - } - } - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index d66ba9f0..8593285b 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram != "" { max, err := strconv.ParseUint(vram, 10, 64) require.NoError(t, err) @@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } diff --git a/llm/llama.cpp b/llm/llama.cpp index a8db2a9c..d94c6e0c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 +Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 341a6f59..646bc49c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..172640e2 100644 +index 8fe51971..7113ba64 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5357,16 +5357,7 @@ static void llm_load_vocab( +@@ -5433,16 +5433,7 @@ static void llm_load_vocab( if (vocab.type == 
LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5439,7 +5430,8 @@ static void llm_load_vocab( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; +@@ -5526,7 +5517,8 @@ static void llm_load_vocab( + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; + vocab.tokenizer_clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff similarity index 100% rename from llm/patches/07-embeddings.diff rename to llm/patches/06-embeddings.diff diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff deleted file mode 100644 index 1c7109f6..00000000 --- a/llm/patches/06-qwen2.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 40d2ec2c..f34eb79a 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - -- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { -+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff similarity index 100% rename from llm/patches/08-clip-unicode.diff rename to llm/patches/07-clip-unicode.diff diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff similarity index 100% rename from llm/patches/09-pooling.diff rename to llm/patches/08-pooling.diff diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff new file mode 100644 index 00000000..fc1017a6 --- /dev/null +++ b/llm/patches/09-lora.diff @@ -0,0 +1,360 @@ +diff --git a/common/common.cpp b/common/common.cpp +index dbb724fb..c26fe6ee 100644 +--- a/common/common.cpp ++++ b/common/common.cpp +@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); ++ ++ // try to load as gguf + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { +- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +- llama_free(lctx); +- llama_free_model(model); +- return std::make_tuple(nullptr, nullptr); ++ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); ++ ++ // if that fails, try loading as ggla for compatibility ++ int err = llama_model_apply_lora_from_file(model, ++ lora_adapter.c_str(), ++ lora_scale, ++ ((i > 0) || params.lora_base.empty()) ++ ? 
NULL ++ : params.lora_base.c_str(), ++ params.n_threads); ++ if (err != 0) { ++ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); ++ llama_free(lctx); ++ llama_free_model(model); ++ return std::make_tuple(nullptr, nullptr); ++ } ++ } else { ++ llama_lora_adapter_set(lctx, adapter, lora_scale); + } +- llama_lora_adapter_set(lctx, adapter, lora_scale); + } + + if (params.ignore_eos) { +diff --git a/include/llama.h b/include/llama.h +index 93fd77ca..b0fb37a6 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -1160,6 +1160,20 @@ extern "C" { + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + ++ // Apply a LoRA adapter to a loaded model ++ // path_base_model is the path to a higher quality model to use as a base for ++ // the layers modified by the adapter. Can be NULL to use the current loaded model. ++ // The model needs to be reloaded before applying a new adapter, otherwise the adapter ++ // will be applied on top of the previous one ++ // Returns 0 on success ++ LLAMA_API int32_t llama_model_apply_lora_from_file( ++ const struct llama_model * model, ++ const char * path_lora, ++ float scale, ++ const char * path_base_model, ++ int32_t n_threads); ++ ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/llama.cpp b/src/llama.cpp +index 80a0dd0f..9d7b0e17 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, + fputs(text, stderr); + fflush(stderr); + } ++ ++static int llama_apply_lora_from_file_internal( ++ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ++) { ++ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); ++ ++ const int64_t t_start_lora_us = ggml_time_us(); ++ ++ llama_file fin(path_lora, "rb"); ++ ++ // verify magic and version ++ { ++ uint32_t magic = fin.read_u32(); ++ if (magic != LLAMA_FILE_MAGIC_GGLA) { ++ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); ++ return 1; ++ } ++ ++ uint32_t format_version = fin.read_u32(); ++ if (format_version != 1) { ++ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); ++ return 1; ++ } ++ } ++ ++ int32_t lora_r = fin.read_u32(); ++ int32_t lora_alpha = fin.read_u32(); ++ float scaling = scale * (float)lora_alpha / (float)lora_r; ++ ++ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); ++ ++ // load base model ++ std::unique_ptr ml; ++ if (path_base_model) { ++ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ++ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ++ ml->init_mappings(/*prefetch*/ false); // no prefetching ++ } ++ ++ struct tensor_meta { ++ std::string name; ++ ggml_type type; ++ int32_t ne[2]; ++ size_t offset; ++ }; ++ std::map tensor_meta_map; ++ ++ // load all tensor meta ++ while (true) { ++ if (fin.tell() == fin.size) { ++ // eof ++ break; ++ } ++ ++ int32_t n_dims; ++ int32_t name_len; ++ int32_t ftype; ++ ++ fin.read_raw(&n_dims, sizeof(n_dims)); ++ fin.read_raw(&name_len, sizeof(name_len)); ++ fin.read_raw(&ftype, sizeof(ftype)); ++ ++ if (n_dims != 1 && n_dims != 2) { ++ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); ++ return 1; ++ } ++ ++ int32_t ne[2] = { 1, 1 }; ++ for (int i = 0; i < n_dims; ++i) { ++ fin.read_raw(&ne[i], sizeof(ne[i])); ++ 
} ++ ++ std::string name; ++ { ++ GGML_ASSERT(name_len < GGML_MAX_NAME); ++ char buf[GGML_MAX_NAME]; ++ fin.read_raw(buf, name_len); ++ name = std::string(buf, name_len); ++ } ++ ++ // check for lora suffix ++ std::string lora_suffix; ++ if (name.length() > 6) { ++ lora_suffix = name.substr(name.length() - 6); ++ } ++ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { ++ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); ++ return 1; ++ } ++ ++ // tensor type ++ ggml_type wtype; ++ switch (ftype) { ++ case 0: wtype = GGML_TYPE_F32; break; ++ case 1: wtype = GGML_TYPE_F16; break; ++ default: ++ { ++ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", ++ __func__, ftype); ++ return 1; ++ } ++ } ++ ++ // data offset ++ size_t offset = fin.tell(); ++ offset = (offset + 31) & -32; ++ ++ // skip tensor data ++ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); ++ ++ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); ++ } ++ ++ bool warned = false; ++ int n_tensors = 0; ++ ++ // apply ++ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); ++ if (backend_cpu == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); ++ return 1; ++ } ++ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); ++ ++ std::vector> read_buf; ++ for (const auto & it : model.tensors_by_name) { ++ const std::string & base_name = it.first; ++ ggml_tensor * model_t = it.second; ++ ++ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || ++ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { ++ continue; ++ } ++ ++ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); ++ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); ++ ++ ggml_init_params lora_init_params = { ++ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_buffer */ nullptr, ++ /* .no_alloc */ true, ++ }; ++ ggml_context * lora_ctx = ggml_init(lora_init_params); ++ if (lora_ctx == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ // create tensors ++ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); ++ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); ++ ggml_set_name(loraA, metaA.name.c_str()); ++ ggml_set_name(loraB, metaB.name.c_str()); ++ ++ ggml_tensor * base_t; ++ if (ml) { ++ if (!ml->get_tensor_meta(base_name.c_str())) { ++ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); ++ return 1; ++ } ++ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); ++ } else { ++ base_t = ggml_dup_tensor(lora_ctx, model_t); ++ } ++ ggml_set_name(base_t, base_name.c_str()); ++ ++ // allocate in backend buffer ++ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (lora_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); ++ return 1; ++ } ++ ++ // load tensor data ++ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { ++ read_buf.resize(ggml_nbytes(tensor)); ++ fin.seek(tensor_meta.offset, SEEK_SET); ++ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); ++ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); ++ }; ++ load_tensor(metaA, loraA); ++ 
load_tensor(metaB, loraB); ++ ++ // load base model tensor data ++ if (ml) { ++ ml->load_data_for(base_t); ++ } else { ++ ggml_backend_tensor_copy(model_t, base_t); ++ } ++ ++ if (ggml_is_quantized(base_t->type) && !warned) { ++ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " ++ "use a f16 or f32 base model with --lora-base\n", __func__); ++ warned = true; ++ } ++ ++ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { ++ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" ++ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ auto build_lora_graph = [&]() { ++ // w = w + BA*s ++ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ++ ggml_set_name(BA, "BA"); ++ ++ if (scaling != 1.0f) { ++ BA = ggml_scale(lora_ctx, BA, scaling); ++ ggml_set_name(BA, "BA_scaled"); ++ } ++ ++ ggml_tensor * r; ++ r = ggml_add_inplace(lora_ctx, base_t, BA); ++ ggml_set_name(r, "r_add"); ++ ++ if (base_t->type != model_t->type) { ++ // convert the result to the model type ++ r = ggml_cast(lora_ctx, r, model_t->type); ++ ggml_set_name(r, "r_cast"); ++ } ++ ++ return r; ++ }; ++ ++ ggml_cgraph * gf = ggml_new_graph(lora_ctx); ++ ggml_tensor * r = build_lora_graph(); ++ ggml_build_forward_expand(gf, r); ++ ++ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (graph_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ ggml_backend_graph_compute(backend_cpu, gf); ++ ++ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); ++ ++#if 0 ++ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU ++ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); ++ ++ // sched compute ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_init_measure(sched, gf); ++ ++ // create the graph again, since the previous one was destroyed by the measure ++ ggml_graph_clear(gf); ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_graph_compute(sched, gf); ++ ggml_backend_sched_free(sched); ++#endif ++ ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_buffer_free(graph_buf); ++ ggml_free(lora_ctx); ++ ++ n_tensors++; ++ if (n_tensors % 4 == 0) { ++ LLAMA_LOG_INFO("."); ++ } ++ } ++ ++ ggml_backend_free(backend_cpu); ++ ++ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; ++ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); ++ ++ return 0; ++} ++ ++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { ++ try { ++ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); ++ } catch (const std::exception & err) { ++ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); ++ return 1; ++ } ++} +\ No newline at end of file diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff deleted file mode 100644 index 56a583e0..00000000 --- a/llm/patches/10-tekken.diff +++ /dev/null @@ -1,43 +0,0 @@ -diff --git 
a/include/llama.h b/include/llama.h -index bb4b05ba..a92174e0 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -92,6 +92,7 @@ extern "C" { - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, -+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - }; - - // note: these values should be synchronized with ggml_rope -diff --git a/src/llama.cpp b/src/llama.cpp -index 18364976..435b6fe5 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -5429,6 +5429,12 @@ static void llm_load_vocab( - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; -+ } else if ( -+ tokenizer_pre == "tekken") { -+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; -+ vocab.tokenizer_clean_spaces = false; -+ vocab.tokenizer_ignore_merges = true; -+ vocab.tokenizer_add_bos = true; - } else { - LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }; - break; -+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: -+ // original regex from tokenizer.json -+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" -+ regex_exprs = { -+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", -+ }; -+ break; - default: - // default regex for BPE tokenization pre-processing - regex_exprs = { diff --git a/llm/patches/11-embd_kv.diff b/llm/patches/11-embd_kv.diff deleted file mode 100644 index ad17a700..00000000 --- a/llm/patches/11-embd_kv.diff +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..e60d3d8d 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6052,10 +6052,10 @@ static bool llm_load_tensors( - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - -- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); -- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); -- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); -- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - - // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); diff --git a/llm/server.go b/llm/server.go index ba7eab03..08463ef0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // reap subprocess when it exits go func() { - s.done <- s.cmd.Wait() + err := s.cmd.Wait() + // Favor a more detailed message 
over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Debug("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade" + } + s.done <- fmt.Errorf(s.status.LastErrMsg) + } else { + s.done <- err + } }() return s, nil @@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { slog.Warn("client connection closed before server finished loading, aborting load") return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err()) case err := <-s.done: - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - if strings.Contains(msg, "unknown model") { - return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") - } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + return fmt.Errorf("llama runner process has terminated: %w", err) default: } if time.Now().After(stallTimer) { diff --git a/server/model.go b/server/model.go index a084dd8c..bf38c415 100644 --- a/server/model.go +++ b/server/model.go @@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { } } + if name == "" || arguments == "" { + return nil, false + } + var objs []map[string]any for offset := 0; offset < len(s); { var obj map[string]any @@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { return nil, false } else { offset += int(decoder.InputOffset()) - objs = append(objs, obj) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) 
} } var toolCalls []api.ToolCall for _, kv := range objs { - var call api.ToolCall - for k, v := range kv { - switch k { - case name: - call.Function.Name = v.(string) - case arguments: - call.Function.Arguments = v.(map[string]any) - } + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) } - - toolCalls = append(toolCalls, call) } return toolCalls, len(toolCalls) > 0 diff --git a/server/model_test.go b/server/model_test.go index 7c826b06..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} `, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, } var tools []api.Tool diff --git a/server/routes.go b/server/routes.go index 85db7924..e6ffe526 100644 --- a/server/routes.go +++ b/server/routes.go @@ -609,12 +609,11 @@ func (s *Server) CreateModelHandler(c *gin.Context) { defer cancel() quantization := cmp.Or(r.Quantize, r.Quantization) - if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { - if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} - } + if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) { + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + } else if err != nil { ch <- gin.H{"error": err.Error()} - } + } }() if r.Stream != nil && !*r.Stream { diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl new file mode 100644 index 00000000..51513d69 --- /dev/null +++ b/server/testdata/tools/xlam.gotmpl @@ -0,0 +1,45 @@ +{{- if .System }}{{ .System }} +{{ end }} +{{- range $i, $_ := .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +{{ $.Tools }} +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... 
(more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +{{ .Content }} +[END OF QUERY] + + +{{ else }} +{{ .Content }} +{{ end }} +{{- else if .ToolCalls }}### Response: +{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} +<|EOT|> +{{ else if eq .Role "assistant" }}### Response: +{{ .Content }} +<|EOT|> +{{ end }} +{{- end }}### Response: \ No newline at end of file diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out new file mode 100644 index 00000000..a4a9952f --- /dev/null +++ b/server/testdata/tools/xlam.out @@ -0,0 +1,40 @@ +You are a knowledgable assistant. You can answer questions and perform tasks. +### Instruction: +What's the weather like today in Paris? +### Response: +{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} +<|EOT|> +### Response: +The current temperature in Paris, France is 22 degrees Celsius. +<|EOT|> +### Instruction: +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +What's the weather like today in San Francisco and Toronto? +[END OF QUERY] + + +### Response: \ No newline at end of file
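
The docs/api.md changes above introduce the new `/api/embed` endpoint (batched `input`, the `truncate` flag, and an `embeddings` array in the response). As a quick illustration of the documented request/response shape, here is a minimal Go sketch of a client call; the struct and variable names are illustrative, assume a local Ollama server on the default port, and are not part of this repository's Go API client.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// Field names follow the /api/embed documentation added in docs/api.md.
type embedRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

type embedResponse struct {
	Model      string      `json:"model"`
	Embeddings [][]float64 `json:"embeddings"`
}

func main() {
	body, err := json.Marshal(embedRequest{
		Model: "all-minilm",
		Input: []string{"Why is the sky blue?", "Why is the grass green?"},
	})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://localhost:11434/api/embed", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out embedResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}

	// One embedding vector is returned per input string, in order.
	fmt.Println(out.Model, len(out.Embeddings))
}
```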
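
The server/model.go change replaces the flat field loop in `parseToolCalls` with a recursive walk that gathers every nested JSON object before matching the template's name/arguments keys, which is what lets the new xlam-style `{"tool_calls": [...]}` wrapper in the added test case parse. Below is a self-contained sketch of that walk, assuming hard-coded `"name"`/`"arguments"` keys (the real code derives them from the model template).

```go
package main

import (
	"encoding/json"
	"fmt"
)

// collect mirrors the recursive helper added to parseToolCalls: it walks a
// decoded JSON value and returns every object found at any nesting depth,
// so wrappers such as {"tool_calls": [...]} still expose the inner objects.
func collect(v any) []map[string]any {
	var all []map[string]any
	switch o := v.(type) {
	case map[string]any:
		all = append(all, o)
		for _, inner := range o {
			all = append(all, collect(inner)...)
		}
	case []any:
		for _, inner := range o {
			all = append(all, collect(inner)...)
		}
	}
	return all
}

func main() {
	raw := `{"tool_calls": [{"name": "get_current_weather", "arguments": {"location": "Paris, France", "format": "celsius"}}]}`

	var obj map[string]any
	if err := json.Unmarshal([]byte(raw), &obj); err != nil {
		panic(err)
	}

	// Only objects carrying both keys with the expected types become tool calls,
	// matching the nok/aok type assertions in the patched loop.
	for _, kv := range collect(obj) {
		if name, nok := kv["name"].(string); nok {
			if args, aok := kv["arguments"].(map[string]any); aok {
				fmt.Println(name, args)
			}
		}
	}
}
```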