From bd54b08261c15e927234d03e2b1020e528b38afe Mon Sep 17 00:00:00 2001 From: ManniX-ITA <20623405+mann1x@users.noreply.github.com> Date: Wed, 17 Apr 2024 17:39:52 +0200 Subject: [PATCH 01/28] Streamlined WaitUntilRunning --- llm/server.go | 68 ++++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/llm/server.go b/llm/server.go index 4e808085..25122572 100644 --- a/llm/server.go +++ b/llm/server.go @@ -381,56 +381,42 @@ func (s *LlamaServer) Ping(ctx context.Context) error { func (s *LlamaServer) WaitUntilRunning() error { start := time.Now() - // TODO we need to wire up a better way to detect hangs during model load and startup of the server expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load - ticker := time.NewTicker(50 * time.Millisecond) - defer ticker.Stop() slog.Info("waiting for llama runner to start responding") - var lastStatus ServerStatus = -1 + for { - select { - case err := <-s.done: + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + status, err := s.getServerStatus(ctx) + if err != nil { + slog.Debug("server not yet available", "error", err) + } + if time.Now().After(expiresAt) { + // timeout msg := "" if s.status != nil && s.status.LastErrMsg != "" { msg = s.status.LastErrMsg } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) - case <-ticker.C: - if time.Now().After(expiresAt) { - // timeout - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - return fmt.Errorf("timed out waiting for llama runner to start: %s", msg) - } - if s.cmd.ProcessState != nil { - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) - } - - ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) - defer cancel() - status, err := s.getServerStatus(ctx) - if err != nil && lastStatus != status { - slog.Debug("server not yet available", "error", err) - lastStatus = status - continue - } - - switch status { - case ServerStatusLoadingModel: - // TODO - this state never seems to happen with the current server.cpp code (bug?) 
- // it doesn't respond to the health endpoint until after the model is loaded - slog.Debug("loading model") - case ServerStatusReady: - slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds())) - return nil + return fmt.Errorf("timed out waiting for llama runner to start: %s", msg) + } + if s.cmd.ProcessState != nil { + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg } + return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) + } + switch status { + case ServerStatusLoadingModel: + time.Sleep(time.Millisecond * 250) + slog.Debug("loading model") + case ServerStatusReady: + slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", time.Since(start).Seconds())) + return nil + default: + time.Sleep(time.Millisecond * 250) + continue } } } From c942e4a07b91dc6b78bb245241ea514b752e3d4d Mon Sep 17 00:00:00 2001 From: ManniX-ITA <20623405+mann1x@users.noreply.github.com> Date: Wed, 17 Apr 2024 17:40:32 +0200 Subject: [PATCH 02/28] Fixed startup sequence to report model loading --- llm/ext_server/server.cpp | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 22117037..96df9f4b 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2726,7 +2726,7 @@ static json format_detokenized_response(std::string content) static void log_server_request(const httplib::Request &req, const httplib::Response &res) { // skip GH copilot requests when using default port - if (req.path == "/v1/health" || req.path == "/v1/completions") + if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions") { return; } @@ -3053,6 +3053,26 @@ int main(int argc, char **argv) { log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } + if (sparams.n_threads_http < 1) { + // +2 threads for monitoring endpoints + sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + } + log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); + svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + + LOG_INFO("HTTP server listening", log_data); + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + state.store(SERVER_STATE_ERROR); + return 1; + } + + return 0; + }); + // load the model if (!llama.load_model(params)) { @@ -3257,26 +3277,6 @@ int main(int argc, char **argv) { }*/ //); - if (sparams.n_threads_http < 1) { - // +2 threads for monitoring endpoints - sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; - - LOG_INFO("HTTP server listening", log_data); - // run the HTTP server in a thread - see comment below - std::thread t([&]() - { - if (!svr.listen_after_bind()) - { - state.store(SERVER_STATE_ERROR); - return 1; - } - - return 0; - }); - llama.queue_tasks.on_new_task(std::bind( &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( From 6042e8bc57e40ea1e666baef64da3aa302182e90 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Wed, 8 May 2024 19:49:45 -0700 
Subject: [PATCH 03/28] remove `bash-comparemodels` example --- examples/bash-comparemodels/README.md | 10 --- examples/bash-comparemodels/comparemodels.sh | 64 ------------------- .../bash-comparemodels/sourcequestions.txt | 7 -- 3 files changed, 81 deletions(-) delete mode 100644 examples/bash-comparemodels/README.md delete mode 100755 examples/bash-comparemodels/comparemodels.sh delete mode 100644 examples/bash-comparemodels/sourcequestions.txt diff --git a/examples/bash-comparemodels/README.md b/examples/bash-comparemodels/README.md deleted file mode 100644 index 65e66f1e..00000000 --- a/examples/bash-comparemodels/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Bash Shell examples - -When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other: - -`ollama run llama3 < sourcequestions.txt` - -This concept is used in the following example. - -## Compare Models -`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally. diff --git a/examples/bash-comparemodels/comparemodels.sh b/examples/bash-comparemodels/comparemodels.sh deleted file mode 100755 index 1ce249a6..00000000 --- a/examples/bash-comparemodels/comparemodels.sh +++ /dev/null @@ -1,64 +0,0 @@ -#! /usr/bin/env bash -# Compare multiple models by running them with the same questions - -NUMBEROFCHOICES=4 -SELECTIONS=() -declare -a SUMS=() - -# Get the list of models -CHOICES=$(ollama list | awk '{print $1}') - -# Select which models to run as a comparison -echo "Select $NUMBEROFCHOICES models to compare:" -select ITEM in $CHOICES; do - if [[ -n $ITEM ]]; then - echo "You have selected $ITEM" - SELECTIONS+=("$ITEM") - ((COUNT++)) - if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then - break - fi - else - echo "Invalid selection" - fi -done - -# Loop through each of the selected models -for ITEM in "${SELECTIONS[@]}"; do - echo "--------------------------------------------------------------" - echo "Loading the model $ITEM into memory" - ollama run "$ITEM" "" - echo "--------------------------------------------------------------" - echo "Running the questions through the model $ITEM" - COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr) - - # eval duration is sometimes listed in seconds and sometimes in milliseconds. - # Add up the values for each model - SUM=$(echo "$COMMAND_OUTPUT" | awk ' - /eval duration:/ { - value = $3 - if (index(value, "ms") > 0) { - gsub("ms", "", value) - value /= 1000 - } else { - gsub("s", "", value) - } - sum += value - } - END { print sum }') - - - SUMS+=("All questions for $ITEM completed in $SUM seconds") -done - -echo "" -echo "--------------------------------------------------------------" -echo -e "Sums of eval durations for each run:" -for val in "${SUMS[@]}"; do - echo "$val" -done - -echo "--------------------------------------------------------------" -echo "Comparison complete. Now you can decide" -echo "which model is best." -echo "--------------------------------------------------------------" \ No newline at end of file diff --git a/examples/bash-comparemodels/sourcequestions.txt b/examples/bash-comparemodels/sourcequestions.txt deleted file mode 100644 index 90004c07..00000000 --- a/examples/bash-comparemodels/sourcequestions.txt +++ /dev/null @@ -1,7 +0,0 @@ -Why is the sky blue -What is a black hole -Explain the big bang theory like I am 5? 
-What is the quickest way to win a game of Monopoly with 3 others? -Why does a vacuum bottle keep my coffee hot and my milkshake cold? -What is the difference between a meteor, a meteorite, and a meteoroid? -Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust. \ No newline at end of file From daa1a032f7b7663378aca20fee5558bf9e2dd022 Mon Sep 17 00:00:00 2001 From: Carlos Gamez <118949383+CarlosGamez-Nova@users.noreply.github.com> Date: Thu, 9 May 2024 11:21:03 +0800 Subject: [PATCH 04/28] Update langchainjs.md (#2027) Updated sample code as per warning notification from the package maintainers --- docs/tutorials/langchainjs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md index 63b34aa6..4d60afb6 100644 --- a/docs/tutorials/langchainjs.md +++ b/docs/tutorials/langchainjs.md @@ -5,13 +5,13 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**: ```bash -npm install langchain +npm install @langchain/community ``` Now we can start building out our JavaScript: ```javascript -import { Ollama } from "langchain/llms/ollama"; +import { Ollama } from "@langchain/community/llms/ollama"; const ollama = new Ollama({ baseUrl: "http://localhost:11434", From d5eec16d2311b244e93f89027c3b716adfabee70 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 9 May 2024 09:06:13 -0700 Subject: [PATCH 05/28] use model defaults for `num_gqa`, `rope_frequency_base ` and `rope_frequency_scale` (#1983) --- api/types.go | 132 ++++++++++++++++++++++------------------------- docs/api.md | 3 -- server/routes.go | 12 ----- 3 files changed, 61 insertions(+), 86 deletions(-) diff --git a/api/types.go b/api/types.go index 5d0212e5..380c179f 100644 --- a/api/types.go +++ b/api/types.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" "math" "os" "reflect" @@ -161,7 +162,6 @@ type Runner struct { UseNUMA bool `json:"numa,omitempty"` NumCtx int `json:"num_ctx,omitempty"` NumBatch int `json:"num_batch,omitempty"` - NumGQA int `json:"num_gqa,omitempty"` NumGPU int `json:"num_gpu,omitempty"` MainGPU int `json:"main_gpu,omitempty"` LowVRAM bool `json:"low_vram,omitempty"` @@ -171,11 +171,6 @@ type Runner struct { UseMMap bool `json:"use_mmap,omitempty"` UseMLock bool `json:"use_mlock,omitempty"` NumThread int `json:"num_thread,omitempty"` - - // Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used - RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"` - // Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used - RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"` } // EmbeddingRequest is the request passed to [Client.Embeddings]. @@ -359,8 +354,6 @@ func (m *Metrics) Summary() { } } -// ErrInvalidOpts is returned when invalid options are passed to the client. 
-var ErrInvalidOpts = errors.New("invalid options") var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") func (opts *Options) FromMap(m map[string]interface{}) error { @@ -376,73 +369,71 @@ func (opts *Options) FromMap(m map[string]interface{}) error { } } - invalidOpts := []string{} for key, val := range m { - if opt, ok := jsonOpts[key]; ok { - field := valueOpts.FieldByName(opt.Name) - if field.IsValid() && field.CanSet() { - if val == nil { - continue - } + opt, ok := jsonOpts[key] + if !ok { + slog.Warn("invalid option provided", "option", opt.Name) + continue + } - switch field.Kind() { - case reflect.Int: - switch t := val.(type) { - case int64: - field.SetInt(t) - case float64: - // when JSON unmarshals numbers, it uses float64, not int - field.SetInt(int64(t)) - default: - return fmt.Errorf("option %q must be of type integer", key) - } - case reflect.Bool: - val, ok := val.(bool) - if !ok { - return fmt.Errorf("option %q must be of type boolean", key) - } - field.SetBool(val) - case reflect.Float32: - // JSON unmarshals to float64 - val, ok := val.(float64) - if !ok { - return fmt.Errorf("option %q must be of type float32", key) - } - field.SetFloat(val) - case reflect.String: - val, ok := val.(string) - if !ok { - return fmt.Errorf("option %q must be of type string", key) - } - field.SetString(val) - case reflect.Slice: - // JSON unmarshals to []interface{}, not []string - val, ok := val.([]interface{}) - if !ok { - return fmt.Errorf("option %q must be of type array", key) - } - // convert []interface{} to []string - slice := make([]string, len(val)) - for i, item := range val { - str, ok := item.(string) - if !ok { - return fmt.Errorf("option %q must be of an array of strings", key) - } - slice[i] = str - } - field.Set(reflect.ValueOf(slice)) - default: - return fmt.Errorf("unknown type loading config params: %v", field.Kind()) - } + field := valueOpts.FieldByName(opt.Name) + if field.IsValid() && field.CanSet() { + if val == nil { + continue + } + + switch field.Kind() { + case reflect.Int: + switch t := val.(type) { + case int64: + field.SetInt(t) + case float64: + // when JSON unmarshals numbers, it uses float64, not int + field.SetInt(int64(t)) + default: + return fmt.Errorf("option %q must be of type integer", key) + } + case reflect.Bool: + val, ok := val.(bool) + if !ok { + return fmt.Errorf("option %q must be of type boolean", key) + } + field.SetBool(val) + case reflect.Float32: + // JSON unmarshals to float64 + val, ok := val.(float64) + if !ok { + return fmt.Errorf("option %q must be of type float32", key) + } + field.SetFloat(val) + case reflect.String: + val, ok := val.(string) + if !ok { + return fmt.Errorf("option %q must be of type string", key) + } + field.SetString(val) + case reflect.Slice: + // JSON unmarshals to []interface{}, not []string + val, ok := val.([]interface{}) + if !ok { + return fmt.Errorf("option %q must be of type array", key) + } + // convert []interface{} to []string + slice := make([]string, len(val)) + for i, item := range val { + str, ok := item.(string) + if !ok { + return fmt.Errorf("option %q must be of an array of strings", key) + } + slice[i] = str + } + field.Set(reflect.ValueOf(slice)) + default: + return fmt.Errorf("unknown type loading config params: %v", field.Kind()) } - } else { - invalidOpts = append(invalidOpts, key) } } - if len(invalidOpts) > 0 { - return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", ")) - } return nil } @@ -475,8 +466,7 @@ func DefaultOptions() Options { NumCtx: 
2048, NumBatch: 512, NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically - NumGQA: 1, - NumThread: 0, // let the runtime decide + NumThread: 0, // let the runtime decide LowVRAM: false, F16KV: true, UseMLock: false, diff --git a/docs/api.md b/docs/api.md index 2f52c55a..94cd9c90 100644 --- a/docs/api.md +++ b/docs/api.md @@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{ "numa": false, "num_ctx": 1024, "num_batch": 2, - "num_gqa": 1, "num_gpu": 1, "main_gpu": 0, "low_vram": false, @@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{ "vocab_only": false, "use_mmap": true, "use_mlock": false, - "rope_frequency_base": 1.1, - "rope_frequency_scale": 0.8, "num_thread": 8 } }' diff --git a/server/routes.go b/server/routes.go index 7dfeb513..4121483e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -127,10 +127,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { opts, err := modelOptions(model, req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -370,10 +366,6 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { opts, err := modelOptions(model, req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -1177,10 +1169,6 @@ func (s *Server) ChatHandler(c *gin.Context) { opts, err := modelOptions(model, req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } From 8cc0ee2efe39b5096ab5a86418d3c067b3474db6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 9 May 2024 08:49:40 -0700 Subject: [PATCH 06/28] Doc container usage and workaround for nvidia errors --- docs/README.md | 2 +- docs/docker.md | 71 +++++++++++++++++++++++++++++++++++++++++ docs/troubleshooting.md | 21 +++++++++++- 3 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 docs/docker.md diff --git a/docs/README.md b/docs/README.md index a3edb18c..b6221041 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ * [Importing models](./import.md) * [Linux Documentation](./linux.md) * [Windows Documentation](./windows.md) -* [Docker Documentation](https://hub.docker.com/r/ollama/ollama) +* [Docker Documentation](./docker.md) ### Reference diff --git a/docs/docker.md b/docs/docker.md new file mode 100644 index 00000000..0b58562b --- /dev/null +++ b/docs/docker.md @@ -0,0 +1,71 @@ +# Ollama Docker image + +### CPU only + +```bash +docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +``` + +### Nvidia GPU +Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation). + +#### Install with Apt +1. 
Configure the repository +```bash +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update +``` +2. Install the NVIDIA Container Toolkit packages +```bash +sudo apt-get install -y nvidia-container-toolkit +``` + +#### Install with Yum or Dnf +1. Configure the repository + +```bash +curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo +``` + +2. Install the NVIDIA Container Toolkit packages + +```bash +sudo yum install -y nvidia-container-toolkit +``` + +#### Configure Docker to use Nvidia driver +``` +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +#### Start the container + +```bash +docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +``` + +### AMD GPU + +To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command: + +``` +docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm +``` + +### Run model locally + +Now you can run a model: + +``` +docker exec -it ollama ollama run llama3 +``` + +### Try different models + +More models can be found on the [Ollama library](https://ollama.com/library). diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index b9038e38..2586e4e4 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -82,4 +82,23 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example -OLLAMA_TMPDIR=/usr/share/ollama/ \ No newline at end of file +OLLAMA_TMPDIR=/usr/share/ollama/ + +## Container fails to run on NVIDIA GPU + +Make sure you've set up the conatiner runtime first as described in [docker.md](./docker.md) + +Sometimes the container runtime can have difficulties initializing the GPU. +When you check the server logs, this can show up as various error codes, such +as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" +(unknown), or others. The following troubleshooting techniques may help resolve +the problem + +- Is the uvm driver not loaded? 
`sudo nvidia-modprobe -u` +- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm` +- Try rebooting +- Make sure you're running the latest nvidia drivers + +If none of those resolve the problem, gather additional information and file an issue: +- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs +- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` From 5cde17a096144bb0d034688508e3af1051532c3c Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Thu, 9 May 2024 17:39:05 +0100 Subject: [PATCH 07/28] Add PromptingTools.jl (#2192) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4f980375..cc43805d 100644 --- a/README.md +++ b/README.md @@ -357,6 +357,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama) - [Testcontainers](https://testcontainers.com/modules/ollama/) - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama) +- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) ### Mobile From 788b092c49d6fa889c911ed1d4eb2d01d944e1c3 Mon Sep 17 00:00:00 2001 From: tusharhero <54012021+tusharhero@users.noreply.github.com> Date: Thu, 9 May 2024 23:40:24 +0530 Subject: [PATCH 08/28] docs: add Guix package manager in README. (#4040) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cc43805d..465d1a12 100644 --- a/README.md +++ b/README.md @@ -331,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/) - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama) +- [Guix channel](https://codeberg.org/tusharhero/ollama-guix) ### Libraries From 84ac7ce139252506d77115a3152f36a5a4f3541a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 9 May 2024 11:10:28 -0700 Subject: [PATCH 09/28] Refine subprocess reaping --- llm/server.go | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/llm/server.go b/llm/server.go index b452434e..4600d00f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -53,6 +53,7 @@ type llmServer struct { estimatedTotal uint64 // Total size of model totalLayers uint64 gpuCount int + loadDuration time.Duration // Record how long it took the model to load sem *semaphore.Weighted } @@ -291,6 +292,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, gpuCount: gpuCount, + done: make(chan error, 1), } s.cmd.Env = os.Environ() @@ -339,6 +341,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr continue } + // reap subprocess when it exits + go func() { + s.done <- s.cmd.Wait() + }() + return s, nil } @@ -486,6 +493,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load slog.Info("waiting for llama runner to start responding") + var lastStatus ServerStatus = -1 for { select { @@ -500,12 +508,6 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) default: } - 
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) - defer cancel() - status, err := s.getServerStatus(ctx) - if err != nil { - slog.Debug("server not yet available", "error", err) - } if time.Now().After(expiresAt) { // timeout msg := "" @@ -521,14 +523,20 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { } return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) } + ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond) + defer cancel() + status, _ := s.getServerStatus(ctx) + if lastStatus != status && status != ServerStatusReady { + // Only log on status changes + slog.Info("waiting for server to become available", "status", status.ToString()) + } switch status { - case ServerStatusLoadingModel: - time.Sleep(time.Millisecond * 250) - slog.Debug("loading model") case ServerStatusReady: - slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", time.Since(start).Seconds())) + s.loadDuration = time.Since(start) + slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds())) return nil default: + lastStatus = status time.Sleep(time.Millisecond * 250) continue } @@ -930,8 +938,11 @@ func (s *llmServer) Close() error { if err := s.cmd.Process.Kill(); err != nil { return err } - - _ = s.cmd.Wait() + // if ProcessState is already populated, Wait already completed, no need to wait again + if s.cmd.ProcessState == nil { + slog.Debug("waiting for llama server to exit") + <-s.done + } slog.Debug("llama server stopped") } From a7ee84fc31d597a55e591470aba489b643ce7210 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 9 May 2024 11:23:22 -0700 Subject: [PATCH 10/28] routes: skip invalid filepaths --- server/routes.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/routes.go b/server/routes.go index 4121483e..c4c0f6a9 100644 --- a/server/routes.go +++ b/server/routes.go @@ -732,6 +732,11 @@ func (s *Server) ListModelsHandler(c *gin.Context) { } n := model.ParseNameFromFilepath(rel) + if !n.IsValid() { + slog.Info("invalid model filepath", "path", rel) + return nil + } + m, err := ParseNamedManifest(n) if err != nil { return err From cfa84b8470837ecb418d1668858fe06c35c01d34 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 9 May 2024 13:30:14 -0700 Subject: [PATCH 11/28] add done_reason to the api (#4235) --- api/types.go | 10 +++++++--- llm/server.go | 16 ++++++++++++---- openai/openai.go | 24 ++++++------------------ server/routes.go | 34 +++++++++++++++++++--------------- 4 files changed, 44 insertions(+), 40 deletions(-) diff --git a/api/types.go b/api/types.go index 380c179f..860330db 100644 --- a/api/types.go +++ b/api/types.go @@ -114,9 +114,10 @@ type Message struct { // ChatResponse is the response returned by [Client.Chat]. Its fields are // similar to [GenerateResponse]. type ChatResponse struct { - Model string `json:"model"` - CreatedAt time.Time `json:"created_at"` - Message Message `json:"message"` + Model string `json:"model"` + CreatedAt time.Time `json:"created_at"` + Message Message `json:"message"` + DoneReason string `json:"done_reason"` Done bool `json:"done"` @@ -309,6 +310,9 @@ type GenerateResponse struct { // Done specifies if the response is complete. Done bool `json:"done"` + // DoneReason is the reason the model stopped generating text. 
+ DoneReason string `json:"done_reason"` + // Context is an encoding of the conversation used in this response; this // can be sent in the next request to keep a conversational memory. Context []int `json:"context,omitempty"` diff --git a/llm/server.go b/llm/server.go index 78106ea0..ec122453 100644 --- a/llm/server.go +++ b/llm/server.go @@ -576,10 +576,11 @@ type ImageData struct { } type completion struct { - Content string `json:"content"` - Model string `json:"model"` - Prompt string `json:"prompt"` - Stop bool `json:"stop"` + Content string `json:"content"` + Model string `json:"model"` + Prompt string `json:"prompt"` + Stop bool `json:"stop"` + StoppedLimit bool `json:"stopped_limit"` Timings struct { PredictedN int `json:"predicted_n"` @@ -598,6 +599,7 @@ type CompletionRequest struct { type CompletionResponse struct { Content string + DoneReason string Done bool PromptEvalCount int PromptEvalDuration time.Duration @@ -739,8 +741,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } if c.Stop { + doneReason := "stop" + if c.StoppedLimit { + doneReason = "length" + } + fn(CompletionResponse{ Done: true, + DoneReason: doneReason, PromptEvalCount: c.Timings.PromptN, PromptEvalDuration: parseDurationMs(c.Timings.PromptMS), EvalCount: c.Timings.PredictedN, diff --git a/openai/openai.go b/openai/openai.go index 96d8f218..4b335f36 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -107,15 +107,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { Model: r.Model, SystemFingerprint: "fp_ollama", Choices: []Choice{{ - Index: 0, - Message: Message{Role: r.Message.Role, Content: r.Message.Content}, - FinishReason: func(done bool) *string { - if done { - reason := "stop" - return &reason - } - return nil - }(r.Done), + Index: 0, + Message: Message{Role: r.Message.Role, Content: r.Message.Content}, + FinishReason: &r.DoneReason, }}, Usage: Usage{ // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count @@ -135,15 +129,9 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk { SystemFingerprint: "fp_ollama", Choices: []ChunkChoice{ { - Index: 0, - Delta: Message{Role: "assistant", Content: r.Message.Content}, - FinishReason: func(done bool) *string { - if done { - reason := "stop" - return &reason - } - return nil - }(r.Done), + Index: 0, + Delta: Message{Role: "assistant", Content: r.Message.Content}, + FinishReason: &r.DoneReason, }, }, } diff --git a/server/routes.go b/server/routes.go index c4c0f6a9..c73a1d3c 100644 --- a/server/routes.go +++ b/server/routes.go @@ -152,9 +152,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { // of `raw` mode so we need to check for it too if req.Prompt == "" && req.Template == "" && req.System == "" { c.JSON(http.StatusOK, api.GenerateResponse{ - CreatedAt: time.Now().UTC(), - Model: req.Model, - Done: true, + CreatedAt: time.Now().UTC(), + Model: req.Model, + Done: true, + DoneReason: "load", }) return } @@ -222,10 +223,11 @@ func (s *Server) GenerateHandler(c *gin.Context) { } resp := api.GenerateResponse{ - Model: req.Model, - CreatedAt: time.Now().UTC(), - Done: r.Done, - Response: r.Content, + Model: req.Model, + CreatedAt: time.Now().UTC(), + Done: r.Done, + Response: r.Content, + DoneReason: r.DoneReason, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, PromptEvalDuration: r.PromptEvalDuration, @@ -1215,10 +1217,11 @@ func (s *Server) ChatHandler(c *gin.Context) { // an empty request loads the model if 
len(req.Messages) == 0 || prompt == "" { resp := api.ChatResponse{ - CreatedAt: time.Now().UTC(), - Model: req.Model, - Done: true, - Message: api.Message{Role: "assistant"}, + CreatedAt: time.Now().UTC(), + Model: req.Model, + Done: true, + DoneReason: "load", + Message: api.Message{Role: "assistant"}, } c.JSON(http.StatusOK, resp) return @@ -1251,10 +1254,11 @@ func (s *Server) ChatHandler(c *gin.Context) { fn := func(r llm.CompletionResponse) { resp := api.ChatResponse{ - Model: req.Model, - CreatedAt: time.Now().UTC(), - Message: api.Message{Role: "assistant", Content: r.Content}, - Done: r.Done, + Model: req.Model, + CreatedAt: time.Now().UTC(), + Message: api.Message{Role: "assistant", Content: r.Content}, + Done: r.Done, + DoneReason: r.DoneReason, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, PromptEvalDuration: r.PromptEvalDuration, From 8727a9c140cabc2ffcf6599412f540ced594edb7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 7 May 2024 14:54:26 -0700 Subject: [PATCH 12/28] Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX. --- gpu/amd_hip_windows.go | 15 +++++--- gpu/amd_linux.go | 82 +++++++++++++++++++++++++++++++----------- gpu/amd_windows.go | 66 ++++++++++------------------------ gpu/gpu.go | 16 ++++++--- gpu/gpu_info.h | 3 ++ gpu/gpu_info_cpu.c | 4 --- gpu/gpu_info_nvcuda.c | 20 ++++++----- gpu/gpu_info_nvcuda.h | 3 ++ gpu/types.go | 34 +++++++++++++++--- server/routes.go | 3 +- 10 files changed, 150 insertions(+), 96 deletions(-) diff --git a/gpu/amd_hip_windows.go b/gpu/amd_hip_windows.go index 4e216132..8572a24c 100644 --- a/gpu/amd_hip_windows.go +++ b/gpu/amd_hip_windows.go @@ -3,7 +3,6 @@ package gpu import ( "fmt" "log/slog" - "strconv" "syscall" "unsafe" @@ -74,16 +73,22 @@ func (hl *HipLib) Release() { hl.dll = 0 } -func (hl *HipLib) AMDDriverVersion() (string, error) { +func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) { if hl.dll == 0 { - return "", fmt.Errorf("dll has been unloaded") + return 0, 0, fmt.Errorf("dll has been unloaded") } var version int status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version))) if status != hipSuccess { - return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) + return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) } - return strconv.Itoa(version), nil + + slog.Debug("hipDriverGetVersion", "version", version) + // TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway... 
+ driverMajor = version / 1000 + driverMinor = (version - (driverMajor * 1000)) / 10 + + return driverMajor, driverMinor, nil } func (hl *HipLib) HipGetDeviceCount() int { diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 9f9f8e74..6b08ac2e 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -8,6 +8,7 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "slices" "strconv" "strings" @@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo { } // Opportunistic logging of driver version to aid in troubleshooting - ver, err := AMDDriverVersion() - if err == nil { - slog.Info("AMD Driver: " + ver) - } else { + driverMajor, driverMinor, err := AMDDriverVersion() + if err != nil { // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err) } @@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo { scanner := bufio.NewScanner(fp) isCPU := false var major, minor, patch uint64 + var vendor, device uint64 for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) // Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs @@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo { slog.Debug("malformed int " + line) continue } + } else if strings.HasPrefix(line, "vendor_id") { + ver := strings.Fields(line) + if len(ver) != 2 { + slog.Debug("malformed vendor_id", "vendor_id", line) + continue + } + vendor, err = strconv.ParseUint(ver[1], 10, 32) + if err != nil { + slog.Debug("malformed vendor_id" + line) + } + } else if strings.HasPrefix(line, "device_id") { + ver := strings.Fields(line) + if len(ver) != 2 { + slog.Debug("malformed device_id", "device_id", line) + continue + } + device, err = strconv.ParseUint(ver[1], 10, 32) + if err != nil { + slog.Debug("malformed device_id" + line) + } } // TODO - any other properties we want to extract and record? 
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo { } if int(major) < RocmComputeMin { - slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID) + slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID) continue } @@ -210,24 +230,29 @@ func AMDGetGPUInfo() []GpuInfo { // iGPU detection, remove this check once we can support an iGPU variant of the rocm library if totalMemory < IGPUMemLimit { - slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) + slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory)) continue } + var name string + // TODO - PCI ID lookup + if vendor > 0 && device > 0 { + name = fmt.Sprintf("%04x:%04x", vendor, device) + } - slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) - slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) + slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) gpuInfo := GpuInfo{ Library: "rocm", memInfo: memInfo{ TotalMemory: totalMemory, FreeMemory: (totalMemory - usedMemory), }, - ID: fmt.Sprintf("%d", gpuID), - // Name: not exposed in sysfs directly, would require pci device id lookup - Major: int(major), - Minor: int(minor), - Patch: int(patch), + ID: fmt.Sprintf("%d", gpuID), + Name: name, + Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), MinimumMemory: rocmMinimumMemory, + DriverMajor: driverMajor, + DriverMinor: driverMinor, } // If the user wants to filter to a subset of devices, filter out if we aren't a match @@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo { } slog.Debug("rocm supported GPUs", "types", supported) } - gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch) + gfx := gpuInfo.Compute if !slices.Contains[[]string, string](supported, gfx) { slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported) // TODO - consider discrete markdown just for ROCM troubleshooting? 
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo { slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx) } } else { - slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } // The GPU has passed all the verification steps and is supported @@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) { return "", fmt.Errorf("no suitable rocm found, falling back to CPU") } -func AMDDriverVersion() (string, error) { - _, err := os.Stat(DriverVersionFile) +func AMDDriverVersion() (driverMajor, driverMinor int, err error) { + _, err = os.Stat(DriverVersionFile) if err != nil { - return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) + return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) } fp, err := os.Open(DriverVersionFile) if err != nil { - return "", err + return 0, 0, err } defer fp.Close() verString, err := io.ReadAll(fp) if err != nil { - return "", err + return 0, 0, err } - return strings.TrimSpace(string(verString)), nil + + pattern := `\A(\d+)\.(\d+).*` + regex := regexp.MustCompile(pattern) + match := regex.FindStringSubmatch(string(verString)) + if len(match) < 2 { + return 0, 0, fmt.Errorf("malformed version string %s", string(verString)) + } + driverMajor, err = strconv.Atoi(match[1]) + if err != nil { + return 0, 0, err + } + driverMinor, err = strconv.Atoi(match[2]) + if err != nil { + return 0, 0, err + } + return driverMajor, driverMinor, nil } diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 22c9f427..aae6c5b7 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -7,7 +7,6 @@ import ( "os" "path/filepath" "slices" - "strconv" "strings" "github.com/ollama/ollama/format" @@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo { } defer hl.Release() - ver, err := hl.AMDDriverVersion() - if err == nil { - slog.Info("AMD Driver: " + ver) - } else { - // For now this is benign, but we may eventually need to fail compatibility checks - slog.Debug("error looking up amd driver version", "error", err) - } + // TODO - this reports incorrect version information, so omitting for now + // driverMajor, driverMinor, err := hl.AMDDriverVersion() + // if err != nil { + // // For now this is benign, but we may eventually need to fail compatibility checks + // slog.Debug("error looking up amd driver version", "error", err) + // } // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified count := hl.HipGetDeviceCount() @@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo { return nil } } else { - slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } - slog.Info("detected hip devices", "count", count) + slog.Debug("detected hip devices", "count", count) // TODO how to determine the underlying device ID when visible devices is causing this to subset? for i := 0; i < count; i++ { err = hl.HipSetDevice(i) @@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo { // Can luid be used on windows for setting visible devices (and is it actually set?) 
n = bytes.IndexByte(props.GcnArchName[:], 0) gfx := string(props.GcnArchName[:n]) - slog.Info("hip device", "id", i, "name", name, "gfx", gfx) - var major, minor, patch string - switch len(gfx) { - case 6: - major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:] - case 7: - major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:] - } + slog.Debug("hip device", "id", i, "name", name, "gfx", gfx) //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 // TODO Why isn't props.iGPU accurate!? if strings.EqualFold(name, iGPUName) { - slog.Info("iGPU detected skipping", "id", i) + slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx) continue } if gfxOverride == "" { @@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo { slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") continue } else { - slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx) + slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx) } } @@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo { // TODO revisit this once ROCm v6 is available on windows. // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable - slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) - slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) + slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) gpuInfo := GpuInfo{ Library: "rocm", memInfo: memInfo{ @@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo { ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices DependencyPath: libDir, MinimumMemory: rocmMinimumMemory, - } - if major != "" { - gpuInfo.Major, err = strconv.Atoi(major) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } - } - if minor != "" { - gpuInfo.Minor, err = strconv.Atoi(minor) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } - } - if patch != "" { - // Patch rev is hex; e.g. 
gfx90a - p, err := strconv.ParseInt(patch, 16, 0) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } else { - gpuInfo.Patch = int(p) - } - } - if gpuInfo.Major < RocmComputeMin { - slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)) - continue + Name: name, + Compute: gfx, + + // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve + // DriverMajor: driverMajor, + // DriverMinor: driverMinor, } resp = append(resp, gpuInfo) diff --git a/gpu/gpu.go b/gpu/gpu.go index f8bae9b0..f546506b 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -119,12 +119,12 @@ func initGPUHandles() *handles { return gpuHandles } - slog.Info("Detecting GPUs") + slog.Debug("Detecting GPUs") nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns) if len(nvcudaLibPaths) > 0 { deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) if nvcuda != nil { - slog.Info("detected GPUs", "count", deviceCount, "library", libPath) + slog.Debug("detected GPUs", "count", deviceCount, "library", libPath) gpuHandles.nvcuda = nvcuda gpuHandles.deviceCount = deviceCount return gpuHandles @@ -135,7 +135,7 @@ func initGPUHandles() *handles { if len(cudartLibPaths) > 0 { deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) if cudart != nil { - slog.Info("detected GPUs", "library", libPath, "count", deviceCount) + slog.Debug("detected GPUs", "library", libPath, "count", deviceCount) gpuHandles.cudart = cudart gpuHandles.deviceCount = deviceCount return gpuHandles @@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList { gpuInfo := GpuInfo{ Library: "cuda", } + var driverMajor int + var driverMinor int if gpuHandles.cudart != nil { C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) } else { C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) + driverMajor = int(gpuHandles.nvcuda.driver_major) + driverMinor = int(gpuHandles.nvcuda.driver_minor) } if memInfo.err != nil { slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) @@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList { gpuInfo.TotalMemory = uint64(memInfo.total) gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Major = int(memInfo.major) - gpuInfo.Minor = int(memInfo.minor) + gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory gpuInfo.DependencyPath = depPath + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DriverMajor = int(driverMajor) + gpuInfo.DriverMinor = int(driverMinor) // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... 
resp = append(resp, gpuInfo) diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index 577bd3f0..2fa86f8d 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -39,16 +39,19 @@ extern "C" { #endif #define GPU_ID_LEN 64 +#define GPU_NAME_LEN 96 typedef struct mem_info { char *err; // If non-nill, caller responsible for freeing char gpu_id[GPU_ID_LEN]; + char gpu_name[GPU_NAME_LEN]; uint64_t total; uint64_t free; // Compute Capability int major; int minor; + int patch; } mem_info_t; void cpu_check_ram(mem_info_t *resp); diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c index 81ba3de4..6cbe28b0 100644 --- a/gpu/gpu_info_cpu.c +++ b/gpu/gpu_info_cpu.c @@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) { if (GlobalMemoryStatusEx(&info) != 0) { resp->total = info.ullTotalPhys; resp->free = info.ullAvailPhys; - resp->major = 0; - resp->minor = 0; snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); } else { resp->err = LOAD_ERR(); @@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) { } else { resp->total = info.totalram * info.mem_unit; resp->free = info.freeram * info.mem_unit; - resp->major = 0; - resp->minor = 0; snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); } return; diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index e192d2e6..26d855df 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet}, {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute}, {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid}, + {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName}, {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3}, {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2}, {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy}, @@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { } int version = 0; - nvcudaDriverVersion_t driverVersion; - driverVersion.major = 0; - driverVersion.minor = 0; + resp->ch.driver_major = 0; + resp->ch.driver_minor = 0; // Report driver version if we're in verbose mode, ignore errors ret = (*resp->ch.cuDriverGetVersion)(&version); if (ret != CUDA_SUCCESS) { LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret); } else { - driverVersion.major = version / 1000; - driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; - LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); + resp->ch.driver_major = version / 1000; + resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; + LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor); } ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices); @@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) { return; } - resp->major = 0; - resp->minor = 0; int major = 0; int minor = 0; ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); @@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) { ); } + ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device); + if (ret != CUDA_SUCCESS) { + LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret); + resp->gpu_name[0] = '\0'; + } + // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h index c4d94edd..2b232839 100644 --- 
a/gpu/gpu_info_nvcuda.h +++ b/gpu/gpu_info_nvcuda.h @@ -44,12 +44,15 @@ typedef void* CUcontext; typedef struct nvcuda_handle { void *handle; uint16_t verbose; + int driver_major; + int driver_minor; CUresult (*cuInit)(unsigned int Flags); CUresult (*cuDriverGetVersion)(int *driverVersion); CUresult (*cuDeviceGetCount)(int *); CUresult (*cuDeviceGet)(CUdevice* device, int ordinal); CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2 + CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev); // Context specific aspects CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev); diff --git a/gpu/types.go b/gpu/types.go index 7a5d5ba7..af33b896 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -1,5 +1,12 @@ package gpu +import ( + "fmt" + "log/slog" + + "github.com/ollama/ollama/format" +) + type memInfo struct { TotalMemory uint64 `json:"total_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"` @@ -20,11 +27,13 @@ type GpuInfo struct { DependencyPath string `json:"lib_path,omitempty"` // GPU information - ID string `json:"gpu_id"` // string to use for selection of this specific GPU - Name string `json:"name"` // user friendly name if available - Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx) - Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx) - Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD + ID string `json:"gpu_id"` // string to use for selection of this specific GPU + Name string `json:"name"` // user friendly name if available + Compute string `json:"compute"` // Compute Capability or gfx + + // Driver Information - TODO no need to put this on each GPU + DriverMajor int `json:"driver_major,omitempty"` + DriverMinor int `json:"driver_minor,omitempty"` // TODO other performance capability info to help in scheduling decisions } @@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { return resp } +// Report the GPU information into the log an Info level +func (l GpuInfoList) LogDetails() { + for _, g := range l { + slog.Info("inference compute", + "id", g.ID, + "library", g.Library, + "compute", g.Compute, + "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), + "name", g.Name, + "total", format.HumanBytes2(g.TotalMemory), + "available", format.HumanBytes2(g.FreeMemory), + ) + } +} + // Sort by Free Space type ByFreeMemory []GpuInfo diff --git a/server/routes.go b/server/routes.go index c73a1d3c..90cdfcd5 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1065,7 +1065,8 @@ func Serve(ln net.Listener) error { // At startup we retrieve GPU information so we can get log messages before loading a model // This will log warnings to the log in case we have problems with detected GPUs - _ = gpu.GetGPUInfo() + gpus := gpu.GetGPUInfo() + gpus.LogDetails() return srvr.Serve(ln) } From 58876091f750c0f7b2620f161747d1e2a1a2025b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 9 May 2024 13:52:56 -0700 Subject: [PATCH 13/28] log clean up --- llm/ext_server/server.cpp | 44 ++++++++++++++++++--------------------- llm/ext_server/utils.hpp | 13 +++++++----- llm/server.go | 7 ++----- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 41455c65..0c339989 100644 --- a/llm/ext_server/server.cpp +++ 
b/llm/ext_server/server.cpp @@ -66,7 +66,7 @@ struct server_params { }; bool server_verbose = false; -bool server_log_json = true; +bool server_log_json = false; enum stop_type { STOP_FULL, @@ -266,7 +266,7 @@ struct server_slot { sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_prompt_processing", t_prompt_processing}, @@ -280,7 +280,7 @@ struct server_slot { sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", t_token_generation, n_decoded, t_token, n_tokens_second); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_token_generation", t_token_generation}, @@ -290,7 +290,7 @@ struct server_slot { }); sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_prompt_processing", t_prompt_processing}, @@ -371,7 +371,7 @@ struct llama_server_context { if (clp_ctx) { - LOG_INFO("freeing clip model", {}); + LOG_DEBUG("freeing clip model", {}); clip_free(clp_ctx); clp_ctx = nullptr; } @@ -392,7 +392,7 @@ struct llama_server_context params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_INFO("Multi Modal Mode Enabled", {}); + LOG_DEBUG("Multi Modal Mode Enabled", {}); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); if(clp_ctx == nullptr) { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); @@ -445,7 +445,7 @@ struct llama_server_context const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -454,7 +454,7 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { + LOG_DEBUG("new slot", { {"slot_id", slot.id}, {"n_ctx_slot", slot.n_ctx} }); @@ -468,7 +468,7 @@ struct llama_server_context //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { + LOG_DEBUG("slot self-extend", { {"slot_id", slot.id}, {"ga_n", ga_n}, {"ga_w", ga_w} @@ -827,7 +827,7 @@ struct llama_server_context all_slots_are_idle = false; - LOG_INFO("slot is processing task", { + LOG_DEBUG("slot is processing task", { {"slot_id", slot->id}, {"task_id", slot->task_id}, }); @@ -1504,7 +1504,7 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_INFO("slot data", { + LOG_DEBUG("slot data", { {"task_id", task.id}, {"n_idle_slots", n_idle_slots}, {"n_processing_slots", n_processing_slots} @@ -1566,7 +1566,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_INFO("updating system prompt", {}); + LOG_DEBUG("updating system prompt", {}); system_prompt_update(); } @@ -1576,7 +1576,7 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); + LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; @@ -1599,7 +1599,7 @@ struct 
llama_server_context const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_INFO("slot context shift", { + LOG_DEBUG("slot context shift", { {"slot_id", slot.id}, {"task_id", slot.task_id}, {"n_keep", n_keep}, @@ -1638,7 +1638,7 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_INFO("slot released", { + LOG_DEBUG("slot released", { {"slot_id", slot.id}, {"task_id", slot.task_id}, {"n_ctx", n_ctx}, @@ -1807,7 +1807,7 @@ struct llama_server_context slot.ga_i = ga_i; } - LOG_INFO("slot progression", { + LOG_DEBUG("slot progression", { { "slot_id", slot.id }, { "task_id", slot.task_id }, { "n_past", slot.n_past }, @@ -1822,7 +1822,7 @@ struct llama_server_context if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_INFO("we have to evaluate at least 1 token to generate logits", { + LOG_DEBUG("we have to evaluate at least 1 token to generate logits", { { "slot_id", slot.id }, { "task_id", slot.task_id } }); @@ -1834,7 +1834,7 @@ struct llama_server_context } int p0 = (int) system_tokens.size() + slot.n_past; - LOG_INFO("kv cache rm [p0, end)", { + LOG_DEBUG("kv cache rm [p0, end)", { { "slot_id", slot.id }, { "task_id", slot.task_id }, { "p0", p0 } @@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "-v" || arg == "--verbose") { -#if SERVER_VERBOSE != 1 - LOG_WARNING("server.cpp is not built with verbose logging.", {}); -#else server_verbose = true; -#endif } else if (arg == "--mlock") { @@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, else if (arg == "--log-disable") { log_set_target(stdout); - LOG_INFO("logging to file is disabled.", {}); + LOG_DEBUG("logging to file is disabled.", {}); } else if (arg == "--slots-endpoint-disable") { @@ -2732,7 +2728,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo return; } - LOG_INFO("request", { + LOG_DEBUG("request", { {"remote_addr", req.remote_addr}, {"remote_port", req.remote_port}, {"status", res.status}, diff --git a/llm/ext_server/utils.hpp b/llm/ext_server/utils.hpp index bd340656..d63ead04 100644 --- a/llm/ext_server/utils.hpp +++ b/llm/ext_server/utils.hpp @@ -55,9 +55,10 @@ extern bool server_log_json; } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_DEBUG( MSG, ...) 
server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__) enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet @@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line, {"timestamp", time(nullptr)}, }; + if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) { + return; + } + if (server_log_json) { log.merge_patch( { @@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line, std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - if (!extra.empty()) { log.merge_patch(extra); } + std::stringstream ss; - ss << buf << " |"; + ss << level << " [" << function << "] " << message << " |"; for (const auto& el : log.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); diff --git a/llm/server.go b/llm/server.go index 9fe4cc17..27f471a0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -157,11 +157,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr "--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--embedding", } - if envconfig.Debug { - params = append(params, "--log-format", "json") - } else { - params = append(params, "--log-disable") - } + + params = append(params, "--log-disable") if opts.NumGPU >= 0 { params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) From 354ad9254ee64ada621f66192cf4686af8a2a6bb Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 9 May 2024 11:44:45 -0700 Subject: [PATCH 14/28] Wait for GPU free memory reporting to converge The GPU drivers take a while to update their free memory reporting, so we need to wait until the values converge with what we're expecting before proceeding to start another runner in order to get an accurate picture. 
--- gpu/cpu_common.go | 6 ++--- server/sched.go | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 3b299e42..920d0f5b 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -8,14 +8,14 @@ import ( func GetCPUVariant() string { if cpu.X86.HasAVX2 { - slog.Info("CPU has AVX2") + slog.Debug("CPU has AVX2") return "avx2" } if cpu.X86.HasAVX { - slog.Info("CPU has AVX") + slog.Debug("CPU has AVX") return "avx" } - slog.Info("CPU does not have vector extensions") + slog.Debug("CPU does not have vector extensions") // else LCD return "" } diff --git a/server/sched.go b/server/sched.go index c4a071c1..96235ea5 100644 --- a/server/sched.go +++ b/server/sched.go @@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) { s.loadedMu.Lock() slog.Debug("got lock to unload", "model", runner.model) + finished := runner.waitForVRAMRecovery() runner.unload() delete(s.loaded, runner.model) s.loadedMu.Unlock() slog.Debug("runner released", "model", runner.model) runner.refMu.Unlock() + + <-finished slog.Debug("sending an unloaded event", "model", runner.model) s.unloadedCh <- struct{}{} } @@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool return false } +// Free memory reporting on GPUs can lag for a while even after the runner +// exits, so we have to keep checking until we see the available memory recover, +// otherwise subsequent model loads will get far less layers loaded or worse +// case, may completely fall back to CPU mode. +// This routine must be called before the runner unloads so it can establish +// a before and after GPU memory allocation. The returned channel +// will be notified when we're done waiting, or have timed out and should +// proceed anyway +func (runner *runnerRef) waitForVRAMRecovery() chan interface{} { + finished := make(chan interface{}, 1) + + // CPU or Metal don't need checking, so no waiting required + if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") { + finished <- struct{}{} + return finished + } + start := time.Now() + + // Establish a baseline before we unload + gpusBefore := gpu.GetGPUInfo() + var totalMemoryBefore, freeMemoryBefore uint64 + for _, gpu := range gpusBefore { + totalMemoryBefore += gpu.TotalMemory + freeMemoryBefore += gpu.FreeMemory + } + go func() { + expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s + ticker := time.NewTicker(250 * time.Millisecond) + defer ticker.Stop() + for { + <-ticker.C + if time.Now().After(expiresAt) { + slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds()) + finished <- struct{}{} + } + + // Query GPUs, look for free to go back up + gpusNow := gpu.GetGPUInfo() + var totalMemoryNow, freeMemoryNow uint64 + for _, gpu := range gpusNow { + totalMemoryNow += gpu.TotalMemory + freeMemoryNow += gpu.FreeMemory + } + // If we're within ~80% of the estimated memory usage recovered, bail out + if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 { + slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds())) + finished <- struct{}{} + return + } + } + }() + return finished + +} + type ByDuration []*runnerRef func (a ByDuration) Len() int { return len(a) } From ce3b212d124ad24434a0336347f47491c13ad960 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 9 May 2024 
15:11:43 -0700 Subject: [PATCH 15/28] only forward some env vars --- llm/server.go | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/llm/server.go b/llm/server.go index 27f471a0..434a3275 100644 --- a/llm/server.go +++ b/llm/server.go @@ -292,32 +292,26 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr done: make(chan error, 1), } - s.cmd.Env = os.Environ() s.cmd.Stdout = os.Stdout s.cmd.Stderr = s.status - visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv() - pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) + if v := strings.Join(libraryPaths, string(filepath.ListSeparator)); v != "" { + s.cmd.Env = append(s.cmd.Env, pathEnv+"="+v) + } - // Update or add the path and visible devices variable with our adjusted version - pathNeeded := true - devicesNeeded := visibleDevicesEnv != "" - for i := range s.cmd.Env { - cmp := strings.SplitN(s.cmd.Env[i], "=", 2) - if strings.EqualFold(cmp[0], pathEnv) { - s.cmd.Env[i] = pathEnv + "=" + pathEnvVal - pathNeeded = false - } else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) { - s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal - devicesNeeded = false + if k, v := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv(); k != "" { + s.cmd.Env = append(s.cmd.Env, k+"="+v) + } + + for _, ev := range os.Environ() { + if strings.HasPrefix(ev, "CUDA_") || + strings.HasPrefix(ev, "ROCM_") || + strings.HasPrefix(ev, "HIP_") || + strings.HasPrefix(ev, "HSA_") || + strings.HasPrefix(ev, "GGML_") { + s.cmd.Env = append(s.cmd.Env, ev) } } - if pathNeeded { - s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal) - } - if devicesNeeded { - s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal) - } slog.Info("starting llama server", "cmd", s.cmd.String()) // Log at debug as the environment is inherited and might contain sensitive information From 2abb3f642437da5c902b7f4654d6cf87133eb6b4 Mon Sep 17 00:00:00 2001 From: Zander Lewis Date: Thu, 9 May 2024 18:30:49 -0400 Subject: [PATCH 16/28] Update README.md (#4300) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 465d1a12..4a6935f7 100644 --- a/README.md +++ b/README.md @@ -359,7 +359,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Testcontainers](https://testcontainers.com/modules/ollama/) - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama) - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) - +- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript) ### Mobile - [Enchanted](https://github.com/AugustDev/enchanted) From 3ae2f441e05af8e42f3030920b087b7c2e2afc7b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 9 May 2024 15:47:02 -0700 Subject: [PATCH 17/28] Fix race in shutdown logic Ensure the runners are terminated --- server/routes.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/server/routes.go b/server/routes.go index 90cdfcd5..130423b7 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1036,7 +1036,8 @@ func Serve(ln net.Listener) error { } ctx, done := context.WithCancel(context.Background()) - sched := InitScheduler(ctx) + schedCtx, schedDone := context.WithCancel(ctx) + sched := InitScheduler(schedCtx) s := &Server{addr: ln.Addr(), sched: sched} r := s.GenerateRoutes() @@ -1051,24 +1052,31 @@ func Serve(ln net.Listener) error { go func() { <-signals srvr.Close() - done() + schedDone() sched.unloadAllRunners() gpu.Cleanup() - os.Exit(0) + done() }() if err := llm.Init(); err != nil { return fmt.Errorf("unable to initialize llm library %w", err) } - s.sched.Run(ctx) + s.sched.Run(schedCtx) // At startup we retrieve GPU information so we can get log messages before loading a model // This will log warnings to the log in case we have problems with detected GPUs gpus := gpu.GetGPUInfo() gpus.LogDetails() - return srvr.Serve(ln) + err = srvr.Serve(ln) + // If server is closed from the signal handler, wait for the ctx to be done + // otherwise error out quickly + if !errors.Is(err, http.ErrServerClosed) { + return err + } + <-ctx.Done() + return err } func waitForStream(c *gin.Context, ch chan interface{}) { From cf442cd57e9e8cee1e56e4635520ced4926c0c33 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 9 May 2024 16:23:37 -0700 Subject: [PATCH 18/28] fix typo --- llm/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 434a3275..8d0744a9 100644 --- a/llm/server.go +++ b/llm/server.go @@ -218,7 +218,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if dir == "" { // Shouldn't happen finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers) - slog.Error("sever list inconsistent", "error", finalErr) + slog.Error("server list inconsistent", "error", finalErr) continue } From 302d7fdbf3dc59ed8ef6a2e25c0b052435009cb0 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 9 May 2024 16:35:20 -0700 Subject: [PATCH 19/28] prune partial downloads (#4272) --- server/images.go | 35 +++++++++++++++++++++-------------- server/modelpath.go | 3 --- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/server/images.go b/server/images.go index 2be1d366..3f415b6d 100644 --- a/server/images.go +++ b/server/images.go @@ -565,7 +565,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m } if !envconfig.NoPrune { - if err := deleteUnusedLayers(nil, unref, false); err != nil { + if err := deleteUnusedLayers(nil, unref); err != nil { return err } } @@ -613,7 +613,7 @@ func CopyModel(src, dst model.Name) error { return err } -func 
deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error { +func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error { fp, err := GetManifestPath() if err != nil { return err @@ -660,13 +660,9 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err)) continue } - if !dryRun { - if err := os.Remove(fp); err != nil { - slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err)) - continue - } - } else { - slog.Info(fmt.Sprintf("wanted to remove: %s", fp)) + if err := os.Remove(fp); err != nil { + slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err)) + continue } } @@ -689,14 +685,25 @@ func PruneLayers() error { for _, blob := range blobs { name := blob.Name() name = strings.ReplaceAll(name, "-", ":") - if strings.HasPrefix(name, "sha256:") { - deleteMap[name] = struct{}{} + + _, err := GetBlobsPath(name) + if err != nil { + if errors.Is(err, ErrInvalidDigestFormat) { + // remove invalid blobs (e.g. partial downloads) + if err := os.Remove(filepath.Join(p, blob.Name())); err != nil { + slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err) + } + } + + continue } + + deleteMap[name] = struct{}{} } slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - err = deleteUnusedLayers(nil, deleteMap, false) + err = deleteUnusedLayers(nil, deleteMap) if err != nil { return err } @@ -752,7 +759,7 @@ func DeleteModel(name string) error { } deleteMap[manifest.Config.Digest] = struct{}{} - err = deleteUnusedLayers(&mp, deleteMap, false) + err = deleteUnusedLayers(&mp, deleteMap) if err != nil { return err } @@ -912,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu if noprune == "" { fn(api.ProgressResponse{Status: "removing any unused layers"}) - err = deleteUnusedLayers(nil, deleteMap, false) + err = deleteUnusedLayers(nil, deleteMap) if err != nil { return err } diff --git a/server/modelpath.go b/server/modelpath.go index 86908226..25a817ca 100644 --- a/server/modelpath.go +++ b/server/modelpath.go @@ -154,9 +154,6 @@ func GetBlobsPath(digest string) (string, error) { // only accept actual sha256 digests pattern := "^sha256[:-][0-9a-fA-F]{64}$" re := regexp.MustCompile(pattern) - if err != nil { - return "", err - } if digest != "" && !re.MatchString(digest) { return "", ErrInvalidDigestFormat From c02db93243353855b983db2a1562a02b57e66db1 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 9 May 2024 16:45:29 -0700 Subject: [PATCH 20/28] omit empty done reason --- api/types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/types.go b/api/types.go index 860330db..c210d419 100644 --- a/api/types.go +++ b/api/types.go @@ -117,7 +117,7 @@ type ChatResponse struct { Model string `json:"model"` CreatedAt time.Time `json:"created_at"` Message Message `json:"message"` - DoneReason string `json:"done_reason"` + DoneReason string `json:"done_reason,omitempty"` Done bool `json:"done"` @@ -311,7 +311,7 @@ type GenerateResponse struct { Done bool `json:"done"` // DoneReason is the reason the model stopped generating text. - DoneReason string `json:"done_reason"` + DoneReason string `json:"done_reason,omitempty"` // Context is an encoding of the conversation used in this response; this // can be sent in the next request to keep a conversational memory. 
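The practical effect of the omitempty tags added above is that a zero-value DoneReason is dropped from the serialized JSON rather than appearing as an empty string, which matters mostly for streamed chunks where Done is still false. A minimal illustration, using a cut-down stand-in struct rather than the real api.ChatResponse:

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // resp is a cut-down stand-in for the response types touched by the patch.
    type resp struct {
        Model      string `json:"model"`
        DoneReason string `json:"done_reason,omitempty"`
        Done       bool   `json:"done"`
    }

    func main() {
        mid, _ := json.Marshal(resp{Model: "llama3", Done: false})
        fin, _ := json.Marshal(resp{Model: "llama3", DoneReason: "stop", Done: true})
        fmt.Println(string(mid)) // {"model":"llama3","done":false}
        fmt.Println(string(fin)) // {"model":"llama3","done_reason":"stop","done":true}
    }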
From e03637176d90cc5e298e13dfd5e583b2989b3aee Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 9 May 2024 17:44:34 -0700 Subject: [PATCH 21/28] fix(routes): skip bad manifests --- server/routes.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/server/routes.go b/server/routes.go index 130423b7..ec9f0e76 100644 --- a/server/routes.go +++ b/server/routes.go @@ -735,24 +735,27 @@ func (s *Server) ListModelsHandler(c *gin.Context) { n := model.ParseNameFromFilepath(rel) if !n.IsValid() { - slog.Info("invalid model filepath", "path", rel) + slog.Warn("bad manifest filepath", "path", rel) return nil } m, err := ParseNamedManifest(n) if err != nil { - return err + slog.Warn("bad manifest", "name", n, "error", err) + return nil } f, err := m.Config.Open() if err != nil { - return err + slog.Warn("bad manifest config filepath", "name", n, "error", err) + return nil } defer f.Close() var c ConfigV2 if err := json.NewDecoder(f).Decode(&c); err != nil { - return err + slog.Warn("bad manifest config", "name", n, "error", err) + return nil } // tag should never be masked From 30a7d7096c1ca4e90e859efce23042a27487fa8c Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 10 May 2024 09:15:28 -0700 Subject: [PATCH 22/28] Bump VRAM buffer back up Under stress scenarios we're seeing OOMs so this should help stabilize the allocations under heavy concurrency stress. --- gpu/gpu.go | 4 ++-- gpu/gpu_darwin.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index f546506b..781e23df 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -31,8 +31,8 @@ type handles struct { } const ( - cudaMinimumMemory = 256 * format.MebiByte - rocmMinimumMemory = 256 * format.MebiByte + cudaMinimumMemory = 457 * format.MebiByte + rocmMinimumMemory = 457 * format.MebiByte ) var gpuMutex sync.Mutex diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 0ba02e1b..f8cc1adb 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -15,7 +15,7 @@ import ( ) const ( - metalMinimumMemory = 384 * format.MebiByte + metalMinimumMemory = 512 * format.MebiByte ) func GetGPUInfo() GpuInfoList { From bb6fd02298bda99e3d77318d4f282bb2c30b3603 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 10 May 2024 10:17:12 -0700 Subject: [PATCH 23/28] Don't clamp ctx size in `PredictServerFit` (#4317) * dont clamp ctx size in `PredictServerFit` * minimum 4 context * remove context warning --- llm/memory.go | 11 +---------- llm/server.go | 10 +--------- server/sched.go | 4 ++++ 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index 6890b08c..df7081cf 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -12,17 +12,8 @@ import ( // This algorithm looks for a complete fit to determine if we need to unload other models func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) { - var estimatedVRAM uint64 - if opts.NumCtx > int(ggml.KV().ContextLength()) { - slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength()) - opts.NumCtx = int(ggml.KV().ContextLength()) - } - - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - // Split up the GPUs by type and try them + var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts) diff --git a/llm/server.go b/llm/server.go index 8d0744a9..81a2dec4 100644 --- 
a/llm/server.go +++ b/llm/server.go @@ -77,15 +77,7 @@ func LoadModel(model string) (*GGML, error) { // The gpu list must be a single family. func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) { var err error - if opts.NumCtx > int(ggml.KV().ContextLength()) { - slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength()) - } - - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - - cpuRunner := "" + var cpuRunner string var estimatedVRAM uint64 var estimatedTotal uint64 var systemMemory uint64 diff --git a/server/sched.go b/server/sched.go index 96235ea5..bbf333d7 100644 --- a/server/sched.go +++ b/server/sched.go @@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { // allocate a large enough kv cache for all parallel requests + if opts.NumCtx < 4 { + opts.NumCtx = 4 + } + opts.NumCtx = opts.NumCtx * envconfig.NumParallel req := &LlmRequest{ From 1eb382da5a52e882497552256e7494a90c095467 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 10 May 2024 12:13:28 -0700 Subject: [PATCH 24/28] add phi2 mem --- llm/ggml.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llm/ggml.go b/llm/ggml.go index 1c21bde0..40089be2 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -329,7 +329,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(1+4*embedding+context+context*heads), ) - partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128 + partialOffload = max( + 4*batch*(2*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2+3*embedding+context+context*heads), + ) case "stablelm": fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) partialOffload = max( From 6602e793c011805bec36d7d5b1f27537fe2f2353 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 10 May 2024 13:06:13 -0700 Subject: [PATCH 25/28] Use `--quantize` flag and `quantize` api parameter (#4321) * rename `--quantization` to `--quantize` * backwards * Update api/types.go Co-authored-by: Michael Yang --------- Co-authored-by: Michael Yang --- api/types.go | 13 ++++++++----- cmd/cmd.go | 6 +++--- server/routes.go | 7 ++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/api/types.go b/api/types.go index c210d419..fcab6fef 100644 --- a/api/types.go +++ b/api/types.go @@ -197,14 +197,17 @@ type EmbeddingResponse struct { // CreateRequest is the request passed to [Client.Create]. type CreateRequest struct { - Model string `json:"model"` - Path string `json:"path"` - Modelfile string `json:"modelfile"` - Stream *bool `json:"stream,omitempty"` - Quantization string `json:"quantization,omitempty"` + Model string `json:"model"` + Path string `json:"path"` + Modelfile string `json:"modelfile"` + Stream *bool `json:"stream,omitempty"` + Quantize string `json:"quantize,omitempty"` // Name is deprecated, see Model Name string `json:"name"` + + // Quantization is deprecated, see Quantize + Quantization string `json:"quantization,omitempty"` } // DeleteRequest is the request passed to [Client.Delete]. 
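With the rename above, clients should set Quantize and treat Quantization as a deprecated alias that the server still accepts. A hedged sketch of a programmatic create call through the Go api client; the model name and Modelfile contents are placeholders, and in practice the CLI resolves local FROM paths into blob digests before issuing this request:

    package main

    import (
        "context"
        "fmt"
        "log"

        "github.com/ollama/ollama/api"
    )

    func main() {
        client, err := api.ClientFromEnvironment()
        if err != nil {
            log.Fatal(err)
        }

        // Prefer the new Quantize field; Quantization remains only for
        // backwards compatibility with older clients.
        req := api.CreateRequest{
            Model:     "my-quantized-model",    // placeholder name
            Modelfile: "FROM ./model.f16.gguf", // placeholder Modelfile
            Quantize:  "q4_0",
        }

        err = client.Create(context.Background(), &req, func(p api.ProgressResponse) error {
            fmt.Println(p.Status)
            return nil
        })
        if err != nil {
            log.Fatal(err)
        }
    }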
diff --git a/cmd/cmd.go b/cmd/cmd.go index bf305d81..7814734a 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error { return nil } - quantization, _ := cmd.Flags().GetString("quantization") + quantize, _ := cmd.Flags().GetString("quantize") - request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization} + request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize} if err := client.Create(cmd.Context(), &request, fn); err != nil { return err } @@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command { } createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")") - createCmd.Flags().StringP("quantization", "q", "", "Quantization level.") + createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)") showCmd := &cobra.Command{ Use: "show MODEL", diff --git a/server/routes.go b/server/routes.go index ec9f0e76..600a30fa 100644 --- a/server/routes.go +++ b/server/routes.go @@ -554,7 +554,12 @@ func (s *Server) CreateModelHandler(c *gin.Context) { ctx, cancel := context.WithCancel(c.Request.Context()) defer cancel() - if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil { + quantization := req.Quantization + if req.Quantize != "" { + quantization = req.Quantize + } + + if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil { ch <- gin.H{"error": err.Error()} } }() From 4142c3ef7c4c543bf9735cdddb99d4570071c5bd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 10 May 2024 13:53:21 -0700 Subject: [PATCH 26/28] Always use the sorted list of GPUs Make sure the first GPU has the most free space --- server/sched.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/sched.go b/server/sched.go index bbf333d7..eff2b117 100644 --- a/server/sched.go +++ b/server/sched.go @@ -567,9 +567,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu. 
// - try subsets of GPUs instead of just falling back to 1 or all in a family // Now try all the GPUs - if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { - slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM)) - return gl + if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM)) + return sgl } } return nil From 074dc3b9d87f03de21a1c599caa5980d2a28f586 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 10 May 2024 14:13:26 -0700 Subject: [PATCH 27/28] Integration fixes --- integration/concurrency_test.go | 2 +- integration/utils_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 110301ab..f6bdb9d4 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -217,7 +217,7 @@ func TestMultiModelStress(t *testing.T) { defer wg.Done() for j := 0; j < 3; j++ { slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model) - DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second) + DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second) } }(i) } diff --git a/integration/utils_test.go b/integration/utils_test.go index e133e76d..c6f19e98 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -85,7 +85,7 @@ func GetTestEndpoint() (*api.Client, string) { var serverMutex sync.Mutex var serverReady bool -func startServer(ctx context.Context, ollamaHost string) error { +func startServer(t *testing.T, ctx context.Context, ollamaHost string) error { // Make sure the server has been built CLIName, err := filepath.Abs("../ollama") if err != nil { @@ -200,7 +200,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin } lifecycle.ServerLogFile = fp.Name() fp.Close() - require.NoError(t, startServer(ctx, testEndpoint)) + require.NoError(t, startServer(t, ctx, testEndpoint)) } return client, testEndpoint, func() { From c4014e73a25488b3f1488b96e82b578d8261993f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 10 May 2024 15:09:48 -0700 Subject: [PATCH 28/28] Fall back to CPU runner with zero layers --- llm/server.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llm/server.go b/llm/server.go index 81a2dec4..33c56f1f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -105,6 +105,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 + } else if gpus[0].Library != "metal" && layers == 0 { + // Don't bother loading into the GPU if no layers can fit + cpuRunner = serverForCpu() + gpuCount = 0 } else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" { opts.NumGPU = layers }
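The new branch only matters when the layer estimate comes back as zero on a CUDA or ROCm device: rather than starting a GPU runner that would offload nothing, the CPU runner is selected up front (metal is excluded, presumably because its memory is unified). A condensed, illustrative restatement of the decision in this hunk, not the actual NewLlamaServer code:

    // chooseOffload is an illustrative condensation of the branch added above;
    // the surrounding NewLlamaServer code has more cases (system memory checks,
    // explicit user overrides) that are omitted here.
    func chooseOffload(library string, layersThatFit, requestedLayers int) (useCPURunner bool, gpuLayers int) {
        if library != "metal" && layersThatFit == 0 {
            // No layers fit in VRAM: skip the GPU runner entirely.
            return true, 0
        }
        if requestedLayers < 0 && layersThatFit > 0 && library != "cpu" {
            // Auto mode: offload exactly as many layers as the estimate allows.
            return false, layersThatFit
        }
        // Otherwise keep whatever the caller asked for.
        return false, requestedLayers
    }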