diff --git a/README.md b/README.md index 4f980375..4a6935f7 100644 --- a/README.md +++ b/README.md @@ -331,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/) - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama) +- [Guix channel](https://codeberg.org/tusharhero/ollama-guix) ### Libraries @@ -357,7 +358,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama) - [Testcontainers](https://testcontainers.com/modules/ollama/) - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama) - +- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) +- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript) ### Mobile - [Enchanted](https://github.com/AugustDev/enchanted) diff --git a/api/types.go b/api/types.go index 5d0212e5..fcab6fef 100644 --- a/api/types.go +++ b/api/types.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" "math" "os" "reflect" @@ -113,9 +114,10 @@ type Message struct { // ChatResponse is the response returned by [Client.Chat]. Its fields are // similar to [GenerateResponse]. type ChatResponse struct { - Model string `json:"model"` - CreatedAt time.Time `json:"created_at"` - Message Message `json:"message"` + Model string `json:"model"` + CreatedAt time.Time `json:"created_at"` + Message Message `json:"message"` + DoneReason string `json:"done_reason,omitempty"` Done bool `json:"done"` @@ -161,7 +163,6 @@ type Runner struct { UseNUMA bool `json:"numa,omitempty"` NumCtx int `json:"num_ctx,omitempty"` NumBatch int `json:"num_batch,omitempty"` - NumGQA int `json:"num_gqa,omitempty"` NumGPU int `json:"num_gpu,omitempty"` MainGPU int `json:"main_gpu,omitempty"` LowVRAM bool `json:"low_vram,omitempty"` @@ -171,11 +172,6 @@ type Runner struct { UseMMap bool `json:"use_mmap,omitempty"` UseMLock bool `json:"use_mlock,omitempty"` NumThread int `json:"num_thread,omitempty"` - - // Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used - RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"` - // Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used - RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"` } // EmbeddingRequest is the request passed to [Client.Embeddings]. @@ -201,14 +197,17 @@ type EmbeddingResponse struct { // CreateRequest is the request passed to [Client.Create]. type CreateRequest struct { - Model string `json:"model"` - Path string `json:"path"` - Modelfile string `json:"modelfile"` - Stream *bool `json:"stream,omitempty"` - Quantization string `json:"quantization,omitempty"` + Model string `json:"model"` + Path string `json:"path"` + Modelfile string `json:"modelfile"` + Stream *bool `json:"stream,omitempty"` + Quantize string `json:"quantize,omitempty"` // Name is deprecated, see Model Name string `json:"name"` + + // Quantization is deprecated, see Quantize + Quantization string `json:"quantization,omitempty"` } // DeleteRequest is the request passed to [Client.Delete]. @@ -314,6 +313,9 @@ type GenerateResponse struct { // Done specifies if the response is complete. Done bool `json:"done"` + // DoneReason is the reason the model stopped generating text. 
+ DoneReason string `json:"done_reason,omitempty"` + // Context is an encoding of the conversation used in this response; this // can be sent in the next request to keep a conversational memory. Context []int `json:"context,omitempty"` @@ -359,8 +361,6 @@ func (m *Metrics) Summary() { } } -// ErrInvalidOpts is returned when invalid options are passed to the client. -var ErrInvalidOpts = errors.New("invalid options") var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") func (opts *Options) FromMap(m map[string]interface{}) error { @@ -376,73 +376,71 @@ func (opts *Options) FromMap(m map[string]interface{}) error { } } - invalidOpts := []string{} for key, val := range m { - if opt, ok := jsonOpts[key]; ok { - field := valueOpts.FieldByName(opt.Name) - if field.IsValid() && field.CanSet() { - if val == nil { - continue - } + opt, ok := jsonOpts[key] + if !ok { + slog.Warn("invalid option provided", "option", opt.Name) + continue + } - switch field.Kind() { - case reflect.Int: - switch t := val.(type) { - case int64: - field.SetInt(t) - case float64: - // when JSON unmarshals numbers, it uses float64, not int - field.SetInt(int64(t)) - default: - return fmt.Errorf("option %q must be of type integer", key) - } - case reflect.Bool: - val, ok := val.(bool) - if !ok { - return fmt.Errorf("option %q must be of type boolean", key) - } - field.SetBool(val) - case reflect.Float32: - // JSON unmarshals to float64 - val, ok := val.(float64) - if !ok { - return fmt.Errorf("option %q must be of type float32", key) - } - field.SetFloat(val) - case reflect.String: - val, ok := val.(string) - if !ok { - return fmt.Errorf("option %q must be of type string", key) - } - field.SetString(val) - case reflect.Slice: - // JSON unmarshals to []interface{}, not []string - val, ok := val.([]interface{}) - if !ok { - return fmt.Errorf("option %q must be of type array", key) - } - // convert []interface{} to []string - slice := make([]string, len(val)) - for i, item := range val { - str, ok := item.(string) - if !ok { - return fmt.Errorf("option %q must be of an array of strings", key) - } - slice[i] = str - } - field.Set(reflect.ValueOf(slice)) - default: - return fmt.Errorf("unknown type loading config params: %v", field.Kind()) - } + field := valueOpts.FieldByName(opt.Name) + if field.IsValid() && field.CanSet() { + if val == nil { + continue + } + + switch field.Kind() { + case reflect.Int: + switch t := val.(type) { + case int64: + field.SetInt(t) + case float64: + // when JSON unmarshals numbers, it uses float64, not int + field.SetInt(int64(t)) + default: + return fmt.Errorf("option %q must be of type integer", key) + } + case reflect.Bool: + val, ok := val.(bool) + if !ok { + return fmt.Errorf("option %q must be of type boolean", key) + } + field.SetBool(val) + case reflect.Float32: + // JSON unmarshals to float64 + val, ok := val.(float64) + if !ok { + return fmt.Errorf("option %q must be of type float32", key) + } + field.SetFloat(val) + case reflect.String: + val, ok := val.(string) + if !ok { + return fmt.Errorf("option %q must be of type string", key) + } + field.SetString(val) + case reflect.Slice: + // JSON unmarshals to []interface{}, not []string + val, ok := val.([]interface{}) + if !ok { + return fmt.Errorf("option %q must be of type array", key) + } + // convert []interface{} to []string + slice := make([]string, len(val)) + for i, item := range val { + str, ok := item.(string) + if !ok { + return fmt.Errorf("option %q must be of an array of strings", key) + } + slice[i] = str 
+ } + field.Set(reflect.ValueOf(slice)) + default: + return fmt.Errorf("unknown type loading config params: %v", field.Kind()) } - } else { - invalidOpts = append(invalidOpts, key) } } - if len(invalidOpts) > 0 { - return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", ")) - } return nil } @@ -475,8 +473,7 @@ func DefaultOptions() Options { NumCtx: 2048, NumBatch: 512, NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically - NumGQA: 1, - NumThread: 0, // let the runtime decide + NumThread: 0, // let the runtime decide LowVRAM: false, F16KV: true, UseMLock: false, diff --git a/cmd/cmd.go b/cmd/cmd.go index bf305d81..7814734a 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error { return nil } - quantization, _ := cmd.Flags().GetString("quantization") + quantize, _ := cmd.Flags().GetString("quantize") - request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization} + request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize} if err := client.Create(cmd.Context(), &request, fn); err != nil { return err } @@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command { } createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")") - createCmd.Flags().StringP("quantization", "q", "", "Quantization level.") + createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)") showCmd := &cobra.Command{ Use: "show MODEL", diff --git a/docs/README.md b/docs/README.md index a3edb18c..b6221041 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ * [Importing models](./import.md) * [Linux Documentation](./linux.md) * [Windows Documentation](./windows.md) -* [Docker Documentation](https://hub.docker.com/r/ollama/ollama) +* [Docker Documentation](./docker.md) ### Reference diff --git a/docs/api.md b/docs/api.md index 2f52c55a..94cd9c90 100644 --- a/docs/api.md +++ b/docs/api.md @@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{ "numa": false, "num_ctx": 1024, "num_batch": 2, - "num_gqa": 1, "num_gpu": 1, "main_gpu": 0, "low_vram": false, @@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{ "vocab_only": false, "use_mmap": true, "use_mlock": false, - "rope_frequency_base": 1.1, - "rope_frequency_scale": 0.8, "num_thread": 8 } }' diff --git a/docs/docker.md b/docs/docker.md new file mode 100644 index 00000000..0b58562b --- /dev/null +++ b/docs/docker.md @@ -0,0 +1,71 @@ +# Ollama Docker image + +### CPU only + +```bash +docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +``` + +### Nvidia GPU +Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation). + +#### Install with Apt +1. Configure the repository +```bash +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update +``` +2. Install the NVIDIA Container Toolkit packages +```bash +sudo apt-get install -y nvidia-container-toolkit +``` + +#### Install with Yum or Dnf +1. 
Configure the repository
+
+```bash
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+  | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+```
+
+2. Install the NVIDIA Container Toolkit packages
+
+```bash
+sudo yum install -y nvidia-container-toolkit
+```
+
+#### Configure Docker to use Nvidia driver
+```bash
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+#### Start the container
+
+```bash
+docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### AMD GPU
+
+To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
+
+```bash
+docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
+```
+
+### Run model locally
+
+Now you can run a model:
+
+```bash
+docker exec -it ollama ollama run llama3
+```
+
+### Try different models
+
+More models can be found on the [Ollama library](https://ollama.com/library).
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index b9038e38..2586e4e4 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -82,4 +82,23 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
-OLLAMA_TMPDIR=/usr/share/ollama/
\ No newline at end of file
+OLLAMA_TMPDIR=/usr/share/ollama/
+
+## Container fails to run on NVIDIA GPU
+
+Make sure you've set up the container runtime first as described in [docker.md](./docker.md).
+
+Sometimes the container runtime can have difficulties initializing the GPU.
+When you check the server logs, this can show up as various error codes, such
+as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
+(unknown), or others. The following troubleshooting techniques may help resolve
+the problem:
+
+- Make sure the uvm driver is loaded: `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors: `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md
index 63b34aa6..4d60afb6 100644
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -5,13 +5,13 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
 To get started, let's just use **LangChain** to ask a simple question to a model. 
To do this with JavaScript, we need to install **LangChain**: ```bash -npm install langchain +npm install @langchain/community ``` Now we can start building out our JavaScript: ```javascript -import { Ollama } from "langchain/llms/ollama"; +import { Ollama } from "@langchain/community/llms/ollama"; const ollama = new Ollama({ baseUrl: "http://localhost:11434", diff --git a/examples/bash-comparemodels/README.md b/examples/bash-comparemodels/README.md deleted file mode 100644 index 65e66f1e..00000000 --- a/examples/bash-comparemodels/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Bash Shell examples - -When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other: - -`ollama run llama3 < sourcequestions.txt` - -This concept is used in the following example. - -## Compare Models -`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally. diff --git a/examples/bash-comparemodels/comparemodels.sh b/examples/bash-comparemodels/comparemodels.sh deleted file mode 100755 index 1ce249a6..00000000 --- a/examples/bash-comparemodels/comparemodels.sh +++ /dev/null @@ -1,64 +0,0 @@ -#! /usr/bin/env bash -# Compare multiple models by running them with the same questions - -NUMBEROFCHOICES=4 -SELECTIONS=() -declare -a SUMS=() - -# Get the list of models -CHOICES=$(ollama list | awk '{print $1}') - -# Select which models to run as a comparison -echo "Select $NUMBEROFCHOICES models to compare:" -select ITEM in $CHOICES; do - if [[ -n $ITEM ]]; then - echo "You have selected $ITEM" - SELECTIONS+=("$ITEM") - ((COUNT++)) - if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then - break - fi - else - echo "Invalid selection" - fi -done - -# Loop through each of the selected models -for ITEM in "${SELECTIONS[@]}"; do - echo "--------------------------------------------------------------" - echo "Loading the model $ITEM into memory" - ollama run "$ITEM" "" - echo "--------------------------------------------------------------" - echo "Running the questions through the model $ITEM" - COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr) - - # eval duration is sometimes listed in seconds and sometimes in milliseconds. - # Add up the values for each model - SUM=$(echo "$COMMAND_OUTPUT" | awk ' - /eval duration:/ { - value = $3 - if (index(value, "ms") > 0) { - gsub("ms", "", value) - value /= 1000 - } else { - gsub("s", "", value) - } - sum += value - } - END { print sum }') - - - SUMS+=("All questions for $ITEM completed in $SUM seconds") -done - -echo "" -echo "--------------------------------------------------------------" -echo -e "Sums of eval durations for each run:" -for val in "${SUMS[@]}"; do - echo "$val" -done - -echo "--------------------------------------------------------------" -echo "Comparison complete. Now you can decide" -echo "which model is best." -echo "--------------------------------------------------------------" \ No newline at end of file diff --git a/examples/bash-comparemodels/sourcequestions.txt b/examples/bash-comparemodels/sourcequestions.txt deleted file mode 100644 index 90004c07..00000000 --- a/examples/bash-comparemodels/sourcequestions.txt +++ /dev/null @@ -1,7 +0,0 @@ -Why is the sky blue -What is a black hole -Explain the big bang theory like I am 5? -What is the quickest way to win a game of Monopoly with 3 others? -Why does a vacuum bottle keep my coffee hot and my milkshake cold? 
-What is the difference between a meteor, a meteorite, and a meteoroid? -Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust. \ No newline at end of file diff --git a/gpu/amd_hip_windows.go b/gpu/amd_hip_windows.go index 4e216132..8572a24c 100644 --- a/gpu/amd_hip_windows.go +++ b/gpu/amd_hip_windows.go @@ -3,7 +3,6 @@ package gpu import ( "fmt" "log/slog" - "strconv" "syscall" "unsafe" @@ -74,16 +73,22 @@ func (hl *HipLib) Release() { hl.dll = 0 } -func (hl *HipLib) AMDDriverVersion() (string, error) { +func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) { if hl.dll == 0 { - return "", fmt.Errorf("dll has been unloaded") + return 0, 0, fmt.Errorf("dll has been unloaded") } var version int status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version))) if status != hipSuccess { - return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) + return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) } - return strconv.Itoa(version), nil + + slog.Debug("hipDriverGetVersion", "version", version) + // TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway... + driverMajor = version / 1000 + driverMinor = (version - (driverMajor * 1000)) / 10 + + return driverMajor, driverMinor, nil } func (hl *HipLib) HipGetDeviceCount() int { diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 9f9f8e74..6b08ac2e 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -8,6 +8,7 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "slices" "strconv" "strings" @@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo { } // Opportunistic logging of driver version to aid in troubleshooting - ver, err := AMDDriverVersion() - if err == nil { - slog.Info("AMD Driver: " + ver) - } else { + driverMajor, driverMinor, err := AMDDriverVersion() + if err != nil { // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err) } @@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo { scanner := bufio.NewScanner(fp) isCPU := false var major, minor, patch uint64 + var vendor, device uint64 for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) // Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs @@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo { slog.Debug("malformed int " + line) continue } + } else if strings.HasPrefix(line, "vendor_id") { + ver := strings.Fields(line) + if len(ver) != 2 { + slog.Debug("malformed vendor_id", "vendor_id", line) + continue + } + vendor, err = strconv.ParseUint(ver[1], 10, 32) + if err != nil { + slog.Debug("malformed vendor_id" + line) + } + } else if strings.HasPrefix(line, "device_id") { + ver := strings.Fields(line) + if len(ver) != 2 { + slog.Debug("malformed device_id", "device_id", line) + continue + } + device, err = strconv.ParseUint(ver[1], 10, 32) + if err != nil { + slog.Debug("malformed device_id" + line) + } } // TODO - any other properties we want to extract and record? 
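For reference, a minimal standalone sketch of the `vendor_id`/`device_id` parsing the amd_linux.go hunk above introduces. The property-file layout matches what the diff reads from the KFD sysfs `properties` file; the sample values below are illustrative only, and the hex `vendor:device` rendering mirrors how the name is built later in the diff.

```go
package main

import (
	"bufio"
	"fmt"
	"strconv"
	"strings"
)

// parseKFDProperties mimics the handling added above: each line of the sysfs
// properties file is "key value", the IDs are decimal, and the pair is later
// rendered as a hex "vendor:device" style name.
func parseKFDProperties(contents string) (vendor, device uint64) {
	scanner := bufio.NewScanner(strings.NewReader(contents))
	for scanner.Scan() {
		fields := strings.Fields(strings.TrimSpace(scanner.Text()))
		if len(fields) != 2 {
			continue
		}
		switch fields[0] {
		case "vendor_id":
			vendor, _ = strconv.ParseUint(fields[1], 10, 32)
		case "device_id":
			device, _ = strconv.ParseUint(fields[1], 10, 32)
		}
	}
	return vendor, device
}

func main() {
	// Sample values are made up for illustration (4098 == 0x1002, AMD's PCI vendor ID).
	props := "vendor_id 4098\ndevice_id 29631\n"
	vendor, device := parseKFDProperties(props)
	fmt.Printf("%04x:%04x\n", vendor, device) // prints "1002:73bf"
}
```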
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo { } if int(major) < RocmComputeMin { - slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID) + slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID) continue } @@ -210,24 +230,29 @@ func AMDGetGPUInfo() []GpuInfo { // iGPU detection, remove this check once we can support an iGPU variant of the rocm library if totalMemory < IGPUMemLimit { - slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) + slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory)) continue } + var name string + // TODO - PCI ID lookup + if vendor > 0 && device > 0 { + name = fmt.Sprintf("%04x:%04x", vendor, device) + } - slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) - slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) + slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) gpuInfo := GpuInfo{ Library: "rocm", memInfo: memInfo{ TotalMemory: totalMemory, FreeMemory: (totalMemory - usedMemory), }, - ID: fmt.Sprintf("%d", gpuID), - // Name: not exposed in sysfs directly, would require pci device id lookup - Major: int(major), - Minor: int(minor), - Patch: int(patch), + ID: fmt.Sprintf("%d", gpuID), + Name: name, + Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), MinimumMemory: rocmMinimumMemory, + DriverMajor: driverMajor, + DriverMinor: driverMinor, } // If the user wants to filter to a subset of devices, filter out if we aren't a match @@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo { } slog.Debug("rocm supported GPUs", "types", supported) } - gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch) + gfx := gpuInfo.Compute if !slices.Contains[[]string, string](supported, gfx) { slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported) // TODO - consider discrete markdown just for ROCM troubleshooting? 
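A small sketch of how the new `Compute` string is formed and checked. The gfx formatting (decimal major, hex minor and stepping, i.e. `gfx%d%x%x`) follows the hunks above; the supported list here is a made-up example, since the real one is read from the bundled ROCm library directory.

```go
package main

import (
	"fmt"
	"slices"
)

// computeString mirrors the formatting used above: major is decimal, while
// minor and patch ("stepping") are rendered as hex, so 9/0/10 becomes "gfx90a".
func computeString(major, minor, patch uint64) string {
	return fmt.Sprintf("gfx%d%x%x", major, minor, patch)
}

func main() {
	// Hypothetical supported list for illustration only.
	supported := []string{"gfx900", "gfx1030", "gfx90a"}

	gfx := computeString(9, 0, 10)
	fmt.Println(gfx, "supported:", slices.Contains(supported, gfx)) // gfx90a supported: true

	gfx = computeString(8, 0, 3)
	fmt.Println(gfx, "supported:", slices.Contains(supported, gfx)) // gfx803 supported: false
}
```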
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo { slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx) } } else { - slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } // The GPU has passed all the verification steps and is supported @@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) { return "", fmt.Errorf("no suitable rocm found, falling back to CPU") } -func AMDDriverVersion() (string, error) { - _, err := os.Stat(DriverVersionFile) +func AMDDriverVersion() (driverMajor, driverMinor int, err error) { + _, err = os.Stat(DriverVersionFile) if err != nil { - return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) + return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) } fp, err := os.Open(DriverVersionFile) if err != nil { - return "", err + return 0, 0, err } defer fp.Close() verString, err := io.ReadAll(fp) if err != nil { - return "", err + return 0, 0, err } - return strings.TrimSpace(string(verString)), nil + + pattern := `\A(\d+)\.(\d+).*` + regex := regexp.MustCompile(pattern) + match := regex.FindStringSubmatch(string(verString)) + if len(match) < 2 { + return 0, 0, fmt.Errorf("malformed version string %s", string(verString)) + } + driverMajor, err = strconv.Atoi(match[1]) + if err != nil { + return 0, 0, err + } + driverMinor, err = strconv.Atoi(match[2]) + if err != nil { + return 0, 0, err + } + return driverMajor, driverMinor, nil } diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 22c9f427..aae6c5b7 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -7,7 +7,6 @@ import ( "os" "path/filepath" "slices" - "strconv" "strings" "github.com/ollama/ollama/format" @@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo { } defer hl.Release() - ver, err := hl.AMDDriverVersion() - if err == nil { - slog.Info("AMD Driver: " + ver) - } else { - // For now this is benign, but we may eventually need to fail compatibility checks - slog.Debug("error looking up amd driver version", "error", err) - } + // TODO - this reports incorrect version information, so omitting for now + // driverMajor, driverMinor, err := hl.AMDDriverVersion() + // if err != nil { + // // For now this is benign, but we may eventually need to fail compatibility checks + // slog.Debug("error looking up amd driver version", "error", err) + // } // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified count := hl.HipGetDeviceCount() @@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo { return nil } } else { - slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } - slog.Info("detected hip devices", "count", count) + slog.Debug("detected hip devices", "count", count) // TODO how to determine the underlying device ID when visible devices is causing this to subset? for i := 0; i < count; i++ { err = hl.HipSetDevice(i) @@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo { // Can luid be used on windows for setting visible devices (and is it actually set?) 
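A hedged sketch of the version-file parsing the Linux `AMDDriverVersion` rewrite above performs: only the leading major.minor is kept and the rest is ignored. The sample string is hypothetical; the real content comes from `DriverVersionFile`.

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// parseDriverVersion extracts the leading "major.minor" from a driver version
// string, as in the hunk above.
func parseDriverVersion(verString string) (major, minor int, err error) {
	re := regexp.MustCompile(`\A(\d+)\.(\d+).*`)
	match := re.FindStringSubmatch(verString)
	if len(match) < 3 {
		return 0, 0, fmt.Errorf("malformed version string %s", verString)
	}
	major, err = strconv.Atoi(match[1])
	if err != nil {
		return 0, 0, err
	}
	minor, err = strconv.Atoi(match[2])
	return major, minor, err
}

func main() {
	// "6.3.6" is just a sample value for illustration.
	major, minor, err := parseDriverVersion("6.3.6")
	fmt.Println(major, minor, err) // 6 3 <nil>
}
```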
n = bytes.IndexByte(props.GcnArchName[:], 0) gfx := string(props.GcnArchName[:n]) - slog.Info("hip device", "id", i, "name", name, "gfx", gfx) - var major, minor, patch string - switch len(gfx) { - case 6: - major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:] - case 7: - major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:] - } + slog.Debug("hip device", "id", i, "name", name, "gfx", gfx) //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 // TODO Why isn't props.iGPU accurate!? if strings.EqualFold(name, iGPUName) { - slog.Info("iGPU detected skipping", "id", i) + slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx) continue } if gfxOverride == "" { @@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo { slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") continue } else { - slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx) + slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx) } } @@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo { // TODO revisit this once ROCm v6 is available on windows. // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable - slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) - slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) + slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) gpuInfo := GpuInfo{ Library: "rocm", memInfo: memInfo{ @@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo { ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices DependencyPath: libDir, MinimumMemory: rocmMinimumMemory, - } - if major != "" { - gpuInfo.Major, err = strconv.Atoi(major) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } - } - if minor != "" { - gpuInfo.Minor, err = strconv.Atoi(minor) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } - } - if patch != "" { - // Patch rev is hex; e.g. 
gfx90a - p, err := strconv.ParseInt(patch, 16, 0) - if err != nil { - slog.Info("failed to parse version", "version", gfx, "error", err) - } else { - gpuInfo.Patch = int(p) - } - } - if gpuInfo.Major < RocmComputeMin { - slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)) - continue + Name: name, + Compute: gfx, + + // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve + // DriverMajor: driverMajor, + // DriverMinor: driverMinor, } resp = append(resp, gpuInfo) diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 3b299e42..920d0f5b 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -8,14 +8,14 @@ import ( func GetCPUVariant() string { if cpu.X86.HasAVX2 { - slog.Info("CPU has AVX2") + slog.Debug("CPU has AVX2") return "avx2" } if cpu.X86.HasAVX { - slog.Info("CPU has AVX") + slog.Debug("CPU has AVX") return "avx" } - slog.Info("CPU does not have vector extensions") + slog.Debug("CPU does not have vector extensions") // else LCD return "" } diff --git a/gpu/gpu.go b/gpu/gpu.go index f8bae9b0..781e23df 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -31,8 +31,8 @@ type handles struct { } const ( - cudaMinimumMemory = 256 * format.MebiByte - rocmMinimumMemory = 256 * format.MebiByte + cudaMinimumMemory = 457 * format.MebiByte + rocmMinimumMemory = 457 * format.MebiByte ) var gpuMutex sync.Mutex @@ -119,12 +119,12 @@ func initGPUHandles() *handles { return gpuHandles } - slog.Info("Detecting GPUs") + slog.Debug("Detecting GPUs") nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns) if len(nvcudaLibPaths) > 0 { deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) if nvcuda != nil { - slog.Info("detected GPUs", "count", deviceCount, "library", libPath) + slog.Debug("detected GPUs", "count", deviceCount, "library", libPath) gpuHandles.nvcuda = nvcuda gpuHandles.deviceCount = deviceCount return gpuHandles @@ -135,7 +135,7 @@ func initGPUHandles() *handles { if len(cudartLibPaths) > 0 { deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) if cudart != nil { - slog.Info("detected GPUs", "library", libPath, "count", deviceCount) + slog.Debug("detected GPUs", "library", libPath, "count", deviceCount) gpuHandles.cudart = cudart gpuHandles.deviceCount = deviceCount return gpuHandles @@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList { gpuInfo := GpuInfo{ Library: "cuda", } + var driverMajor int + var driverMinor int if gpuHandles.cudart != nil { C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) } else { C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) + driverMajor = int(gpuHandles.nvcuda.driver_major) + driverMinor = int(gpuHandles.nvcuda.driver_minor) } if memInfo.err != nil { slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) @@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList { gpuInfo.TotalMemory = uint64(memInfo.total) gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Major = int(memInfo.major) - gpuInfo.Minor = int(memInfo.minor) + gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory gpuInfo.DependencyPath = depPath + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DriverMajor = int(driverMajor) + gpuInfo.DriverMinor = int(driverMinor) // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... 
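In the CUDA path above, `DriverMajor`/`DriverMinor` are copied from the nvcuda handle, which decodes CUDA's packed integer version (the same major*1000 + minor*10 layout decoded for HIP earlier in the diff), and compute capability is stored as a "major.minor" string in `Compute`. A quick worked example of that arithmetic, using made-up values:

```go
package main

import "fmt"

// decodeDriverVersion applies the arithmetic used for both the HIP and CUDA
// driver APIs: the raw value packs major*1000 + minor*10.
func decodeDriverVersion(version int) (major, minor int) {
	major = version / 1000
	minor = (version - major*1000) / 10
	return major, minor
}

func main() {
	// 12040 is an illustrative raw value; it decodes to driver 12.4.
	major, minor := decodeDriverVersion(12040)
	fmt.Printf("driver %d.%d\n", major, minor)

	// Compute capability is reported separately and formatted the same way,
	// e.g. major 8, minor 6 become "8.6" in GpuInfo.Compute for CUDA devices.
	compute := fmt.Sprintf("%d.%d", 8, 6)
	fmt.Println("compute:", compute)
}
```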
resp = append(resp, gpuInfo) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 0ba02e1b..f8cc1adb 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -15,7 +15,7 @@ import ( ) const ( - metalMinimumMemory = 384 * format.MebiByte + metalMinimumMemory = 512 * format.MebiByte ) func GetGPUInfo() GpuInfoList { diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index 577bd3f0..2fa86f8d 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -39,16 +39,19 @@ extern "C" { #endif #define GPU_ID_LEN 64 +#define GPU_NAME_LEN 96 typedef struct mem_info { char *err; // If non-nill, caller responsible for freeing char gpu_id[GPU_ID_LEN]; + char gpu_name[GPU_NAME_LEN]; uint64_t total; uint64_t free; // Compute Capability int major; int minor; + int patch; } mem_info_t; void cpu_check_ram(mem_info_t *resp); diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c index 81ba3de4..6cbe28b0 100644 --- a/gpu/gpu_info_cpu.c +++ b/gpu/gpu_info_cpu.c @@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) { if (GlobalMemoryStatusEx(&info) != 0) { resp->total = info.ullTotalPhys; resp->free = info.ullAvailPhys; - resp->major = 0; - resp->minor = 0; snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); } else { resp->err = LOAD_ERR(); @@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) { } else { resp->total = info.totalram * info.mem_unit; resp->free = info.freeram * info.mem_unit; - resp->major = 0; - resp->minor = 0; snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); } return; diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index e192d2e6..26d855df 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet}, {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute}, {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid}, + {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName}, {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3}, {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2}, {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy}, @@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { } int version = 0; - nvcudaDriverVersion_t driverVersion; - driverVersion.major = 0; - driverVersion.minor = 0; + resp->ch.driver_major = 0; + resp->ch.driver_minor = 0; // Report driver version if we're in verbose mode, ignore errors ret = (*resp->ch.cuDriverGetVersion)(&version); if (ret != CUDA_SUCCESS) { LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret); } else { - driverVersion.major = version / 1000; - driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; - LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); + resp->ch.driver_major = version / 1000; + resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; + LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor); } ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices); @@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) { return; } - resp->major = 0; - resp->minor = 0; int major = 0; int minor = 0; ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); @@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) { ); } + ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device); + if (ret != CUDA_SUCCESS) { + LOG(h.verbose, "[%d] device name lookup failure: 
%d\n", i, ret); + resp->gpu_name[0] = '\0'; + } + // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h index c4d94edd..2b232839 100644 --- a/gpu/gpu_info_nvcuda.h +++ b/gpu/gpu_info_nvcuda.h @@ -44,12 +44,15 @@ typedef void* CUcontext; typedef struct nvcuda_handle { void *handle; uint16_t verbose; + int driver_major; + int driver_minor; CUresult (*cuInit)(unsigned int Flags); CUresult (*cuDriverGetVersion)(int *driverVersion); CUresult (*cuDeviceGetCount)(int *); CUresult (*cuDeviceGet)(CUdevice* device, int ordinal); CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2 + CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev); // Context specific aspects CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev); diff --git a/gpu/types.go b/gpu/types.go index 7a5d5ba7..af33b896 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -1,5 +1,12 @@ package gpu +import ( + "fmt" + "log/slog" + + "github.com/ollama/ollama/format" +) + type memInfo struct { TotalMemory uint64 `json:"total_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"` @@ -20,11 +27,13 @@ type GpuInfo struct { DependencyPath string `json:"lib_path,omitempty"` // GPU information - ID string `json:"gpu_id"` // string to use for selection of this specific GPU - Name string `json:"name"` // user friendly name if available - Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx) - Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx) - Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD + ID string `json:"gpu_id"` // string to use for selection of this specific GPU + Name string `json:"name"` // user friendly name if available + Compute string `json:"compute"` // Compute Capability or gfx + + // Driver Information - TODO no need to put this on each GPU + DriverMajor int `json:"driver_major,omitempty"` + DriverMinor int `json:"driver_minor,omitempty"` // TODO other performance capability info to help in scheduling decisions } @@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { return resp } +// Report the GPU information into the log an Info level +func (l GpuInfoList) LogDetails() { + for _, g := range l { + slog.Info("inference compute", + "id", g.ID, + "library", g.Library, + "compute", g.Compute, + "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), + "name", g.Name, + "total", format.HumanBytes2(g.TotalMemory), + "available", format.HumanBytes2(g.FreeMemory), + ) + } +} + // Sort by Free Space type ByFreeMemory []GpuInfo diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 110301ab..f6bdb9d4 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -217,7 +217,7 @@ func TestMultiModelStress(t *testing.T) { defer wg.Done() for j := 0; j < 3; j++ { slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model) - DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second) + DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second) } }(i) } diff --git a/integration/utils_test.go b/integration/utils_test.go index e133e76d..c6f19e98 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ 
-85,7 +85,7 @@ func GetTestEndpoint() (*api.Client, string) { var serverMutex sync.Mutex var serverReady bool -func startServer(ctx context.Context, ollamaHost string) error { +func startServer(t *testing.T, ctx context.Context, ollamaHost string) error { // Make sure the server has been built CLIName, err := filepath.Abs("../ollama") if err != nil { @@ -200,7 +200,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin } lifecycle.ServerLogFile = fp.Name() fp.Close() - require.NoError(t, startServer(ctx, testEndpoint)) + require.NoError(t, startServer(t, ctx, testEndpoint)) } return client, testEndpoint, func() { diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index df28c412..0c339989 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -66,7 +66,7 @@ struct server_params { }; bool server_verbose = false; -bool server_log_json = true; +bool server_log_json = false; enum stop_type { STOP_FULL, @@ -266,7 +266,7 @@ struct server_slot { sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_prompt_processing", t_prompt_processing}, @@ -280,7 +280,7 @@ struct server_slot { sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", t_token_generation, n_decoded, t_token, n_tokens_second); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_token_generation", t_token_generation}, @@ -290,7 +290,7 @@ struct server_slot { }); sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - LOG_INFO(buffer, { + LOG_DEBUG(buffer, { {"slot_id", id}, {"task_id", task_id}, {"t_prompt_processing", t_prompt_processing}, @@ -371,7 +371,7 @@ struct llama_server_context { if (clp_ctx) { - LOG_INFO("freeing clip model", {}); + LOG_DEBUG("freeing clip model", {}); clip_free(clp_ctx); clp_ctx = nullptr; } @@ -392,7 +392,7 @@ struct llama_server_context params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_INFO("Multi Modal Mode Enabled", {}); + LOG_DEBUG("Multi Modal Mode Enabled", {}); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); if(clp_ctx == nullptr) { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); @@ -445,7 +445,7 @@ struct llama_server_context const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -454,7 +454,7 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { + LOG_DEBUG("new slot", { {"slot_id", slot.id}, {"n_ctx_slot", slot.n_ctx} }); @@ -468,7 +468,7 @@ struct llama_server_context //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { + LOG_DEBUG("slot self-extend", { {"slot_id", slot.id}, {"ga_n", ga_n}, {"ga_w", ga_w} @@ -827,7 +827,7 @@ struct llama_server_context all_slots_are_idle = false; - LOG_INFO("slot is processing task", { + LOG_DEBUG("slot is processing task", { {"slot_id", slot->id}, {"task_id", slot->task_id}, 
}); @@ -1504,7 +1504,7 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_INFO("slot data", { + LOG_DEBUG("slot data", { {"task_id", task.id}, {"n_idle_slots", n_idle_slots}, {"n_processing_slots", n_processing_slots} @@ -1566,7 +1566,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_INFO("updating system prompt", {}); + LOG_DEBUG("updating system prompt", {}); system_prompt_update(); } @@ -1576,7 +1576,7 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); + LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; @@ -1599,7 +1599,7 @@ struct llama_server_context const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_INFO("slot context shift", { + LOG_DEBUG("slot context shift", { {"slot_id", slot.id}, {"task_id", slot.task_id}, {"n_keep", n_keep}, @@ -1638,7 +1638,7 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_INFO("slot released", { + LOG_DEBUG("slot released", { {"slot_id", slot.id}, {"task_id", slot.task_id}, {"n_ctx", n_ctx}, @@ -1807,7 +1807,7 @@ struct llama_server_context slot.ga_i = ga_i; } - LOG_INFO("slot progression", { + LOG_DEBUG("slot progression", { { "slot_id", slot.id }, { "task_id", slot.task_id }, { "n_past", slot.n_past }, @@ -1822,7 +1822,7 @@ struct llama_server_context if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_INFO("we have to evaluate at least 1 token to generate logits", { + LOG_DEBUG("we have to evaluate at least 1 token to generate logits", { { "slot_id", slot.id }, { "task_id", slot.task_id } }); @@ -1834,7 +1834,7 @@ struct llama_server_context } int p0 = (int) system_tokens.size() + slot.n_past; - LOG_INFO("kv cache rm [p0, end)", { + LOG_DEBUG("kv cache rm [p0, end)", { { "slot_id", slot.id }, { "task_id", slot.task_id }, { "p0", p0 } @@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "-v" || arg == "--verbose") { -#if SERVER_VERBOSE != 1 - LOG_WARNING("server.cpp is not built with verbose logging.", {}); -#else server_verbose = true; -#endif } else if (arg == "--mlock") { @@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, else if (arg == "--log-disable") { log_set_target(stdout); - LOG_INFO("logging to file is disabled.", {}); + LOG_DEBUG("logging to file is disabled.", {}); } else if (arg == "--slots-endpoint-disable") { @@ -2727,12 +2723,12 @@ static json format_detokenized_response(std::string content) static void log_server_request(const httplib::Request &req, const httplib::Response &res) { // skip GH copilot requests when using default port - if (req.path == "/v1/health" || req.path == "/v1/completions") + if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions") { return; } - LOG_INFO("request", { + LOG_DEBUG("request", { {"remote_addr", req.remote_addr}, {"remote_port", req.remote_port}, {"status", res.status}, @@ -3054,6 +3050,26 @@ int main(int argc, char **argv) { log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } + if (sparams.n_threads_http < 1) { + // +2 threads for monitoring endpoints + sparams.n_threads_http = std::max(params.n_parallel 
+ 2, (int32_t) std::thread::hardware_concurrency() - 1); + } + log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); + svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + + LOG_INFO("HTTP server listening", log_data); + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + state.store(SERVER_STATE_ERROR); + return 1; + } + + return 0; + }); + // load the model if (!llama.load_model(params)) { @@ -3258,26 +3274,6 @@ int main(int argc, char **argv) { }*/ //); - if (sparams.n_threads_http < 1) { - // +2 threads for monitoring endpoints - sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; - - LOG_INFO("HTTP server listening", log_data); - // run the HTTP server in a thread - see comment below - std::thread t([&]() - { - if (!svr.listen_after_bind()) - { - state.store(SERVER_STATE_ERROR); - return 1; - } - - return 0; - }); - llama.queue_tasks.on_new_task(std::bind( &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( diff --git a/llm/ext_server/utils.hpp b/llm/ext_server/utils.hpp index bd340656..d63ead04 100644 --- a/llm/ext_server/utils.hpp +++ b/llm/ext_server/utils.hpp @@ -55,9 +55,10 @@ extern bool server_log_json; } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_DEBUG( MSG, ...) 
server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__) enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet @@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line, {"timestamp", time(nullptr)}, }; + if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) { + return; + } + if (server_log_json) { log.merge_patch( { @@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line, std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - if (!extra.empty()) { log.merge_patch(extra); } + std::stringstream ss; - ss << buf << " |"; + ss << level << " [" << function << "] " << message << " |"; for (const auto& el : log.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); diff --git a/llm/ggml.go b/llm/ggml.go index 1c21bde0..40089be2 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -329,7 +329,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(1+4*embedding+context+context*heads), ) - partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128 + partialOffload = max( + 4*batch*(2*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2+3*embedding+context+context*heads), + ) case "stablelm": fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) partialOffload = max( diff --git a/llm/memory.go b/llm/memory.go index 6890b08c..df7081cf 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -12,17 +12,8 @@ import ( // This algorithm looks for a complete fit to determine if we need to unload other models func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) { - var estimatedVRAM uint64 - if opts.NumCtx > int(ggml.KV().ContextLength()) { - slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength()) - opts.NumCtx = int(ggml.KV().ContextLength()) - } - - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - // Split up the GPUs by type and try them + var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts) diff --git a/llm/server.go b/llm/server.go index 78106ea0..33c56f1f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -53,6 +53,7 @@ type llmServer struct { estimatedTotal uint64 // Total size of model totalLayers uint64 gpuCount int + loadDuration time.Duration // Record how long it took the model to load sem *semaphore.Weighted } @@ -76,15 +77,7 @@ func LoadModel(model string) (*GGML, error) { // The gpu list must be a single family. 
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) { var err error - if opts.NumCtx > int(ggml.KV().ContextLength()) { - slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength()) - } - - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - - cpuRunner := "" + var cpuRunner string var estimatedVRAM uint64 var estimatedTotal uint64 var systemMemory uint64 @@ -112,6 +105,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 + } else if gpus[0].Library != "metal" && layers == 0 { + // Don't bother loading into the GPU if no layers can fit + cpuRunner = serverForCpu() + gpuCount = 0 } else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" { opts.NumGPU = layers } @@ -156,11 +153,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr "--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--embedding", } - if envconfig.Debug { - params = append(params, "--log-format", "json") - } else { - params = append(params, "--log-disable") - } + + params = append(params, "--log-disable") if opts.NumGPU >= 0 { params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) @@ -220,7 +214,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if dir == "" { // Shouldn't happen finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers) - slog.Error("sever list inconsistent", "error", finalErr) + slog.Error("server list inconsistent", "error", finalErr) continue } @@ -291,34 +285,29 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, gpuCount: gpuCount, + done: make(chan error, 1), } - s.cmd.Env = os.Environ() s.cmd.Stdout = os.Stdout s.cmd.Stderr = s.status - visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv() - pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) + if v := strings.Join(libraryPaths, string(filepath.ListSeparator)); v != "" { + s.cmd.Env = append(s.cmd.Env, pathEnv+"="+v) + } - // Update or add the path and visible devices variable with our adjusted version - pathNeeded := true - devicesNeeded := visibleDevicesEnv != "" - for i := range s.cmd.Env { - cmp := strings.SplitN(s.cmd.Env[i], "=", 2) - if strings.EqualFold(cmp[0], pathEnv) { - s.cmd.Env[i] = pathEnv + "=" + pathEnvVal - pathNeeded = false - } else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) { - s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal - devicesNeeded = false + if k, v := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv(); k != "" { + s.cmd.Env = append(s.cmd.Env, k+"="+v) + } + + for _, ev := range os.Environ() { + if strings.HasPrefix(ev, "CUDA_") || + strings.HasPrefix(ev, "ROCM_") || + strings.HasPrefix(ev, "HIP_") || + strings.HasPrefix(ev, "HSA_") || + strings.HasPrefix(ev, "GGML_") { + s.cmd.Env = append(s.cmd.Env, ev) } } - if pathNeeded { - s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal) - } - if devicesNeeded { - s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal) - } slog.Info("starting llama server", "cmd", 
s.cmd.String()) // Log at debug as the environment is inherited and might contain sensitive information @@ -339,6 +328,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr continue } + // reap subprocess when it exits + go func() { + s.done <- s.cmd.Wait() + }() + return s, nil } @@ -483,13 +477,11 @@ func (s *llmServer) Ping(ctx context.Context) error { func (s *llmServer) WaitUntilRunning(ctx context.Context) error { start := time.Now() - // TODO we need to wire up a better way to detect hangs during model load and startup of the server expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load - ticker := time.NewTicker(50 * time.Millisecond) - defer ticker.Stop() slog.Info("waiting for llama runner to start responding") var lastStatus ServerStatus = -1 + for { select { case <-ctx.Done(): @@ -501,41 +493,39 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { msg = s.status.LastErrMsg } return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) - case <-ticker.C: - if time.Now().After(expiresAt) { - // timeout - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - return fmt.Errorf("timed out waiting for llama runner to start: %s", msg) + default: + } + if time.Now().After(expiresAt) { + // timeout + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg } - if s.cmd.ProcessState != nil { - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) - } - - c, cancel := context.WithTimeout(ctx, 200*time.Millisecond) - defer cancel() - status, err := s.getServerStatus(c) - if err != nil && lastStatus != status { - slog.Debug("server not yet available", "error", err) - lastStatus = status - continue - } - - switch status { - case ServerStatusLoadingModel: - // TODO - this state never seems to happen with the current server.cpp code (bug?) 
- // it doesn't respond to the health endpoint until after the model is loaded - slog.Debug("loading model") - case ServerStatusReady: - slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds())) - return nil + return fmt.Errorf("timed out waiting for llama runner to start: %s", msg) + } + if s.cmd.ProcessState != nil { + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg } + return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) + } + ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond) + defer cancel() + status, _ := s.getServerStatus(ctx) + if lastStatus != status && status != ServerStatusReady { + // Only log on status changes + slog.Info("waiting for server to become available", "status", status.ToString()) + } + switch status { + case ServerStatusReady: + s.loadDuration = time.Since(start) + slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds())) + return nil + default: + lastStatus = status + time.Sleep(time.Millisecond * 250) + continue } } } @@ -576,10 +566,11 @@ type ImageData struct { } type completion struct { - Content string `json:"content"` - Model string `json:"model"` - Prompt string `json:"prompt"` - Stop bool `json:"stop"` + Content string `json:"content"` + Model string `json:"model"` + Prompt string `json:"prompt"` + Stop bool `json:"stop"` + StoppedLimit bool `json:"stopped_limit"` Timings struct { PredictedN int `json:"predicted_n"` @@ -598,6 +589,7 @@ type CompletionRequest struct { type CompletionResponse struct { Content string + DoneReason string Done bool PromptEvalCount int PromptEvalDuration time.Duration @@ -739,8 +731,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } if c.Stop { + doneReason := "stop" + if c.StoppedLimit { + doneReason = "length" + } + fn(CompletionResponse{ Done: true, + DoneReason: doneReason, PromptEvalCount: c.Timings.PromptN, PromptEvalDuration: parseDurationMs(c.Timings.PromptMS), EvalCount: c.Timings.PredictedN, @@ -935,8 +933,11 @@ func (s *llmServer) Close() error { if err := s.cmd.Process.Kill(); err != nil { return err } - - _ = s.cmd.Wait() + // if ProcessState is already populated, Wait already completed, no need to wait again + if s.cmd.ProcessState == nil { + slog.Debug("waiting for llama server to exit") + <-s.done + } slog.Debug("llama server stopped") } diff --git a/openai/openai.go b/openai/openai.go index 96d8f218..4b335f36 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -107,15 +107,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { Model: r.Model, SystemFingerprint: "fp_ollama", Choices: []Choice{{ - Index: 0, - Message: Message{Role: r.Message.Role, Content: r.Message.Content}, - FinishReason: func(done bool) *string { - if done { - reason := "stop" - return &reason - } - return nil - }(r.Done), + Index: 0, + Message: Message{Role: r.Message.Role, Content: r.Message.Content}, + FinishReason: &r.DoneReason, }}, Usage: Usage{ // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count @@ -135,15 +129,9 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk { SystemFingerprint: "fp_ollama", Choices: []ChunkChoice{ { - Index: 0, - Delta: Message{Role: "assistant", Content: r.Message.Content}, - FinishReason: func(done bool) *string { - if done { - reason := "stop" - return &reason - } - return nil - }(r.Done), + Index: 0, + Delta: 
Message{Role: "assistant", Content: r.Message.Content}, + FinishReason: &r.DoneReason, }, }, } diff --git a/server/images.go b/server/images.go index 2be1d366..3f415b6d 100644 --- a/server/images.go +++ b/server/images.go @@ -565,7 +565,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m } if !envconfig.NoPrune { - if err := deleteUnusedLayers(nil, unref, false); err != nil { + if err := deleteUnusedLayers(nil, unref); err != nil { return err } } @@ -613,7 +613,7 @@ func CopyModel(src, dst model.Name) error { return err } -func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error { +func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error { fp, err := GetManifestPath() if err != nil { return err @@ -660,13 +660,9 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err)) continue } - if !dryRun { - if err := os.Remove(fp); err != nil { - slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err)) - continue - } - } else { - slog.Info(fmt.Sprintf("wanted to remove: %s", fp)) + if err := os.Remove(fp); err != nil { + slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err)) + continue } } @@ -689,14 +685,25 @@ func PruneLayers() error { for _, blob := range blobs { name := blob.Name() name = strings.ReplaceAll(name, "-", ":") - if strings.HasPrefix(name, "sha256:") { - deleteMap[name] = struct{}{} + + _, err := GetBlobsPath(name) + if err != nil { + if errors.Is(err, ErrInvalidDigestFormat) { + // remove invalid blobs (e.g. partial downloads) + if err := os.Remove(filepath.Join(p, blob.Name())); err != nil { + slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err) + } + } + + continue } + + deleteMap[name] = struct{}{} } slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - err = deleteUnusedLayers(nil, deleteMap, false) + err = deleteUnusedLayers(nil, deleteMap) if err != nil { return err } @@ -752,7 +759,7 @@ func DeleteModel(name string) error { } deleteMap[manifest.Config.Digest] = struct{}{} - err = deleteUnusedLayers(&mp, deleteMap, false) + err = deleteUnusedLayers(&mp, deleteMap) if err != nil { return err } @@ -912,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu if noprune == "" { fn(api.ProgressResponse{Status: "removing any unused layers"}) - err = deleteUnusedLayers(nil, deleteMap, false) + err = deleteUnusedLayers(nil, deleteMap) if err != nil { return err } diff --git a/server/modelpath.go b/server/modelpath.go index 86908226..25a817ca 100644 --- a/server/modelpath.go +++ b/server/modelpath.go @@ -154,9 +154,6 @@ func GetBlobsPath(digest string) (string, error) { // only accept actual sha256 digests pattern := "^sha256[:-][0-9a-fA-F]{64}$" re := regexp.MustCompile(pattern) - if err != nil { - return "", err - } if digest != "" && !re.MatchString(digest) { return "", ErrInvalidDigestFormat diff --git a/server/routes.go b/server/routes.go index 7dfeb513..600a30fa 100644 --- a/server/routes.go +++ b/server/routes.go @@ -127,10 +127,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { opts, err := modelOptions(model, req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -156,9 +152,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { 
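The digest check that `GetBlobsPath` and the pruning path rely on is a strict `sha256` pattern; any blob name that does not match (for example a `.partial` download) can be treated as prunable. A standalone sketch of that validation, with hypothetical names mirroring `ErrInvalidDigestFormat`:

```go
package main

import (
	"errors"
	"fmt"
	"regexp"
)

var errInvalidDigestFormat = errors.New("invalid digest format")

// blobDigestRE matches the on-disk blob naming used above: "sha256"
// followed by ":" or "-" and exactly 64 hex characters.
var blobDigestRE = regexp.MustCompile(`^sha256[:-][0-9a-fA-F]{64}$`)

// checkDigest reports whether a blob name is a well-formed sha256 digest;
// callers can treat anything else as an invalid or partial blob.
func checkDigest(name string) error {
	if !blobDigestRE.MatchString(name) {
		return errInvalidDigestFormat
	}
	return nil
}

func main() {
	names := []string{
		"sha256-0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",
		"sha256-deadbeef.partial",
	}
	for _, n := range names {
		fmt.Println(n, "->", checkDigest(n))
	}
}
```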
// of `raw` mode so we need to check for it too if req.Prompt == "" && req.Template == "" && req.System == "" { c.JSON(http.StatusOK, api.GenerateResponse{ - CreatedAt: time.Now().UTC(), - Model: req.Model, - Done: true, + CreatedAt: time.Now().UTC(), + Model: req.Model, + Done: true, + DoneReason: "load", }) return } @@ -226,10 +223,11 @@ func (s *Server) GenerateHandler(c *gin.Context) { } resp := api.GenerateResponse{ - Model: req.Model, - CreatedAt: time.Now().UTC(), - Done: r.Done, - Response: r.Content, + Model: req.Model, + CreatedAt: time.Now().UTC(), + Done: r.Done, + Response: r.Content, + DoneReason: r.DoneReason, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, PromptEvalDuration: r.PromptEvalDuration, @@ -370,10 +368,6 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { opts, err := modelOptions(model, req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -560,7 +554,12 @@ func (s *Server) CreateModelHandler(c *gin.Context) { ctx, cancel := context.WithCancel(c.Request.Context()) defer cancel() - if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil { + quantization := req.Quantization + if req.Quantize != "" { + quantization = req.Quantize + } + + if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil { ch <- gin.H{"error": err.Error()} } }() @@ -740,20 +739,28 @@ func (s *Server) ListModelsHandler(c *gin.Context) { } n := model.ParseNameFromFilepath(rel) + if !n.IsValid() { + slog.Warn("bad manifest filepath", "path", rel) + return nil + } + m, err := ParseNamedManifest(n) if err != nil { - return err + slog.Warn("bad manifest", "name", n, "error", err) + return nil } f, err := m.Config.Open() if err != nil { - return err + slog.Warn("bad manifest config filepath", "name", n, "error", err) + return nil } defer f.Close() var c ConfigV2 if err := json.NewDecoder(f).Decode(&c); err != nil { - return err + slog.Warn("bad manifest config", "name", n, "error", err) + return nil } // tag should never be masked @@ -1037,7 +1044,8 @@ func Serve(ln net.Listener) error { } ctx, done := context.WithCancel(context.Background()) - sched := InitScheduler(ctx) + schedCtx, schedDone := context.WithCancel(ctx) + sched := InitScheduler(schedCtx) s := &Server{addr: ln.Addr(), sched: sched} r := s.GenerateRoutes() @@ -1052,23 +1060,31 @@ func Serve(ln net.Listener) error { go func() { <-signals srvr.Close() - done() + schedDone() sched.unloadAllRunners() gpu.Cleanup() - os.Exit(0) + done() }() if err := llm.Init(); err != nil { return fmt.Errorf("unable to initialize llm library %w", err) } - s.sched.Run(ctx) + s.sched.Run(schedCtx) // At startup we retrieve GPU information so we can get log messages before loading a model // This will log warnings to the log in case we have problems with detected GPUs - _ = gpu.GetGPUInfo() + gpus := gpu.GetGPUInfo() + gpus.LogDetails() - return srvr.Serve(ln) + err = srvr.Serve(ln) + // If server is closed from the signal handler, wait for the ctx to be done + // otherwise error out quickly + if !errors.Is(err, http.ErrServerClosed) { + return err + } + <-ctx.Done() + return err } func waitForStream(c *gin.Context, ch chan interface{}) { @@ -1177,10 +1193,6 @@ func (s *Server) ChatHandler(c *gin.Context) { opts, err := modelOptions(model, 
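The shutdown change splits the context in two so the scheduler can be stopped and GPU state released before the process exits, and serving only returns once that teardown has finished when the HTTP server was closed by the signal handler. A compressed, hypothetical sketch of that ordering — the handler, `cleanup` stub, and listen address are placeholders, not the server's real wiring:

```go
package main

import (
	"context"
	"errors"
	"log/slog"
	"net"
	"net/http"
	"os"
	"os/signal"
	"syscall"
)

// cleanup stands in for stopping the scheduler, unloading runners, and
// releasing GPU resources.
func cleanup() { slog.Info("unloading runners and releasing GPU resources") }

func serve(ln net.Listener) error {
	ctx, done := context.WithCancel(context.Background())
	schedCtx, schedDone := context.WithCancel(ctx)
	_ = schedCtx // a real server would hand this to its scheduler

	srvr := &http.Server{Handler: http.NewServeMux()}

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-signals
		srvr.Close() // stop accepting requests first
		schedDone()  // then stop the scheduler
		cleanup()    // unload models, release GPU state
		done()       // finally allow serve() to return
	}()

	err := srvr.Serve(ln)
	// Serve returns ErrServerClosed on a clean shutdown; in that case wait
	// for the teardown above to finish before returning.
	if !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	<-ctx.Done()
	return err
}

func main() {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}
	slog.Info("listening", "addr", ln.Addr().String())
	if err := serve(ln); err != nil {
		slog.Error("server error", "error", err)
	}
}
```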
req.Options) if err != nil { - if errors.Is(err, api.ErrInvalidOpts) { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) - return - } c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -1222,10 +1234,11 @@ func (s *Server) ChatHandler(c *gin.Context) { // an empty request loads the model if len(req.Messages) == 0 || prompt == "" { resp := api.ChatResponse{ - CreatedAt: time.Now().UTC(), - Model: req.Model, - Done: true, - Message: api.Message{Role: "assistant"}, + CreatedAt: time.Now().UTC(), + Model: req.Model, + Done: true, + DoneReason: "load", + Message: api.Message{Role: "assistant"}, } c.JSON(http.StatusOK, resp) return @@ -1258,10 +1271,11 @@ func (s *Server) ChatHandler(c *gin.Context) { fn := func(r llm.CompletionResponse) { resp := api.ChatResponse{ - Model: req.Model, - CreatedAt: time.Now().UTC(), - Message: api.Message{Role: "assistant", Content: r.Content}, - Done: r.Done, + Model: req.Model, + CreatedAt: time.Now().UTC(), + Message: api.Message{Role: "assistant", Content: r.Content}, + Done: r.Done, + DoneReason: r.DoneReason, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, PromptEvalDuration: r.PromptEvalDuration, diff --git a/server/sched.go b/server/sched.go index c4a071c1..eff2b117 100644 --- a/server/sched.go +++ b/server/sched.go @@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { // allocate a large enough kv cache for all parallel requests + if opts.NumCtx < 4 { + opts.NumCtx = 4 + } + opts.NumCtx = opts.NumCtx * envconfig.NumParallel req := &LlmRequest{ @@ -265,11 +269,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) { s.loadedMu.Lock() slog.Debug("got lock to unload", "model", runner.model) + finished := runner.waitForVRAMRecovery() runner.unload() delete(s.loaded, runner.model) s.loadedMu.Unlock() slog.Debug("runner released", "model", runner.model) runner.refMu.Unlock() + + <-finished slog.Debug("sending an unloaded event", "model", runner.model) s.unloadedCh <- struct{}{} } @@ -465,6 +472,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool return false } +// Free memory reporting on GPUs can lag for a while even after the runner +// exits, so we have to keep checking until we see the available memory recover, +// otherwise subsequent model loads will get far less layers loaded or worse +// case, may completely fall back to CPU mode. +// This routine must be called before the runner unloads so it can establish +// a before and after GPU memory allocation. 
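The scheduler now sizes the KV cache for parallel slots by clamping a degenerate `num_ctx` to a small floor and multiplying by the parallel count. A tiny sketch of that arithmetic, using a hypothetical helper name:

```go
package main

import "fmt"

// scaledNumCtx mirrors the KV-cache sizing above: enforce a small floor on
// the per-request context, then multiply by the number of parallel requests
// so each slot gets its own share of the cache.
func scaledNumCtx(numCtx, numParallel int) int {
	if numCtx < 4 {
		numCtx = 4
	}
	return numCtx * numParallel
}

func main() {
	// e.g. num_ctx=2048 with 4 parallel slots allocates an 8192-token cache
	fmt.Println(scaledNumCtx(2048, 4))
	// a degenerate num_ctx of 0 is clamped before scaling
	fmt.Println(scaledNumCtx(0, 4))
}
```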
The returned channel
+// will be notified when we're done waiting, or have timed out and should
+// proceed anyway
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+	finished := make(chan interface{}, 1)
+
+	// CPU or Metal don't need checking, so no waiting required
+	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+		finished <- struct{}{}
+		return finished
+	}
+	start := time.Now()
+
+	// Establish a baseline before we unload
+	gpusBefore := gpu.GetGPUInfo()
+	var totalMemoryBefore, freeMemoryBefore uint64
+	for _, gpu := range gpusBefore {
+		totalMemoryBefore += gpu.TotalMemory
+		freeMemoryBefore += gpu.FreeMemory
+	}
+	go func() {
+		expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+		ticker := time.NewTicker(250 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			<-ticker.C
+			if time.Now().After(expiresAt) {
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+				finished <- struct{}{}
+				return
+			}
+
+			// Query GPUs, look for free to go back up
+			gpusNow := gpu.GetGPUInfo()
+			var totalMemoryNow, freeMemoryNow uint64
+			for _, gpu := range gpusNow {
+				totalMemoryNow += gpu.TotalMemory
+				freeMemoryNow += gpu.FreeMemory
+			}
+			// If we've recovered at least ~80% of the estimated memory usage, bail out
+			if freeMemoryNow > freeMemoryBefore && float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+				finished <- struct{}{}
+				return
+			}
+		}
+	}()
+	return finished
+}
+
 type ByDuration []*runnerRef
 
 func (a ByDuration) Len() int { return len(a) }
@@ -505,9 +567,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		//   - try subsets of GPUs instead of just falling back to 1 or all in a family
 
 		// Now try all the GPUs
-		if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-			return gl
+		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
+			return sgl
 		}
 	}
 	return nil
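The baseline-before/verify-after idea behind `waitForVRAMRecovery` can be shown without any GPU plumbing: capture the free-memory reading before releasing anything, then block on the returned channel until the reading has recovered by most of the expected amount or a timeout expires. A simulated sketch — `freeVRAM` and `waitForRecovery` are stand-ins, not the scheduler's real API:

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// freeVRAM simulates the GPU free-memory reading, which recovers only some
// time after a runner exits.
var freeVRAM atomic.Uint64

// waitForRecovery samples freeVRAM until it has grown by more than 80% of
// expected bytes over the baseline, or gives up after timeout. The baseline
// must be captured *before* the release happens, which is why the channel is
// created first and only waited on afterwards.
func waitForRecovery(expected uint64, timeout time.Duration) chan struct{} {
	finished := make(chan struct{}, 1)
	before := freeVRAM.Load()
	deadline := time.Now().Add(timeout)
	go func() {
		ticker := time.NewTicker(50 * time.Millisecond)
		defer ticker.Stop()
		for range ticker.C {
			if time.Now().After(deadline) {
				finished <- struct{}{}
				return
			}
			now := freeVRAM.Load()
			if now > before && float64(now-before) > float64(expected)*0.8 {
				finished <- struct{}{}
				return
			}
		}
	}()
	return finished
}

func main() {
	freeVRAM.Store(1 << 30) // 1 GiB free while the model is resident

	// 1. capture the baseline before releasing anything
	recovered := waitForRecovery(6<<30, 2*time.Second)

	// 2. "unload": the driver reports memory back gradually
	go func() {
		time.Sleep(300 * time.Millisecond)
		freeVRAM.Store(7 << 30)
	}()

	// 3. only now block until the memory is actually visible again
	<-recovered
	fmt.Println("safe to schedule the next model load")
}
```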