Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 15:08:27 +00:00)

Compare commits: v0.1.34-al ... v0.1.35-al (47 commits)
Commits in this range:
33d0209023, 879e2caf8c, c4014e73a2, be9efdb981, 074dc3b9d8, 86f9b582d5, 4142c3ef7c, 6602e793c0, ea0fdaed28, 1eb382da5a, bb6fd02298, 7e2bceceee, 30a7d7096c, 200a18820e, e03637176d, c02db93243, ffa4d5134a, 302d7fdbf3, cf442cd57e, 0e1ba65855, 6aad333c63, 4fcc84e67a, 3ae2f441e0, 2abb3f6424, ce3b212d12, 83d6d46e29, 354ad9254e, 58876091f7, dc18eee39d, 8727a9c140, d0425f26cf, cfa84b8470, 1580ed4c06, a7ee84fc31, 84ac7ce139, 788b092c49, 5cde17a096, c3837eb08c, 8cc0ee2efe, d5eec16d23, a3906a6173, daa1a032f7, 6042e8bc57, 920a4b0794, c496967e56, c942e4a07b, bd54b08261
@@ -331,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
+- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
 
 ### Libraries
 
@@ -357,7 +358,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
+- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
+- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript)
 
 ### Mobile
 
 - [Enchanted](https://github.com/AugustDev/enchanted)
api/types.go (33 changed lines)

@@ -4,6 +4,7 @@ import (
     "encoding/json"
     "errors"
     "fmt"
+    "log/slog"
     "math"
     "os"
     "reflect"
@@ -116,6 +117,7 @@ type ChatResponse struct {
     Model string `json:"model"`
     CreatedAt time.Time `json:"created_at"`
     Message Message `json:"message"`
+    DoneReason string `json:"done_reason,omitempty"`
 
     Done bool `json:"done"`
 
@@ -161,7 +163,6 @@ type Runner struct {
     UseNUMA bool `json:"numa,omitempty"`
     NumCtx int `json:"num_ctx,omitempty"`
     NumBatch int `json:"num_batch,omitempty"`
-    NumGQA int `json:"num_gqa,omitempty"`
     NumGPU int `json:"num_gpu,omitempty"`
     MainGPU int `json:"main_gpu,omitempty"`
     LowVRAM bool `json:"low_vram,omitempty"`
@@ -171,11 +172,6 @@ type Runner struct {
     UseMMap bool `json:"use_mmap,omitempty"`
     UseMLock bool `json:"use_mlock,omitempty"`
     NumThread int `json:"num_thread,omitempty"`
-
-    // Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
-    RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
-    // Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
-    RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 }
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -205,10 +201,13 @@ type CreateRequest struct {
     Path string `json:"path"`
     Modelfile string `json:"modelfile"`
     Stream *bool `json:"stream,omitempty"`
-    Quantization string `json:"quantization,omitempty"`
+    Quantize string `json:"quantize,omitempty"`
 
     // Name is deprecated, see Model
     Name string `json:"name"`
+
+    // Quantization is deprecated, see Quantize
+    Quantization string `json:"quantization,omitempty"`
 }
 
 // DeleteRequest is the request passed to [Client.Delete].
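With this rename, clients ask for quantization at create time through `Quantize`; the old `Quantization` field is kept only for backward compatibility. A minimal client-side sketch (it assumes `api.ClientFromEnvironment` and the progress-callback signature accepted by `client.Create`; adjust if the actual API differs):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // assumed helper; honors OLLAMA_HOST
	if err != nil {
		log.Fatal(err)
	}

	req := &api.CreateRequest{
		Name:      "my-quantized-model",
		Modelfile: "FROM llama3",
		Quantize:  "q4_0", // was Quantization before this change
	}

	// Progress callback; signature assumed to match client.Create.
	err = client.Create(context.Background(), req, func(p api.ProgressResponse) error {
		fmt.Println(p.Status)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```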
@@ -314,6 +313,9 @@ type GenerateResponse struct {
     // Done specifies if the response is complete.
     Done bool `json:"done"`
 
+    // DoneReason is the reason the model stopped generating text.
+    DoneReason string `json:"done_reason,omitempty"`
+
     // Context is an encoding of the conversation used in this response; this
     // can be sent in the next request to keep a conversational memory.
     Context []int `json:"context,omitempty"`
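`DoneReason` reports why generation stopped alongside the existing `Done` flag. Continuing the client sketch above, a streaming handler might inspect it like this (the concrete reason strings, e.g. "stop" or "length", are assumptions, not documented here):

```go
// Streaming generate; the callback runs once per response chunk.
err = client.Generate(context.Background(), &api.GenerateRequest{
	Model:  "llama3",
	Prompt: "Why is the sky blue?",
}, func(r api.GenerateResponse) error {
	fmt.Print(r.Response)
	if r.Done {
		fmt.Println("\nstopped because:", r.DoneReason)
	}
	return nil
})
```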
@@ -359,8 +361,6 @@ func (m *Metrics) Summary() {
     }
 }
 
-// ErrInvalidOpts is returned when invalid options are passed to the client.
-var ErrInvalidOpts = errors.New("invalid options")
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
 func (opts *Options) FromMap(m map[string]interface{}) error {
@@ -376,9 +376,13 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
         }
     }
 
-    invalidOpts := []string{}
     for key, val := range m {
-        if opt, ok := jsonOpts[key]; ok {
+        opt, ok := jsonOpts[key]
+        if !ok {
+            slog.Warn("invalid option provided", "option", opt.Name)
+            continue
+        }
+
         field := valueOpts.FieldByName(opt.Name)
         if field.IsValid() && field.CanSet() {
             if val == nil {
@@ -435,14 +439,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
                 return fmt.Errorf("unknown type loading config params: %v", field.Kind())
             }
         }
-        } else {
-            invalidOpts = append(invalidOpts, key)
-        }
     }
 
-    if len(invalidOpts) > 0 {
-        return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-    }
     return nil
 }
 
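The net effect of the `FromMap` changes: an unrecognized option no longer aborts the whole request with `ErrInvalidOpts`; it is warned about and skipped while the valid keys are still applied. A rough sketch of the new behavior:

```go
opts := api.DefaultOptions()
err := opts.FromMap(map[string]interface{}{
	"temperature":   0.2,  // known option, applied
	"not_an_option": true, // previously returned ErrInvalidOpts; now only logged as a warning
})
fmt.Println(err, opts.Temperature) // <nil> 0.2
```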
@@ -475,7 +473,6 @@ func DefaultOptions() Options {
         NumCtx: 2048,
         NumBatch: 512,
         NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
-        NumGQA: 1,
         NumThread: 0, // let the runtime decide
         LowVRAM: false,
         F16KV: true,
@@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
         return nil
     }
 
-    quantization, _ := cmd.Flags().GetString("quantization")
+    quantize, _ := cmd.Flags().GetString("quantize")
 
-    request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
+    request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize}
     if err := client.Create(cmd.Context(), &request, fn); err != nil {
         return err
     }
@@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command {
     }
 
     createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
-    createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
+    createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
 
     showCmd := &cobra.Command{
         Use: "show MODEL",
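On the CLI this is the `-q`/`--quantize` flag on `ollama create`, e.g. `ollama create my-model -f Modelfile -q q4_0`; only the long flag name moves from `--quantization` to `--quantize`, the `-q` shorthand is unchanged.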
@@ -6,7 +6,7 @@
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
-* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
+* [Docker Documentation](./docker.md)
 
 ### Reference
 
@@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{
     "numa": false,
     "num_ctx": 1024,
     "num_batch": 2,
-    "num_gqa": 1,
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
@@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
-    "rope_frequency_base": 1.1,
-    "rope_frequency_scale": 0.8,
     "num_thread": 8
   }
 }'
docs/docker.md (new file, 71 lines)

@@ -0,0 +1,71 @@
+# Ollama Docker image
+
+### CPU only
+
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### Nvidia GPU
+Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
+
+#### Install with Apt
+1. Configure the repository
+```bash
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+```
+2. Install the NVIDIA Container Toolkit packages
+```bash
+sudo apt-get install -y nvidia-container-toolkit
+```
+
+#### Install with Yum or Dnf
+1. Configure the repository
+
+```bash
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+```
+
+2. Install the NVIDIA Container Toolkit packages
+
+```bash
+sudo yum install -y nvidia-container-toolkit
+```
+
+#### Configure Docker to use Nvidia driver
+```
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+#### Start the container
+
+```bash
+docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### AMD GPU
+
+To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
+
+```
+docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
+```
+
+### Run model locally
+
+Now you can run a model:
+
+```
+docker exec -it ollama ollama run llama3
+```
+
+### Try different models
+
+More models can be found on the [Ollama library](https://ollama.com/library).
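Once the container is up, the same API used elsewhere in these docs is reachable on the published port. A small standard-library Go sketch against `/api/generate` (it assumes the `llama3` model has already been pulled inside the container):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Non-streaming request to the container started above.
	body := []byte(`{"model": "llama3", "prompt": "Why is the sky blue?", "stream": false}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	out, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out)) // JSON including "response", "done" and "done_reason"
}
```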
@@ -83,3 +83,22 @@ If your system is configured with the "noexec" flag where Ollama stores its
 temporary executable files, you can specify an alternate location by setting
 OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
 OLLAMA_TMPDIR=/usr/share/ollama/
+
+## Container fails to run on NVIDIA GPU
+
+Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+
+Sometimes the container runtime can have difficulties initializing the GPU.
+When you check the server logs, this can show up as various error codes, such
+as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
+(unknown), or others. The following troubleshooting techniques may help resolve
+the problem.
+
+- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
@@ -5,13 +5,13 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
 To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
 
 ```bash
-npm install langchain
+npm install @langchain/community
 ```
 
 Now we can start building out our JavaScript:
 
 ```javascript
-import { Ollama } from "langchain/llms/ollama";
+import { Ollama } from "@langchain/community/llms/ollama";
 
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
(deleted file)

@@ -1,10 +0,0 @@
-# Bash Shell examples
-
-When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
-
-`ollama run llama3 < sourcequestions.txt`
-
-This concept is used in the following example.
-
-## Compare Models
-`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.
(deleted file)

@@ -1,64 +0,0 @@
-#! /usr/bin/env bash
-# Compare multiple models by running them with the same questions
-
-NUMBEROFCHOICES=4
-SELECTIONS=()
-declare -a SUMS=()
-
-# Get the list of models
-CHOICES=$(ollama list | awk '{print $1}')
-
-# Select which models to run as a comparison
-echo "Select $NUMBEROFCHOICES models to compare:"
-select ITEM in $CHOICES; do
-    if [[ -n $ITEM ]]; then
-        echo "You have selected $ITEM"
-        SELECTIONS+=("$ITEM")
-        ((COUNT++))
-        if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
-            break
-        fi
-    else
-        echo "Invalid selection"
-    fi
-done
-
-# Loop through each of the selected models
-for ITEM in "${SELECTIONS[@]}"; do
-    echo "--------------------------------------------------------------"
-    echo "Loading the model $ITEM into memory"
-    ollama run "$ITEM" ""
-    echo "--------------------------------------------------------------"
-    echo "Running the questions through the model $ITEM"
-    COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
-
-    # eval duration is sometimes listed in seconds and sometimes in milliseconds.
-    # Add up the values for each model
-    SUM=$(echo "$COMMAND_OUTPUT" | awk '
-    /eval duration:/ {
-        value = $3
-        if (index(value, "ms") > 0) {
-            gsub("ms", "", value)
-            value /= 1000
-        } else {
-            gsub("s", "", value)
-        }
-        sum += value
-    }
-    END { print sum }')
-
-
-    SUMS+=("All questions for $ITEM completed in $SUM seconds")
-done
-
-echo ""
-echo "--------------------------------------------------------------"
-echo -e "Sums of eval durations for each run:"
-for val in "${SUMS[@]}"; do
-    echo "$val"
-done
-
-echo "--------------------------------------------------------------"
-echo "Comparison complete. Now you can decide"
-echo "which model is best."
-echo "--------------------------------------------------------------"
(deleted file)

@@ -1,7 +0,0 @@
-Why is the sky blue
-What is a black hole
-Explain the big bang theory like I am 5?
-What is the quickest way to win a game of Monopoly with 3 others?
-Why does a vacuum bottle keep my coffee hot and my milkshake cold?
-What is the difference between a meteor, a meteorite, and a meteoroid?
-Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.
@@ -3,7 +3,6 @@ package gpu
 import (
     "fmt"
     "log/slog"
-    "strconv"
     "syscall"
     "unsafe"
 
@@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
     hl.dll = 0
 }
 
-func (hl *HipLib) AMDDriverVersion() (string, error) {
+func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
     if hl.dll == 0 {
-        return "", fmt.Errorf("dll has been unloaded")
+        return 0, 0, fmt.Errorf("dll has been unloaded")
     }
     var version int
     status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
     if status != hipSuccess {
-        return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
+        return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
     }
-    return strconv.Itoa(version), nil
+
+    slog.Debug("hipDriverGetVersion", "version", version)
+    // TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
+    driverMajor = version / 1000
+    driverMinor = (version - (driverMajor * 1000)) / 10
+
+    return driverMajor, driverMinor, nil
 }
 
 func (hl *HipLib) HipGetDeviceCount() int {
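The packed integer reported by `hipDriverGetVersion` is split with plain integer arithmetic (thousands for the major version, tens for the minor). An illustrative sketch with a made-up value:

```go
version := 60140 // hypothetical packed value from hipDriverGetVersion
driverMajor := version / 1000                    // 60
driverMinor := (version - driverMajor*1000) / 10 // 14
fmt.Printf("%d.%d\n", driverMajor, driverMinor)  // "60.14"
```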
@@ -8,6 +8,7 @@ import (
     "log/slog"
     "os"
     "path/filepath"
+    "regexp"
     "slices"
     "strconv"
     "strings"
@@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
     }
 
     // Opportunistic logging of driver version to aid in troubleshooting
-    ver, err := AMDDriverVersion()
-    if err == nil {
-        slog.Info("AMD Driver: " + ver)
-    } else {
+    driverMajor, driverMinor, err := AMDDriverVersion()
+    if err != nil {
         // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
         slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
     }
@@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
     scanner := bufio.NewScanner(fp)
     isCPU := false
     var major, minor, patch uint64
+    var vendor, device uint64
     for scanner.Scan() {
         line := strings.TrimSpace(scanner.Text())
         // Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
                 slog.Debug("malformed int " + line)
                 continue
             }
+        } else if strings.HasPrefix(line, "vendor_id") {
+            ver := strings.Fields(line)
+            if len(ver) != 2 {
+                slog.Debug("malformed vendor_id", "vendor_id", line)
+                continue
+            }
+            vendor, err = strconv.ParseUint(ver[1], 10, 32)
+            if err != nil {
+                slog.Debug("malformed vendor_id" + line)
+            }
+        } else if strings.HasPrefix(line, "device_id") {
+            ver := strings.Fields(line)
+            if len(ver) != 2 {
+                slog.Debug("malformed device_id", "device_id", line)
+                continue
+            }
+            device, err = strconv.ParseUint(ver[1], 10, 32)
+            if err != nil {
+                slog.Debug("malformed device_id" + line)
+            }
         }
 
         // TODO - any other properties we want to extract and record?
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
     }
 
     if int(major) < RocmComputeMin {
-        slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
+        slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
         continue
     }
 
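The format-string fix matters because the minor and patch components of an AMD gfx target are written as hexadecimal digits; `gfx%d%x%x` is also the format used for the new `Compute` field below, so the warning and the recorded value now agree. A small illustration:

```go
major, minor, patch := uint64(9), uint64(0), uint64(0xa)
fmt.Println(fmt.Sprintf("gfx%d%x%x", major, minor, patch)) // "gfx90a"
```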
@@ -210,12 +230,17 @@ func AMDGetGPUInfo() []GpuInfo {
 
     // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
     if totalMemory < IGPUMemLimit {
-        slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+        slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
         continue
     }
+    var name string
+    // TODO - PCI ID lookup
+    if vendor > 0 && device > 0 {
+        name = fmt.Sprintf("%04x:%04x", vendor, device)
+    }
 
-    slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-    slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+    slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+    slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
     gpuInfo := GpuInfo{
         Library: "rocm",
         memInfo: memInfo{
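When the sysfs properties expose numeric vendor and device IDs, the GPU name is rendered as a PCI-style hex pair. Illustrative values (0x1002 is the PCI vendor ID assigned to AMD/ATI):

```go
vendor, device := uint64(4098), uint64(29772) // hypothetical values parsed from sysfs
name := fmt.Sprintf("%04x:%04x", vendor, device)
fmt.Println(name) // "1002:744c"
```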
@@ -223,11 +248,11 @@ func AMDGetGPUInfo() []GpuInfo {
             FreeMemory: (totalMemory - usedMemory),
         },
         ID: fmt.Sprintf("%d", gpuID),
-        // Name: not exposed in sysfs directly, would require pci device id lookup
-        Major: int(major),
-        Minor: int(minor),
-        Patch: int(patch),
+        Name: name,
+        Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
         MinimumMemory: rocmMinimumMemory,
+        DriverMajor: driverMajor,
+        DriverMinor: driverMinor,
     }
 
     // If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
         }
         slog.Debug("rocm supported GPUs", "types", supported)
     }
-    gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
+    gfx := gpuInfo.Compute
     if !slices.Contains[[]string, string](supported, gfx) {
         slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
         // TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
         slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
     }
 } else {
-    slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+    slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 }
 
 // The GPU has passed all the verification steps and is supported
@@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
     return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
 
-func AMDDriverVersion() (string, error) {
-    _, err := os.Stat(DriverVersionFile)
+func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
+    _, err = os.Stat(DriverVersionFile)
     if err != nil {
-        return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
+        return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
     }
     fp, err := os.Open(DriverVersionFile)
     if err != nil {
-        return "", err
+        return 0, 0, err
    }
    defer fp.Close()
    verString, err := io.ReadAll(fp)
    if err != nil {
-        return "", err
+        return 0, 0, err
    }
-    return strings.TrimSpace(string(verString)), nil
+
+    pattern := `\A(\d+)\.(\d+).*`
+    regex := regexp.MustCompile(pattern)
+    match := regex.FindStringSubmatch(string(verString))
+    if len(match) < 2 {
+        return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
+    }
+    driverMajor, err = strconv.Atoi(match[1])
+    if err != nil {
+        return 0, 0, err
+    }
+    driverMinor, err = strconv.Atoi(match[2])
+    if err != nil {
+        return 0, 0, err
+    }
+    return driverMajor, driverMinor, nil
 }
@@ -7,7 +7,6 @@ import (
     "os"
     "path/filepath"
     "slices"
-    "strconv"
     "strings"
 
     "github.com/ollama/ollama/format"
@@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
     }
     defer hl.Release()
 
-    ver, err := hl.AMDDriverVersion()
-    if err == nil {
-        slog.Info("AMD Driver: " + ver)
-    } else {
-        // For now this is benign, but we may eventually need to fail compatibility checks
-        slog.Debug("error looking up amd driver version", "error", err)
-    }
+    // TODO - this reports incorrect version information, so omitting for now
+    // driverMajor, driverMinor, err := hl.AMDDriverVersion()
+    // if err != nil {
+    //     // For now this is benign, but we may eventually need to fail compatibility checks
+    //     slog.Debug("error looking up amd driver version", "error", err)
+    // }
 
     // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
     count := hl.HipGetDeviceCount()
@@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
             return nil
         }
     } else {
-        slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+        slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
     }
 
-    slog.Info("detected hip devices", "count", count)
+    slog.Debug("detected hip devices", "count", count)
     // TODO how to determine the underlying device ID when visible devices is causing this to subset?
     for i := 0; i < count; i++ {
         err = hl.HipSetDevice(i)
@@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
         // Can luid be used on windows for setting visible devices (and is it actually set?)
         n = bytes.IndexByte(props.GcnArchName[:], 0)
         gfx := string(props.GcnArchName[:n])
-        slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
-        var major, minor, patch string
-        switch len(gfx) {
-        case 6:
-            major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
-        case 7:
-            major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
-        }
+        slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
         //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
         // TODO Why isn't props.iGPU accurate!?
         if strings.EqualFold(name, iGPUName) {
-            slog.Info("iGPU detected skipping", "id", i)
+            slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
             continue
         }
         if gfxOverride == "" {
@@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
                 slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
                 continue
             } else {
-                slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+                slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
             }
         }
 
@@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {
 
         // TODO revisit this once ROCm v6 is available on windows.
         // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
-        slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-        slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
+        slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+        slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
         gpuInfo := GpuInfo{
             Library: "rocm",
             memInfo: memInfo{
@@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
             ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
             DependencyPath: libDir,
             MinimumMemory: rocmMinimumMemory,
-        }
-        if major != "" {
-            gpuInfo.Major, err = strconv.Atoi(major)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            }
-        }
-        if minor != "" {
-            gpuInfo.Minor, err = strconv.Atoi(minor)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            }
-        }
-        if patch != "" {
-            // Patch rev is hex; e.g. gfx90a
-            p, err := strconv.ParseInt(patch, 16, 0)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            } else {
-                gpuInfo.Patch = int(p)
-            }
-        }
-        if gpuInfo.Major < RocmComputeMin {
-            slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
-            continue
+            Name: name,
+            Compute: gfx,
+
+            // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+            // DriverMajor: driverMajor,
+            // DriverMinor: driverMinor,
         }
 
         resp = append(resp, gpuInfo)
@@ -8,14 +8,14 @@ import (
 
 func GetCPUVariant() string {
     if cpu.X86.HasAVX2 {
-        slog.Info("CPU has AVX2")
+        slog.Debug("CPU has AVX2")
         return "avx2"
     }
     if cpu.X86.HasAVX {
-        slog.Info("CPU has AVX")
+        slog.Debug("CPU has AVX")
         return "avx"
     }
-    slog.Info("CPU does not have vector extensions")
+    slog.Debug("CPU does not have vector extensions")
     // else LCD
     return ""
 }
gpu/gpu.go (20 changed lines)

@@ -31,8 +31,8 @@ type handles struct {
 }
 
 const (
-    cudaMinimumMemory = 256 * format.MebiByte
-    rocmMinimumMemory = 256 * format.MebiByte
+    cudaMinimumMemory = 457 * format.MebiByte
+    rocmMinimumMemory = 457 * format.MebiByte
 )
 
 var gpuMutex sync.Mutex
@@ -119,12 +119,12 @@ func initGPUHandles() *handles {
         return gpuHandles
     }
 
-    slog.Info("Detecting GPUs")
+    slog.Debug("Detecting GPUs")
     nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
     if len(nvcudaLibPaths) > 0 {
         deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
         if nvcuda != nil {
-            slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
+            slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
             gpuHandles.nvcuda = nvcuda
             gpuHandles.deviceCount = deviceCount
             return gpuHandles
@@ -135,7 +135,7 @@ func initGPUHandles() *handles {
     if len(cudartLibPaths) > 0 {
         deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
         if cudart != nil {
-            slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
+            slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
             gpuHandles.cudart = cudart
             gpuHandles.deviceCount = deviceCount
             return gpuHandles
@@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList {
         gpuInfo := GpuInfo{
             Library: "cuda",
         }
+        var driverMajor int
+        var driverMinor int
         if gpuHandles.cudart != nil {
             C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
         } else {
             C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+            driverMajor = int(gpuHandles.nvcuda.driver_major)
+            driverMinor = int(gpuHandles.nvcuda.driver_minor)
         }
         if memInfo.err != nil {
             slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList {
         gpuInfo.TotalMemory = uint64(memInfo.total)
         gpuInfo.FreeMemory = uint64(memInfo.free)
         gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-        gpuInfo.Major = int(memInfo.major)
-        gpuInfo.Minor = int(memInfo.minor)
+        gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
         gpuInfo.MinimumMemory = cudaMinimumMemory
         gpuInfo.DependencyPath = depPath
+        gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+        gpuInfo.DriverMajor = int(driverMajor)
+        gpuInfo.DriverMinor = int(driverMinor)
 
         // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
         resp = append(resp, gpuInfo)
@@ -15,7 +15,7 @@ import (
 )
 
 const (
-    metalMinimumMemory = 384 * format.MebiByte
+    metalMinimumMemory = 512 * format.MebiByte
 )
 
 func GetGPUInfo() GpuInfoList {
@@ -39,16 +39,19 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GPU_ID_LEN 64
|
#define GPU_ID_LEN 64
|
||||||
|
#define GPU_NAME_LEN 96
|
||||||
|
|
||||||
typedef struct mem_info {
|
typedef struct mem_info {
|
||||||
char *err; // If non-nill, caller responsible for freeing
|
char *err; // If non-nill, caller responsible for freeing
|
||||||
char gpu_id[GPU_ID_LEN];
|
char gpu_id[GPU_ID_LEN];
|
||||||
|
char gpu_name[GPU_NAME_LEN];
|
||||||
uint64_t total;
|
uint64_t total;
|
||||||
uint64_t free;
|
uint64_t free;
|
||||||
|
|
||||||
// Compute Capability
|
// Compute Capability
|
||||||
int major;
|
int major;
|
||||||
int minor;
|
int minor;
|
||||||
|
int patch;
|
||||||
} mem_info_t;
|
} mem_info_t;
|
||||||
|
|
||||||
void cpu_check_ram(mem_info_t *resp);
|
void cpu_check_ram(mem_info_t *resp);
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) {
     if (GlobalMemoryStatusEx(&info) != 0) {
         resp->total = info.ullTotalPhys;
         resp->free = info.ullAvailPhys;
-        resp->major = 0;
-        resp->minor = 0;
         snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
     } else {
         resp->err = LOAD_ERR();
@@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) {
     } else {
         resp->total = info.totalram * info.mem_unit;
         resp->free = info.freeram * info.mem_unit;
-        resp->major = 0;
-        resp->minor = 0;
         snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
     }
     return;
@@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
     {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
     {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+    {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
     {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
     {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
     {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
@@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   }
 
   int version = 0;
-  nvcudaDriverVersion_t driverVersion;
-  driverVersion.major = 0;
-  driverVersion.minor = 0;
+  resp->ch.driver_major = 0;
+  resp->ch.driver_minor = 0;
 
   // Report driver version if we're in verbose mode, ignore errors
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
-    driverVersion.major = version / 1000;
-    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+    resp->ch.driver_major = version / 1000;
+    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }
 
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
@@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
     return;
   }
 
-  resp->major = 0;
-  resp->minor = 0;
   int major = 0;
   int minor = 0;
   ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
@@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
     );
   }
 
+  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
+    resp->gpu_name[0] = '\0';
+  }
+
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
@@ -44,12 +44,15 @@ typedef void* CUcontext;
 typedef struct nvcuda_handle {
   void *handle;
   uint16_t verbose;
+  int driver_major;
+  int driver_minor;
   CUresult (*cuInit)(unsigned int Flags);
   CUresult (*cuDriverGetVersion)(int *driverVersion);
   CUresult (*cuDeviceGetCount)(int *);
   CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
   CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
   CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
 
   // Context specific aspects
   CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
gpu/types.go (30 changed lines)

@@ -1,5 +1,12 @@
 package gpu
 
+import (
+    "fmt"
+    "log/slog"
+
+    "github.com/ollama/ollama/format"
+)
+
 type memInfo struct {
     TotalMemory uint64 `json:"total_memory,omitempty"`
     FreeMemory uint64 `json:"free_memory,omitempty"`
@@ -22,9 +29,11 @@ type GpuInfo struct {
     // GPU information
     ID string `json:"gpu_id"` // string to use for selection of this specific GPU
     Name string `json:"name"` // user friendly name if available
-    Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx)
-    Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
-    Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD
+    Compute string `json:"compute"` // Compute Capability or gfx
+
+    // Driver Information - TODO no need to put this on each GPU
+    DriverMajor int `json:"driver_major,omitempty"`
+    DriverMinor int `json:"driver_minor,omitempty"`
 
     // TODO other performance capability info to help in scheduling decisions
 }
@@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
     return resp
 }
 
+// Report the GPU information into the log an Info level
+func (l GpuInfoList) LogDetails() {
+    for _, g := range l {
+        slog.Info("inference compute",
+            "id", g.ID,
+            "library", g.Library,
+            "compute", g.Compute,
+            "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
+            "name", g.Name,
+            "total", format.HumanBytes2(g.TotalMemory),
+            "available", format.HumanBytes2(g.FreeMemory),
+        )
+    }
+}
+
 // Sort by Free Space
 type ByFreeMemory []GpuInfo
 
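`LogDetails` emits one structured "inference compute" line per detected GPU using the new `Compute`, `Name` and driver fields. A sketch of a call from inside the `gpu` package (the values are made up, and the exact slog output depends on the configured handler):

```go
gpus := GpuInfoList{
	{
		memInfo:     memInfo{TotalMemory: 24 << 30, FreeMemory: 23 << 30}, // hypothetical 24/23 GiB
		Library:     "rocm",
		ID:          "0",
		Name:        "1002:744c",
		Compute:     "gfx1100",
		DriverMajor: 6,
		DriverMinor: 3,
	},
}
gpus.LogDetails()
// roughly: level=INFO msg="inference compute" id=0 library=rocm compute=gfx1100 driver=6.3 name=1002:744c total="24.0 GiB" available="23.0 GiB"
```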
@@ -217,7 +217,7 @@ func TestMultiModelStress(t *testing.T) {
             defer wg.Done()
             for j := 0; j < 3; j++ {
                 slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
-                DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+                DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
             }
         }(i)
     }
@@ -85,7 +85,7 @@ func GetTestEndpoint() (*api.Client, string) {
 var serverMutex sync.Mutex
 var serverReady bool
 
-func startServer(ctx context.Context, ollamaHost string) error {
+func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
     // Make sure the server has been built
     CLIName, err := filepath.Abs("../ollama")
     if err != nil {
@@ -200,7 +200,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
     }
     lifecycle.ServerLogFile = fp.Name()
     fp.Close()
-    require.NoError(t, startServer(ctx, testEndpoint))
+    require.NoError(t, startServer(t, ctx, testEndpoint))
 }
 
 return client, testEndpoint, func() {
llm/ext_server/server.cpp (vendored, 86 changed lines)

@@ -66,7 +66,7 @@ struct server_params {
 };
 
 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;
 
 enum stop_type {
     STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
         sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
         sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
                 t_token_generation, n_decoded,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
         });
 
         sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
     {
         if (clp_ctx)
         {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
            clip_free(clp_ctx);
            clp_ctx = nullptr;
         }
@@ -392,7 +392,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context
 
         const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
             server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                 {"slot_id", slot.id},
                 {"n_ctx_slot", slot.n_ctx}
             });
@@ -468,7 +468,7 @@ struct llama_server_context
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
 
-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                     {"slot_id", slot.id},
                     {"ga_n", ga_n},
                     {"ga_w", ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context
 
         all_slots_are_idle = false;
 
-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
             {"slot_id", slot->id},
             {"task_id", slot->task_id},
         });
@@ -1504,7 +1504,7 @@ struct llama_server_context
                     }
                     slots_data.push_back(slot_data);
                 }
-                LOG_INFO("slot data", {
+                LOG_DEBUG("slot data", {
                     {"task_id", task.id},
                     {"n_idle_slots", n_idle_slots},
                     {"n_processing_slots", n_processing_slots}
@@ -1566,7 +1566,7 @@ struct llama_server_context
     bool update_slots() {
         if (system_need_update)
         {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
             system_prompt_update();
         }
 
@@ -1576,7 +1576,7 @@ struct llama_server_context
         {
             if (system_prompt.empty() && clean_kv_cache)
             {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                 kv_cache_clear();
             }
             return true;
@@ -1599,7 +1599,7 @@ struct llama_server_context
             const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
             const int n_discard = n_left / 2;
 
-            LOG_INFO("slot context shift", {
+            LOG_DEBUG("slot context shift", {
                 {"slot_id", slot.id},
                 {"task_id", slot.task_id},
                 {"n_keep", n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
             slot.command = NONE;
|
slot.command = NONE;
|
||||||
slot.t_last_used = ggml_time_us();
|
slot.t_last_used = ggml_time_us();
|
||||||
|
|
||||||
LOG_INFO("slot released", {
|
LOG_DEBUG("slot released", {
|
||||||
{"slot_id", slot.id},
|
{"slot_id", slot.id},
|
||||||
{"task_id", slot.task_id},
|
{"task_id", slot.task_id},
|
||||||
{"n_ctx", n_ctx},
|
{"n_ctx", n_ctx},
|
||||||
@@ -1807,7 +1807,7 @@ struct llama_server_context
|
|||||||
slot.ga_i = ga_i;
|
slot.ga_i = ga_i;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("slot progression", {
|
LOG_DEBUG("slot progression", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "n_past", slot.n_past },
|
{ "n_past", slot.n_past },
|
||||||
@@ -1822,7 +1822,7 @@ struct llama_server_context
|
|||||||
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
|
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id }
|
{ "task_id", slot.task_id }
|
||||||
});
|
});
|
||||||
@@ -1834,7 +1834,7 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||||
LOG_INFO("kv cache rm [p0, end)", {
|
LOG_DEBUG("kv cache rm [p0, end)", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "p0", p0 }
|
{ "p0", p0 }
|
||||||
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
}
|
}
|
||||||
else if (arg == "-v" || arg == "--verbose")
|
else if (arg == "-v" || arg == "--verbose")
|
||||||
{
|
{
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
LOG_WARNING("server.cpp is not built with verbose logging.", {});
|
|
||||||
#else
|
|
||||||
server_verbose = true;
|
server_verbose = true;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
else if (arg == "--mlock")
|
else if (arg == "--mlock")
|
||||||
{
|
{
|
||||||
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
else if (arg == "--log-disable")
|
else if (arg == "--log-disable")
|
||||||
{
|
{
|
||||||
log_set_target(stdout);
|
log_set_target(stdout);
|
||||||
LOG_INFO("logging to file is disabled.", {});
|
LOG_DEBUG("logging to file is disabled.", {});
|
||||||
}
|
}
|
||||||
else if (arg == "--slots-endpoint-disable")
|
else if (arg == "--slots-endpoint-disable")
|
||||||
{
|
{
|
||||||
@@ -2727,12 +2723,12 @@ static json format_detokenized_response(std::string content)
|
|||||||
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
||||||
{
|
{
|
||||||
// skip GH copilot requests when using default port
|
// skip GH copilot requests when using default port
|
||||||
if (req.path == "/v1/health" || req.path == "/v1/completions")
|
if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("request", {
|
LOG_DEBUG("request", {
|
||||||
{"remote_addr", req.remote_addr},
|
{"remote_addr", req.remote_addr},
|
||||||
{"remote_port", req.remote_port},
|
{"remote_port", req.remote_port},
|
||||||
{"status", res.status},
|
{"status", res.status},
|
||||||
@@ -3054,6 +3050,26 @@ int main(int argc, char **argv) {
|
|||||||
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sparams.n_threads_http < 1) {
|
||||||
|
// +2 threads for monitoring endpoints
|
||||||
|
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||||
|
}
|
||||||
|
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
|
||||||
|
svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
|
||||||
|
|
||||||
|
LOG_INFO("HTTP server listening", log_data);
|
||||||
|
// run the HTTP server in a thread - see comment below
|
||||||
|
std::thread t([&]()
|
||||||
|
{
|
||||||
|
if (!svr.listen_after_bind())
|
||||||
|
{
|
||||||
|
state.store(SERVER_STATE_ERROR);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
});
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
if (!llama.load_model(params))
|
if (!llama.load_model(params))
|
||||||
{
|
{
|
||||||
@@ -3258,26 +3274,6 @@ int main(int argc, char **argv) {
|
|||||||
}*/
|
}*/
|
||||||
//);
|
//);
|
||||||
|
|
||||||
if (sparams.n_threads_http < 1) {
|
|
||||||
// +2 threads for monitoring endpoints
|
|
||||||
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
|
||||||
}
|
|
||||||
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
|
|
||||||
svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
|
|
||||||
|
|
||||||
LOG_INFO("HTTP server listening", log_data);
|
|
||||||
// run the HTTP server in a thread - see comment below
|
|
||||||
std::thread t([&]()
|
|
||||||
{
|
|
||||||
if (!svr.listen_after_bind())
|
|
||||||
{
|
|
||||||
state.store(SERVER_STATE_ERROR);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
});
|
|
||||||
|
|
||||||
llama.queue_tasks.on_new_task(std::bind(
|
llama.queue_tasks.on_new_task(std::bind(
|
||||||
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
||||||
llama.queue_tasks.on_finish_multitask(std::bind(
|
llama.queue_tasks.on_finish_multitask(std::bind(
|
||||||
|
|||||||
13 llm/ext_server/utils.hpp vendored

@@ -55,9 +55,10 @@ extern bool server_log_json;
 } while (0)
 #endif

-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG( MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)

 enum server_state {
 SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet

@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
 {"timestamp", time(nullptr)},
 };

+if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
+return;
+}
+
 if (server_log_json) {
 log.merge_patch(
 {

@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,

 std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
 } else {
-char buf[1024];
-snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

 if (!extra.empty()) {
 log.merge_patch(extra);
 }

 std::stringstream ss;
-ss << buf << " |";
+ss << level << " [" << function << "] " << message << " |";
 for (const auto& el : log.items())
 {
 const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
@@ -329,7 +329,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 4*batch*(1+4*embedding+context+context*heads),
 )

-partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
+partialOffload = max(
+4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
+4*batch*(2+3*embedding+context+context*heads),
+)
 case "stablelm":
 fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 partialOffload = max(
@@ -12,17 +12,8 @@ import (

 // This algorithm looks for a complete fit to determine if we need to unload other models
 func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
-var estimatedVRAM uint64
-if opts.NumCtx > int(ggml.KV().ContextLength()) {
-slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-opts.NumCtx = int(ggml.KV().ContextLength())
-}
-
-if opts.NumCtx < 4 {
-opts.NumCtx = 4
-}
-
 // Split up the GPUs by type and try them
+var estimatedVRAM uint64
 for _, gpus := range allGpus.ByLibrary() {
 var layerCount int
 layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
107 llm/server.go

@@ -53,6 +53,7 @@ type llmServer struct {
 estimatedTotal uint64 // Total size of model
 totalLayers uint64
 gpuCount int
+loadDuration time.Duration // Record how long it took the model to load

 sem *semaphore.Weighted
 }

@@ -76,15 +77,7 @@ func LoadModel(model string) (*GGML, error) {
 // The gpu list must be a single family.
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 var err error
-if opts.NumCtx > int(ggml.KV().ContextLength()) {
-slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
-}
-
-if opts.NumCtx < 4 {
-opts.NumCtx = 4
-}
-
-cpuRunner := ""
+var cpuRunner string
 var estimatedVRAM uint64
 var estimatedTotal uint64
 var systemMemory uint64

@@ -112,6 +105,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 // disable partial offloading when model is greater than total system memory as this
 // can lead to locking up the system
 opts.NumGPU = 0
+} else if gpus[0].Library != "metal" && layers == 0 {
+// Don't bother loading into the GPU if no layers can fit
+cpuRunner = serverForCpu()
+gpuCount = 0
 } else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
 opts.NumGPU = layers
 }

@@ -156,11 +153,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 "--embedding",
 }
-if envconfig.Debug {
-params = append(params, "--log-format", "json")
-} else {
 params = append(params, "--log-disable")
-}

 if opts.NumGPU >= 0 {
 params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))

@@ -220,7 +214,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 if dir == "" {
 // Shouldn't happen
 finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-slog.Error("sever list inconsistent", "error", finalErr)
+slog.Error("server list inconsistent", "error", finalErr)
 continue
 }

@@ -291,33 +285,28 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 sem: semaphore.NewWeighted(int64(numParallel)),
 totalLayers: ggml.KV().BlockCount() + 1,
 gpuCount: gpuCount,
+done: make(chan error, 1),
 }

-s.cmd.Env = os.Environ()
 s.cmd.Stdout = os.Stdout
 s.cmd.Stderr = s.status

-visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
-pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-// Update or add the path and visible devices variable with our adjusted version
-pathNeeded := true
-devicesNeeded := visibleDevicesEnv != ""
-for i := range s.cmd.Env {
-cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-if strings.EqualFold(cmp[0], pathEnv) {
-s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-pathNeeded = false
-} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-devicesNeeded = false
+if v := strings.Join(libraryPaths, string(filepath.ListSeparator)); v != "" {
+s.cmd.Env = append(s.cmd.Env, pathEnv+"="+v)
+}
+
+if k, v := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv(); k != "" {
+s.cmd.Env = append(s.cmd.Env, k+"="+v)
 }
+
+for _, ev := range os.Environ() {
+if strings.HasPrefix(ev, "CUDA_") ||
+strings.HasPrefix(ev, "ROCM_") ||
+strings.HasPrefix(ev, "HIP_") ||
+strings.HasPrefix(ev, "HSA_") ||
+strings.HasPrefix(ev, "GGML_") {
+s.cmd.Env = append(s.cmd.Env, ev)
 }
-if pathNeeded {
-s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-}
-if devicesNeeded {
-s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
 }

 slog.Info("starting llama server", "cmd", s.cmd.String())

@@ -339,6 +328,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 continue
 }

+// reap subprocess when it exits
+go func() {
+s.done <- s.cmd.Wait()
+}()
+
 return s, nil
 }

@@ -483,13 +477,11 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 start := time.Now()
-// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-ticker := time.NewTicker(50 * time.Millisecond)
-defer ticker.Stop()

 slog.Info("waiting for llama runner to start responding")
 var lastStatus ServerStatus = -1

 for {
 select {
 case <-ctx.Done():

@@ -501,7 +493,8 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 msg = s.status.LastErrMsg
 }
 return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-case <-ticker.C:
+default:
+}
 if time.Now().After(expiresAt) {
 // timeout
 msg := ""

@@ -517,25 +510,22 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 }
 return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 }
-c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
 defer cancel()
-status, err := s.getServerStatus(c)
-if err != nil && lastStatus != status {
-slog.Debug("server not yet available", "error", err)
-lastStatus = status
-continue
+status, _ := s.getServerStatus(ctx)
+if lastStatus != status && status != ServerStatusReady {
+// Only log on status changes
+slog.Info("waiting for server to become available", "status", status.ToString())
 }

 switch status {
-case ServerStatusLoadingModel:
-// TODO - this state never seems to happen with the current server.cpp code (bug?)
-// it doesn't respond to the health endpoint until after the model is loaded
-slog.Debug("loading model")
 case ServerStatusReady:
-slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
+s.loadDuration = time.Since(start)
+slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
 return nil
-}
+default:
+lastStatus = status
+time.Sleep(time.Millisecond * 250)
+continue
 }
 }
 }

@@ -580,6 +570,7 @@ type completion struct {
 Model string `json:"model"`
 Prompt string `json:"prompt"`
 Stop bool `json:"stop"`
+StoppedLimit bool `json:"stopped_limit"`

 Timings struct {
 PredictedN int `json:"predicted_n"`

@@ -598,6 +589,7 @@ type CompletionRequest struct {

 type CompletionResponse struct {
 Content string
+DoneReason string
 Done bool
 PromptEvalCount int
 PromptEvalDuration time.Duration

@@ -739,8 +731,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 }

 if c.Stop {
+doneReason := "stop"
+if c.StoppedLimit {
+doneReason = "length"
+}
+
 fn(CompletionResponse{
 Done: true,
+DoneReason: doneReason,
 PromptEvalCount: c.Timings.PromptN,
 PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 EvalCount: c.Timings.PredictedN,

@@ -935,8 +933,11 @@ func (s *llmServer) Close() error {
 if err := s.cmd.Process.Kill(); err != nil {
 return err
 }
-_ = s.cmd.Wait()
+// if ProcessState is already populated, Wait already completed, no need to wait again
+if s.cmd.ProcessState == nil {
+slog.Debug("waiting for llama server to exit")
+<-s.done
+}

 slog.Debug("llama server stopped")
 }
@@ -109,13 +109,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 Choices: []Choice{{
 Index: 0,
 Message: Message{Role: r.Message.Role, Content: r.Message.Content},
-FinishReason: func(done bool) *string {
-if done {
-reason := "stop"
-return &reason
-}
-return nil
-}(r.Done),
+FinishReason: &r.DoneReason,
 }},
 Usage: Usage{
 // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count

@@ -137,13 +131,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 {
 Index: 0,
 Delta: Message{Role: "assistant", Content: r.Message.Content},
-FinishReason: func(done bool) *string {
-if done {
-reason := "stop"
-return &reason
-}
-return nil
-}(r.Done),
+FinishReason: &r.DoneReason,
 },
 },
 }
@@ -565,7 +565,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 }

 if !envconfig.NoPrune {
-if err := deleteUnusedLayers(nil, unref, false); err != nil {
+if err := deleteUnusedLayers(nil, unref); err != nil {
 return err
 }
 }

@@ -613,7 +613,7 @@ func CopyModel(src, dst model.Name) error {
 return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
 fp, err := GetManifestPath()
 if err != nil {
 return err

@@ -660,14 +660,10 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
 slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err))
 continue
 }
-if !dryRun {
 if err := os.Remove(fp); err != nil {
 slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
 continue
 }
-} else {
-slog.Info(fmt.Sprintf("wanted to remove: %s", fp))
-}
 }

 return nil

@@ -689,14 +685,25 @@ func PruneLayers() error {
 for _, blob := range blobs {
 name := blob.Name()
 name = strings.ReplaceAll(name, "-", ":")
-if strings.HasPrefix(name, "sha256:") {
-deleteMap[name] = struct{}{}
+_, err := GetBlobsPath(name)
+if err != nil {
+if errors.Is(err, ErrInvalidDigestFormat) {
+// remove invalid blobs (e.g. partial downloads)
+if err := os.Remove(filepath.Join(p, blob.Name())); err != nil {
+slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err)
 }
 }
+
+continue
+}
+
+deleteMap[name] = struct{}{}
+}

 slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-err = deleteUnusedLayers(nil, deleteMap, false)
+err = deleteUnusedLayers(nil, deleteMap)
 if err != nil {
 return err
 }

@@ -752,7 +759,7 @@ func DeleteModel(name string) error {
 }
 deleteMap[manifest.Config.Digest] = struct{}{}

-err = deleteUnusedLayers(&mp, deleteMap, false)
+err = deleteUnusedLayers(&mp, deleteMap)
 if err != nil {
 return err
 }

@@ -912,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu

 if noprune == "" {
 fn(api.ProgressResponse{Status: "removing any unused layers"})
-err = deleteUnusedLayers(nil, deleteMap, false)
+err = deleteUnusedLayers(nil, deleteMap)
 if err != nil {
 return err
 }
@@ -154,9 +154,6 @@ func GetBlobsPath(digest string) (string, error) {
 // only accept actual sha256 digests
 pattern := "^sha256[:-][0-9a-fA-F]{64}$"
 re := regexp.MustCompile(pattern)
-if err != nil {
-return "", err
-}

 if digest != "" && !re.MatchString(digest) {
 return "", ErrInvalidDigestFormat
@@ -127,10 +127,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -159,6 +155,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Model: req.Model,
 Done: true,
+DoneReason: "load",
 })
 return
 }

@@ -230,6 +227,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Done: r.Done,
 Response: r.Content,
+DoneReason: r.DoneReason,
 Metrics: api.Metrics{
 PromptEvalCount: r.PromptEvalCount,
 PromptEvalDuration: r.PromptEvalDuration,

@@ -370,10 +368,6 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -560,7 +554,12 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 ctx, cancel := context.WithCancel(c.Request.Context())
 defer cancel()

-if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
+quantization := req.Quantization
+if req.Quantize != "" {
+quantization = req.Quantize
+}
+
+if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil {
 ch <- gin.H{"error": err.Error()}
 }
 }()

@@ -740,20 +739,28 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 }

 n := model.ParseNameFromFilepath(rel)
+if !n.IsValid() {
+slog.Warn("bad manifest filepath", "path", rel)
+return nil
+}
+
 m, err := ParseNamedManifest(n)
 if err != nil {
-return err
+slog.Warn("bad manifest", "name", n, "error", err)
+return nil
 }

 f, err := m.Config.Open()
 if err != nil {
-return err
+slog.Warn("bad manifest config filepath", "name", n, "error", err)
+return nil
 }
 defer f.Close()

 var c ConfigV2
 if err := json.NewDecoder(f).Decode(&c); err != nil {
-return err
+slog.Warn("bad manifest config", "name", n, "error", err)
+return nil
 }

 // tag should never be masked

@@ -1037,7 +1044,8 @@ func Serve(ln net.Listener) error {
 }

 ctx, done := context.WithCancel(context.Background())
-sched := InitScheduler(ctx)
+schedCtx, schedDone := context.WithCancel(ctx)
+sched := InitScheduler(schedCtx)
 s := &Server{addr: ln.Addr(), sched: sched}
 r := s.GenerateRoutes()

@@ -1052,23 +1060,31 @@ func Serve(ln net.Listener) error {
 go func() {
 <-signals
 srvr.Close()
-done()
+schedDone()
 sched.unloadAllRunners()
 gpu.Cleanup()
-os.Exit(0)
+done()
 }()

 if err := llm.Init(); err != nil {
 return fmt.Errorf("unable to initialize llm library %w", err)
 }

-s.sched.Run(ctx)
+s.sched.Run(schedCtx)

 // At startup we retrieve GPU information so we can get log messages before loading a model
 // This will log warnings to the log in case we have problems with detected GPUs
-_ = gpu.GetGPUInfo()
+gpus := gpu.GetGPUInfo()
+gpus.LogDetails()

-return srvr.Serve(ln)
+err = srvr.Serve(ln)
+// If server is closed from the signal handler, wait for the ctx to be done
+// otherwise error out quickly
+if !errors.Is(err, http.ErrServerClosed) {
+return err
+}
+<-ctx.Done()
+return err
 }

 func waitForStream(c *gin.Context, ch chan interface{}) {

@@ -1177,10 +1193,6 @@ func (s *Server) ChatHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -1225,6 +1237,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Model: req.Model,
 Done: true,
+DoneReason: "load",
 Message: api.Message{Role: "assistant"},
 }
 c.JSON(http.StatusOK, resp)

@@ -1262,6 +1275,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Message: api.Message{Role: "assistant", Content: r.Content},
 Done: r.Done,
+DoneReason: r.DoneReason,
 Metrics: api.Metrics{
 PromptEvalCount: r.PromptEvalCount,
 PromptEvalDuration: r.PromptEvalDuration,
@@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 // allocate a large enough kv cache for all parallel requests
+if opts.NumCtx < 4 {
+opts.NumCtx = 4
+}
+
 opts.NumCtx = opts.NumCtx * envconfig.NumParallel

 req := &LlmRequest{

@@ -265,11 +269,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {

 s.loadedMu.Lock()
 slog.Debug("got lock to unload", "model", runner.model)
+finished := runner.waitForVRAMRecovery()
 runner.unload()
 delete(s.loaded, runner.model)
 s.loadedMu.Unlock()
 slog.Debug("runner released", "model", runner.model)
 runner.refMu.Unlock()
+
+<-finished
 slog.Debug("sending an unloaded event", "model", runner.model)
 s.unloadedCh <- struct{}{}
 }

@@ -465,6 +472,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 return false
 }

+// Free memory reporting on GPUs can lag for a while even after the runner
+// exits, so we have to keep checking until we see the available memory recover,
+// otherwise subsequent model loads will get far less layers loaded or worse
+// case, may completely fall back to CPU mode.
+// This routine must be called before the runner unloads so it can establish
+// a before and after GPU memory allocation. The returned channel
+// will be notified when we're done waiting, or have timed out and should
+// proceed anyway
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+finished := make(chan interface{}, 1)
+
+// CPU or Metal don't need checking, so no waiting required
+if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+finished <- struct{}{}
+return finished
+}
+start := time.Now()
+
+// Establish a baseline before we unload
+gpusBefore := gpu.GetGPUInfo()
+var totalMemoryBefore, freeMemoryBefore uint64
+for _, gpu := range gpusBefore {
+totalMemoryBefore += gpu.TotalMemory
+freeMemoryBefore += gpu.FreeMemory
+}
+go func() {
+expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+ticker := time.NewTicker(250 * time.Millisecond)
+defer ticker.Stop()
+for {
+<-ticker.C
+if time.Now().After(expiresAt) {
+slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+finished <- struct{}{}
+}
+
+// Query GPUs, look for free to go back up
+gpusNow := gpu.GetGPUInfo()
+var totalMemoryNow, freeMemoryNow uint64
+for _, gpu := range gpusNow {
+totalMemoryNow += gpu.TotalMemory
+freeMemoryNow += gpu.FreeMemory
+}
+// If we're within ~80% of the estimated memory usage recovered, bail out
+if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+finished <- struct{}{}
+return
+}
+}
+}()
+return finished
+
+}
+
 type ByDuration []*runnerRef

 func (a ByDuration) Len() int { return len(a) }

@@ -505,9 +567,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 // - try subsets of GPUs instead of just falling back to 1 or all in a family

 // Now try all the GPUs
-if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-return gl
+if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
+return sgl
 }
 }
 return nil
@@ -1,6 +1,6 @@
 ###download

-Make preparation as guide on (Development) [https://github.com/ollama/ollama/blob/main/docs/development.md]
+Make preparation as guide on [Development] (https://github.com/ollama/ollama/blob/main/docs/development.md)
 run
 $env:CGO_ENABLED="1"
 go generate ./...
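For orientation, a minimal end-to-end Windows build implied by this guide would look roughly like the sketch below; the final go build step is not shown in this diff and is assumed from the upstream development guide, so treat it as a hint rather than part of the committed docs:

# PowerShell, run from the repository root once the toolchain from the development guide is installed
$env:CGO_ENABLED="1"
go generate ./...   # generates and builds the bundled llama.cpp runners
go build .          # produces the ollama binary in the repository root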