Merge branch 'ollama:main' into main

2025-12-21 22:33:56 +00:00 · 2025-08-13 12:36:50 +08:00
parent 9231379bce a343ae53a4
commit d4af9f04f9
21 changed files with 729 additions and 249 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -66,6 +66,7 @@ Examples:

      llm/backend/mlx: support the llama architecture
      CONTRIBUTING: provide clarity on good commit messages, and bad
+      docs: simplify manual installation with shorter curl commands

 Bad Examples:

--- a/api/types.go
+++ b/api/types.go
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
 	return ok
 }

-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
 	}
 }

-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
 	if t == nil || t.Value == nil {
 		return ""
 	}
--- a/discover/types.go
+++ b/discover/types.go
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
-		supportsFA := gpu.Library == "metal" ||
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Library == "metal" ||
 			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
 			gpu.Library == "rocm"

--- a/docs/api.md
+++ b/docs/api.md
@@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co

 ```json
 {
-  "status": "downloading digestname",
+  "status": "pulling digestname",
  "digest": "digestname",
  "total": 2142590208,
  "completed": 241970
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens. 
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.

-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

+Setting the context length higher may cause the model to not be able to fit onto the GPU, which makes the model run more slowly.
+
 ## How can I tell if my model was loaded onto the GPU?

 Use the `ollama ps` command to see what models are currently loaded into memory.
@@ -57,8 +59,8 @@ ollama ps
 > **Output**:
 >
 > ```
-> NAME      	ID          	SIZE 	PROCESSOR	UNTIL
-> llama3:70b	bcfb190ca3a7	42 GB	100% GPU 	4 minutes from now
+> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
+> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
 > ```

 The `Processor` column will show which memory the model was loaded in to:
@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```

-## Does Ollama send my prompts and answers back to ollama.com?
+## Does Ollama send my prompts and responses back to ollama.com?

-No. Ollama runs locally, and conversation data does not leave your machine.
+If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
+
+If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.

 ## How can I expose Ollama on my network?

@@ -345,4 +349,4 @@ Ollama for Windows and macOS register as a login item during installation.  You
 - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove

 **MacOS Ventura (v13) and later**
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -28,7 +28,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-const (
+var (
 	smol = "llama3.2:1b"
 )

@@ -37,6 +37,7 @@ var (

 	// Note: add newer models at the top of the list to test them first
 	ollamaEngineChatModels = []string{
+		"gpt-oss:20b",
 		"gemma3n:e2b",
 		"mistral-small3.2:latest",
 		"deepseek-r1:1.5b",
@@ -126,6 +127,7 @@ var (
 		"gemma3n",
 		"glm4",
 		"goliath",
+		"gpt-oss:20b",
 		"granite-code",
 		"granite3-dense",
 		"granite3-guardian",
@@ -255,8 +257,13 @@ var (
 	}
 )

-func Init() {
+func init() {
 	lifecycle.InitLogging()
+	custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
+	if custom != "" {
+		slog.Info("setting smol test model to " + custom)
+		smol = custom
+	}
 }

 func FindPort() string {
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -7,12 +7,12 @@ This enables matching up devices and information reported by the backend
 with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
 ---
 ggml/include/ggml-backend.h      |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu  | 39 ++++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 67 +++++++++++++++++++++++++++++---
 ggml/src/ggml-metal/ggml-metal.m |  1 +
- 3 files changed, 41 insertions(+)
+ 3 files changed, 63 insertions(+), 6 deletions(-)

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..48839339 100644
+index 74e467163..48839339d 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
@@ -24,10 +24,93 @@ index 74e46716..48839339 100644
         size_t memory_total;
         enum ggml_backend_dev_type type;
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..d6960174 100644
+index cb0d8528d..1492368de 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
+@@ -173,6 +173,51 @@ static int ggml_cuda_parse_id(char devName[]) {
+ }
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ 
+static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) {
+    char id[64];
+
+    #if !defined(GGML_USE_HIP)
+    snprintf(id, sizeof(id),
+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        (unsigned char)prop.uuid.bytes[0],
+        (unsigned char)prop.uuid.bytes[1],
+        (unsigned char)prop.uuid.bytes[2],
+        (unsigned char)prop.uuid.bytes[3],
+        (unsigned char)prop.uuid.bytes[4],
+        (unsigned char)prop.uuid.bytes[5],
+        (unsigned char)prop.uuid.bytes[6],
+        (unsigned char)prop.uuid.bytes[7],
+        (unsigned char)prop.uuid.bytes[8],
+        (unsigned char)prop.uuid.bytes[9],
+        (unsigned char)prop.uuid.bytes[10],
+        (unsigned char)prop.uuid.bytes[11],
+        (unsigned char)prop.uuid.bytes[12],
+        (unsigned char)prop.uuid.bytes[13],
+        (unsigned char)prop.uuid.bytes[14],
+        (unsigned char)prop.uuid.bytes[15]
+        );
+    #else
+    #ifdef _WIN32
+        snprintf(id, sizeof(id), "%d", device_num);
+    #else
+    try {
+        std::string uuid = std::string(prop.uuid.bytes, 16);
+
+        size_t pos = 0;
+        unsigned long long v = stoull(uuid, &pos, 16);
+        if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-'))
+            throw std::invalid_argument("invalid uuid");
+
+        snprintf(id, sizeof(id), "GPU-%016llx", v);
+    } catch (const std::exception &e) {
+        snprintf(id, sizeof(id), "%d", device_num);
+    }
+    #endif
+    #endif
+
+    return id;
+}
+
+ static ggml_cuda_device_info ggml_cuda_init() {
+ #ifdef __HIP_PLATFORM_AMD__
+     // Workaround for a rocBLAS bug when using multiple graphics cards:
+@@ -261,22 +306,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
+                 info.devices[id].cc += prop.minor * 0x10;
+             }
+         }
+-        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
+                       id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
+-                      device_vmm ? "yes" : "no", prop.warpSize);
+                      device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
+ #elif defined(GGML_USE_MUSA)
+         // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
+         info.devices[id].warp_size = 32;
+         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+         info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
+         info.devices[id].cc += prop.minor * 0x10;
+-        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+-                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+                        ggml_cuda_parse_uuid(prop, id).c_str());
+ #else
+         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+         info.devices[id].cc = 100*prop.major + 10*prop.minor;
+-        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+-                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+                        ggml_cuda_parse_uuid(prop, id).c_str());
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+     }
+ 
+@@ -2884,6 +2931,7 @@ struct ggml_backend_cuda_device_context {
     int device;
     std::string name;
     std::string description;
@@ -35,7 +118,7 @@ index cb0d8528..d6960174 100644
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
+@@ -2896,6 +2944,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
     return ctx->description.c_str();
 }
 
@@ -47,7 +130,7 @@ index cb0d8528..d6960174 100644
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     ggml_cuda_set_device(ctx->device);
-@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -2910,6 +2963,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
     props->name        = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
@@ -55,47 +138,16 @@ index cb0d8528..d6960174 100644
     props->type        = ggml_backend_cuda_device_get_type(dev);
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
-@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3457,6 +3511,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+                 cudaDeviceProp prop;
                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                 dev_ctx->description = prop.name;
+                dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
 
-+                #if !defined(GGML_USE_HIP)
-+                char id[64];
-+                snprintf(id, sizeof(id),
-+                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-+                    (unsigned char)prop.uuid.bytes[0],
-+                    (unsigned char)prop.uuid.bytes[1],
-+                    (unsigned char)prop.uuid.bytes[2],
-+                    (unsigned char)prop.uuid.bytes[3],
-+                    (unsigned char)prop.uuid.bytes[4],
-+                    (unsigned char)prop.uuid.bytes[5],
-+                    (unsigned char)prop.uuid.bytes[6],
-+                    (unsigned char)prop.uuid.bytes[7],
-+                    (unsigned char)prop.uuid.bytes[8],
-+                    (unsigned char)prop.uuid.bytes[9],
-+                    (unsigned char)prop.uuid.bytes[10],
-+                    (unsigned char)prop.uuid.bytes[11],
-+                    (unsigned char)prop.uuid.bytes[12],
-+                    (unsigned char)prop.uuid.bytes[13],
-+                    (unsigned char)prop.uuid.bytes[14],
-+                    (unsigned char)prop.uuid.bytes[15]
-+                  );
-+                dev_ctx->id = id;
-+                #else
-+                #ifdef _WIN32
-+                char id[16];
-+                snprintf(id, sizeof(id), "%d", i);
-+                dev_ctx->id = id;
-+                #else
-+                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
-+                #endif
-+                #endif
-+
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface   = */ ggml_backend_cuda_device_interface,
-                     /* .reg     = */ &reg,
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..a9eeebc6 100644
+index 1b56f858c..a9eeebc6a 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
--- a/llama/patches/0026-ggml-No-alloc-mode.patch
+++ b/llama/patches/0026-ggml-No-alloc-mode.patch
@@ -0,0 +1,99 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Wed, 23 Jul 2025 11:58:49 -0700
+Subject: [PATCH] ggml: No-alloc mode
+
+Callers can set a backend buffer type to be no-alloc, meaning that
+it does not allocate memory for tensors or operations. This can
+be used for calculating memory requirements. Tensors and graphs
+must be recreated with no-alloc set to false before loading data.
+
+Defaults to false for newly created backend buffer types.
+---
+ ggml/include/ggml-backend.h  |  1 +
+ ggml/src/ggml-backend-impl.h |  2 ++
+ ggml/src/ggml-backend.cpp    | 19 ++++++++++++++++++-
+ 3 files changed, 21 insertions(+), 1 deletion(-)
+
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index 48839339..3903c3cb 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -35,6 +35,7 @@ extern "C" {
+     //
+ 
+     GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API void                  ggml_backend_buft_set_alloc     (ggml_backend_buffer_type_t buft, bool alloc);
+     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
+index c36c12d6..81749a5a 100644
+--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
+@@ -32,6 +32,7 @@ extern "C" {
+         struct ggml_backend_buffer_type_i  iface;
+         ggml_backend_dev_t device;
+         void * context;
+        bool no_alloc;
+     };
+ 
+     //
+@@ -63,6 +64,7 @@ extern "C" {
+         void * context;
+         size_t size;
+         enum ggml_backend_buffer_usage usage;
+        bool no_alloc;
+     };
+ 
+     GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
+index be335e8c..84928bc3 100644
+--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+     return buft->iface.get_name(buft);
+ }
+ 
+void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
+    buft->no_alloc = !alloc;
+}
+
+ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+     if (size == 0) {
+         // return a dummy buffer for zero-sized allocations
+         return ggml_backend_buffer_init(buft, {}, NULL, 0);
+     }
+ 
+    if (buft->no_alloc) {
+        ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        buf->no_alloc = true;
+        return buf;
+    }
+
+     return buft->iface.alloc_buffer(buft, size);
+ }
+ 
+@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
+         /* .buft      = */ buft,
+         /* .context   = */ context,
+         /* .size      = */ size,
+-        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
+        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+        /* .no_alloc  = */ false
+     };
+ 
+     return buffer;
+@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+         return NULL;
+     }
+ 
+    // If we aren't allocating memory, return a placeholder non-NULL pointer
+    // that meets alignment requirements
+    if (buffer->no_alloc) {
+        return (void *)ggml_backend_buffer_get_alignment(buffer);
+    }
+
+     void * base = buffer->iface.get_base(buffer);
+ 
+     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -15,6 +15,9 @@ import (
 )

 type Backend interface {
+	// Close frees all memory associated with this backend
+	Close()
+
 	Load(ctx context.Context, progress func(float32)) error

 	// BackendMemory returns the memory allocations that were made for this model
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -19,6 +19,7 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"unicode"
 	"unsafe"
@@ -33,15 +34,33 @@ import (
 	"golang.org/x/sync/errgroup"
 )

-func devices() []*C.struct_ggml_backend_device {
-	ggml.OnceLoad()
-	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
-	for i := range ds {
-		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
-	}
+var (
+	cpus, accels, gpus []C.ggml_backend_dev_t
+	backends           map[C.ggml_backend_dev_t]C.ggml_backend_t
+)

-	return ds
-}
+var initDevices = sync.OnceFunc(func() {
+	ggml.OnceLoad()
+
+	backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
+	for i := range C.ggml_backend_dev_count() {
+		d := C.ggml_backend_dev_get(i)
+
+		switch C.ggml_backend_dev_type(d) {
+		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+			if len(cpus) == 0 {
+				// only the first cpu device should be used
+				cpus = append(cpus, d)
+			}
+		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+			accels = append(accels, d)
+		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+			gpus = append(gpus, d)
+		}
+
+		backends[d] = C.ggml_backend_dev_init(d, nil)
+	}
+})

 type Backend struct {
 	// modelPath is the location of the model data
@@ -53,28 +72,31 @@ type Backend struct {
 	// to the name that is used by the model definition
 	tensorLoadTargets map[string][]string

-	sched         *C.struct_ggml_backend_sched
-	schedBackends []*C.struct_ggml_backend
-	schedBufts    []*C.struct_ggml_backend_buffer_type
+	sched         C.ggml_backend_sched_t
+	schedBackends []C.ggml_backend_t
+	schedBufts    []C.ggml_backend_buffer_type_t

 	tensors map[string]*C.struct_ggml_tensor

 	// input is the backend used for inputs
-	input *C.struct_ggml_backend_buffer_type
+	input C.ggml_backend_buffer_type_t

 	// layers is the backend used for repeating layers
-	layers map[int]*C.struct_ggml_backend_buffer_type
+	layers map[int]C.ggml_backend_buffer_type_t

 	// requiredMemory is the cumulative memory allocations needed by the backend
 	requiredMemory *ml.BackendMemory

 	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
-	btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
+	btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory

 	flashAttention bool

 	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
 	maxGraphNodes int
+
+	// weightBuffers are the GGML contexts and buffers for allocating weights
+	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
 }

 func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
@@ -99,27 +121,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		"num_key_values", len(meta.KV()),
 	)

+	initDevices()
+
 	var requiredMemory ml.BackendMemory
-	btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
+	btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)

 	type deviceBufferType struct {
-		d   *C.struct_ggml_backend_device
-		bts []*C.struct_ggml_backend_buffer_type
-	}
-
-	var cpus, accels, gpus []*C.struct_ggml_backend_device
-	for _, d := range devices() {
-		switch C.ggml_backend_dev_type(d) {
-		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-			if len(cpus) == 0 {
-				// only the first cpu device should be used
-				cpus = append(cpus, d)
-			}
-		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-			accels = append(accels, d)
-		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-			gpus = append(gpus, d)
-		}
+		d   C.ggml_backend_dev_t
+		bts []C.ggml_backend_buffer_type_t
 	}

 	blocks := int(meta.KV().BlockCount())
@@ -149,7 +158,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		bt := C.ggml_backend_dev_buffer_type(d)
 		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
 			d:   d,
-			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
+			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
 		})
 		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
@@ -235,16 +244,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	targets := make(map[string][]string)

 	// contexts are shared by tensors of the same buffer type
-	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
-	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
+	ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
+	createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
-				// slog.Info("XXX before ggml_init")
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
 					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
 					no_alloc: true,
 				})
-				// slog.Info("XXX after ggml_init")
 			}

 			targets[t.source.Name] = append(targets[t.source.Name], t.target)
@@ -332,7 +339,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}

 	// allocate buffers for each context
-	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
+	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
 	for bt, c := range ctxs {
 		if C.ggml_get_first_tensor(c) == nil {
 			continue
@@ -350,6 +357,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}

 		if b == nil {
+			for _, b := range bbs {
+				C.ggml_backend_buffer_free(b)
+			}
+
+			for _, ctx := range ctxs {
+				C.ggml_free(ctx)
+			}
+
 			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
 		}

@@ -390,13 +405,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}

 	// map devices to backend buffer types so new tensors can be assigned to the correct device
-	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
+	deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)

 	// create backends and buffer types used for the compute graph scheduler
-	var schedBackends []*C.struct_ggml_backend
-	var schedBufts []*C.struct_ggml_backend_buffer_type
+	var schedBackends []C.ggml_backend_t
+	var schedBufts []C.ggml_backend_buffer_type_t
 	for _, d := range append(gpus, append(accels, cpus...)...) {
-		b := C.ggml_backend_dev_init(d, nil)
+		b := backends[d]
 		bt := C.ggml_backend_get_default_buffer_type(b)

 		deviceBufferTypes[d] = bt
@@ -428,8 +443,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		schedBackends: schedBackends,
 		schedBufts:    schedBufts,
 		input:         deviceBufferTypes[input.d],
-		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
-			m := make(map[int]*C.struct_ggml_backend_buffer_type)
+		layers: func() map[int]C.ggml_backend_buffer_type_t {
+			m := make(map[int]C.ggml_backend_buffer_type_t)
 			for i, layer := range layers {
 				m[i] = deviceBufferTypes[layer.d]
 			}
@@ -438,6 +453,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory: &requiredMemory,
 		btDeviceMemory: btDeviceMemory,
 		maxGraphNodes:  maxGraphNodes,
+		weightBuffers:  bbs,
 	}, nil
 }

@@ -445,6 +461,19 @@ func init() {
 	ml.RegisterBackend("ggml", New)
 }

+func (b *Backend) Close() {
+	if b == nil {
+		return
+	}
+
+	for ctx, b := range b.weightBuffers {
+		C.ggml_backend_buffer_free(b)
+		C.ggml_free(ctx)
+	}
+
+	C.ggml_backend_sched_free(b.sched)
+}
+
 func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 	var doneBytes atomic.Uint64
 	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
@@ -541,10 +570,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
 	}

-	var allocatedBuffers []*C.struct_ggml_backend_buffer
+	var allocatedBuffers []C.ggml_backend_buffer_t

-	// slog.Info("XXX before ggml_init")
-	// defer slog.Info("XXX after ggml_init")
 	return &Context{
 		b:             b,
 		maxGraphNodes: n,
@@ -572,11 +599,11 @@ type Context struct {
 	graph *C.struct_ggml_cgraph

 	// buft is the buffer type used for new tensors
-	buft *C.struct_ggml_backend_buffer_type
+	buft C.ggml_backend_buffer_type_t

 	// allocatedBuffers are buffers for tensors that we have allocated in this context
 	// so that we can free them when we close the context
-	allocatedBuffers *[]*C.struct_ggml_backend_buffer
+	allocatedBuffers *[]C.ggml_backend_buffer_t

 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
@@ -1407,55 +1434,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {

 	return t
 }
-
-// TODO - DRY this out with New if possible
-func newTestBackend(size int) *Backend {
-	var cpus []*C.struct_ggml_backend_device
-	for _, d := range devices() {
-		switch C.ggml_backend_dev_type(d) {
-		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-			if len(cpus) == 0 {
-				// only the first cpu device should be used
-				cpus = append(cpus, d)
-				break
-			}
-		}
-	}
-	var schedBackends []*C.struct_ggml_backend
-	var schedBufts []*C.struct_ggml_backend_buffer_type
-	b := C.ggml_backend_dev_init(cpus[0], nil)
-	bt := C.ggml_backend_get_default_buffer_type(b)
-	C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
-	// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING
-	schedBackends = append(schedBackends, b)
-	schedBufts = append(schedBufts, bt)
-	return &Backend{
-		meta: nil,
-		sched: C.ggml_backend_sched_new(
-			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
-			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
-			C.int(len(schedBackends)),
-			C.size_t(max(8192, size)),
-			false,
-			false,
-		),
-		input:         bt,
-		maxGraphNodes: max(8192, size),
-		schedBackends: schedBackends,
-		schedBufts:    schedBufts,
-	}
-}
-
-func newTestContext(b *Backend, n int) *Context {
-	n = max(8192, n)
-	// slog.Info("XXX before ggml_init")
-	// defer slog.Info("XXX after ggml_init")
-	return &Context{
-		b:             b,
-		maxGraphNodes: n,
-		ctx: C.ggml_init(C.struct_ggml_init_params{
-			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
-			no_alloc: true,
-		}),
-	}
-}
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -35,6 +35,7 @@ extern "C" {
    //

    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API void                  ggml_backend_buft_set_alloc     (ggml_backend_buffer_type_t buft, bool alloc);
    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
--- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h
@@ -32,6 +32,7 @@ extern "C" {
        struct ggml_backend_buffer_type_i  iface;
        ggml_backend_dev_t device;
        void * context;
+        bool no_alloc;
    };

    //
@@ -63,6 +64,7 @@ extern "C" {
        void * context;
        size_t size;
        enum ggml_backend_buffer_usage usage;
+        bool no_alloc;
    };

    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name(buft);
 }

+void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
+    buft->no_alloc = !alloc;
+}
+
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    if (size == 0) {
        // return a dummy buffer for zero-sized allocations
        return ggml_backend_buffer_init(buft, {}, NULL, 0);
    }

+    if (buft->no_alloc) {
+        ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        buf->no_alloc = true;
+        return buf;
+    }
+
    return buft->iface.alloc_buffer(buft, size);
 }

@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
        /* .buft      = */ buft,
        /* .context   = */ context,
        /* .size      = */ size,
-        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
+        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+        /* .no_alloc  = */ false
    };

    return buffer;
@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
        return NULL;
    }

+    // If we aren't allocating memory, return a placeholder non-NULL pointer
+    // that meets alignment requirements
+    if (buffer->no_alloc) {
+        return (void *)ggml_backend_buffer_get_alignment(buffer);
+    }
+
    void * base = buffer->iface.get_base(buffer);

    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -175,6 +175,51 @@ static int ggml_cuda_parse_id(char devName[]) {
 }
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

+static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { // builds the device ID string; device_num is the fallback ID
+    char id[64];
+
+    #if !defined(GGML_USE_HIP) // non-HIP builds: print the 16 UUID bytes as hex in 8-4-4-4-12 groups
+    snprintf(id, sizeof(id),
+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        (unsigned char)prop.uuid.bytes[0],
+        (unsigned char)prop.uuid.bytes[1],
+        (unsigned char)prop.uuid.bytes[2],
+        (unsigned char)prop.uuid.bytes[3],
+        (unsigned char)prop.uuid.bytes[4],
+        (unsigned char)prop.uuid.bytes[5],
+        (unsigned char)prop.uuid.bytes[6],
+        (unsigned char)prop.uuid.bytes[7],
+        (unsigned char)prop.uuid.bytes[8],
+        (unsigned char)prop.uuid.bytes[9],
+        (unsigned char)prop.uuid.bytes[10],
+        (unsigned char)prop.uuid.bytes[11],
+        (unsigned char)prop.uuid.bytes[12],
+        (unsigned char)prop.uuid.bytes[13],
+        (unsigned char)prop.uuid.bytes[14],
+        (unsigned char)prop.uuid.bytes[15]
+        );
+    #else // GGML_USE_HIP
+    #ifdef _WIN32 // HIP on Windows: the ID is just the device index
+        snprintf(id, sizeof(id), "%d", device_num);
+    #else // HIP elsewhere: interpret the 16 UUID bytes as hex text
+    try {
+        std::string uuid = std::string(prop.uuid.bytes, 16);
+
+        size_t pos = 0;
+        unsigned long long v = stoull(uuid, &pos, 16);
+        if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) // reject zero, partially parsed, or negative-signed input
+            throw std::invalid_argument("invalid uuid");
+
+        snprintf(id, sizeof(id), "GPU-%016llx", v);
+    } catch (const std::exception &e) { // stoull threw or the value was rejected above: fall back to the device index
+        snprintf(id, sizeof(id), "%d", device_num);
+    }
+    #endif // _WIN32
+    #endif // !defined(GGML_USE_HIP)
+
+    return id;
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
    // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -263,22 +308,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
                info.devices[id].cc += prop.minor * 0x10;
            }
        }
-        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
                      id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
-                      device_vmm ? "yes" : "no", prop.warpSize);
+                      device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
 #elif defined(GGML_USE_MUSA)
        // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
        info.devices[id].warp_size = 32;
        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
        info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
        info.devices[id].cc += prop.minor * 0x10;
-        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
-                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+                        ggml_cuda_parse_uuid(prop, id).c_str());
 #else
        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
        info.devices[id].cc = 100*prop.major + 10*prop.minor;
-        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
-                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+                        ggml_cuda_parse_uuid(prop, id).c_str());
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    }

@@ -3475,38 +3522,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                cudaDeviceProp prop;
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;
-
-                #if !defined(GGML_USE_HIP)
-                char id[64];
-                snprintf(id, sizeof(id),
-                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-                    (unsigned char)prop.uuid.bytes[0],
-                    (unsigned char)prop.uuid.bytes[1],
-                    (unsigned char)prop.uuid.bytes[2],
-                    (unsigned char)prop.uuid.bytes[3],
-                    (unsigned char)prop.uuid.bytes[4],
-                    (unsigned char)prop.uuid.bytes[5],
-                    (unsigned char)prop.uuid.bytes[6],
-                    (unsigned char)prop.uuid.bytes[7],
-                    (unsigned char)prop.uuid.bytes[8],
-                    (unsigned char)prop.uuid.bytes[9],
-                    (unsigned char)prop.uuid.bytes[10],
-                    (unsigned char)prop.uuid.bytes[11],
-                    (unsigned char)prop.uuid.bytes[12],
-                    (unsigned char)prop.uuid.bytes[13],
-                    (unsigned char)prop.uuid.bytes[14],
-                    (unsigned char)prop.uuid.bytes[15]
-                  );
-                dev_ctx->id = id;
-                #else
-                #ifdef _WIN32
-                char id[16];
-                snprintf(id, sizeof(id), "%d", i);
-                dev_ctx->id = id;
-                #else
-                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
-                #endif
-                #endif
+                dev_ctx->id = ggml_cuda_parse_uuid(prop, i);

                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -34,10 +34,12 @@ type ErrorResponse struct {
 }

 type Message struct {
-	Role      string     `json:"role"`
-	Content   any        `json:"content"`
-	Reasoning string     `json:"reasoning,omitempty"`
-	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+	Role       string     `json:"role"`
+	Content    any        `json:"content"`
+	Reasoning  string     `json:"reasoning,omitempty"`
+	ToolCalls  []ToolCall `json:"tool_calls,omitempty"`
+	Name       string     `json:"name,omitempty"`         // for role "tool": the tool name; takes precedence over a ToolCallID lookup
+	ToolCallID string     `json:"tool_call_id,omitempty"` // for role "tool": resolved to a tool name from the request's tool calls when Name is empty
 }

 type Choice struct {
@@ -101,6 +103,7 @@ type ChatCompletionRequest struct {
 	ResponseFormat   *ResponseFormat `json:"response_format"`
 	Tools            []api.Tool      `json:"tools"`
 	Reasoning        *Reasoning      `json:"reasoning,omitempty"`
+	ReasoningEffort  *string         `json:"reasoning_effort,omitempty"`
 }

 type ChatCompletion struct {
@@ -401,9 +404,20 @@ func toModel(r api.ShowResponse, m string) Model {
 func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	var messages []api.Message
 	for _, msg := range r.Messages {
+		toolName := ""
+		if strings.ToLower(msg.Role) == "tool" {
+			toolName = msg.Name
+			if toolName == "" && msg.ToolCallID != "" {
+				toolName = nameFromToolCallID(r.Messages, msg.ToolCallID)
+			}
+		}
 		switch content := msg.Content.(type) {
 		case string:
-			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning})
+			toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
+			if err != nil {
+				return nil, err
+			}
+			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName})
 		case []any:
 			for _, c := range content {
 				data, ok := c.(map[string]any)
@@ -454,7 +468,21 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					return nil, errors.New("invalid message format")
 				}
 			}
+			// since we might have added multiple messages above, if we have tool
+			// calls we'll attach them to the last message
+			if len(messages) > 0 && len(msg.ToolCalls) > 0 {
+				toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
+				if err != nil {
+					return nil, err
+				}
+				messages[len(messages)-1].ToolCalls = toolCalls
+				if toolName != "" {
+					messages[len(messages)-1].ToolName = toolName
+				}
+				messages[len(messages)-1].Thinking = msg.Reasoning
+			}
 		default:
+			// content is only optional if tool calls are present
 			if msg.ToolCalls == nil {
 				return nil, fmt.Errorf("invalid message content type: %T", content)
 			}
@@ -467,7 +495,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					return nil, errors.New("invalid tool call arguments")
 				}
 			}
-			messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
+			messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls})
 		}
 	}

@@ -514,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		options["top_p"] = 1.0
 	}

-	if r.Reasoning != nil {
-		options["reasoning"] = *r.Reasoning.Effort
-	}
-
 	var format json.RawMessage
 	if r.ResponseFormat != nil {
 		switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -533,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {

 	var think *api.ThinkValue
 	if r.Reasoning != nil {
+		options["reasoning"] = *r.Reasoning.Effort
 		think = &api.ThinkValue{
 			Value: *r.Reasoning.Effort,
 		}
+	} else if r.ReasoningEffort != nil {
+		options["reasoning"] = *r.ReasoningEffort
+		think = &api.ThinkValue{
+			Value: *r.ReasoningEffort,
+		}
 	}

 	return &api.ChatRequest{
@@ -549,6 +579,33 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}, nil
 }

+// nameFromToolCallID finds the tool call with ID toolCallID and returns its function name, or "" if none matches.
+// Messages are scanned last to first, so when IDs are duplicated the latest message wins.
+func nameFromToolCallID(messages []Message, toolCallID string) string {
+	for i := len(messages) - 1; i >= 0; i-- {
+		for _, call := range messages[i].ToolCalls {
+			if call.ID != toolCallID {
+				continue
+			}
+			return call.Function.Name
+		}
+	}
+	return ""
+}
+
+// fromCompletionToolCall converts toolCalls to api.ToolCall values, copying each function name and
+// JSON-decoding its arguments; any undecodable arguments fail the whole conversion.
+func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
+	converted := make([]api.ToolCall, len(toolCalls))
+	for i, call := range toolCalls {
+		converted[i].Function.Name = call.Function.Name
+		if err := json.Unmarshal([]byte(call.Function.Arguments), &converted[i].Function.Arguments); err != nil {
+			return nil, errors.New("invalid tool call arguments")
+		}
+	}
+	return converted, nil
+}
+
 func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 	options := make(map[string]any)

--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -235,6 +235,210 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
+		{
+			name: "chat handler with tools and content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role:    "assistant",
+						Content: "Let's see what the weather is like in Paris",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "chat handler with tools and empty content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "chat handler with tools and thinking content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role:     "assistant",
+						Thinking: "Let's see what the weather is like in Paris",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "tool response with call ID",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
+					{"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+					{
+						Role:     "tool",
+						Content:  "The weather in Paris is 20 degrees Celsius",
+						ToolName: "get_current_weather",
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "tool response with name",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
+					{"role": "tool", "name": "get_current_weather", "content": "The weather in Paris is 20 degrees Celsius"}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+					{
+						Role:     "tool",
+						Content:  "The weather in Paris is 20 degrees Celsius",
+						ToolName: "get_current_weather",
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
 		{
 			name: "chat handler with streaming tools",
 			body: `{
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -70,6 +70,10 @@ func kvCacheTypeFromStr(s string) ml.DType {
 }

 func (c *InputCache) Close() {
+	if c == nil { // nil receiver: nothing to release, so Close is a no-op
+		return
+	}
+
 	c.cache.Close()
 }

--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -877,6 +877,15 @@ func (s *Server) load(
 ) {
 	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
 	if err != nil {
+		var noMem ml.ErrNoMem
+		if errors.As(err, &noMem) {
+			// We can't yet handle this but in the future we will
+			s.cache.Close()
+			if s.model != nil {
+				s.model.Backend().Close()
+			}
+		}
+
 		panic(err)
 	}

--- a/server/prompt.go
+++ b/server/prompt.go
@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		thinkVal := false
 		thinkLevel := ""
 		if think != nil {
-			thinkVal = think.AsBool()
-			thinkLevel = think.AsString()
+			thinkVal = think.Bool()
+			thinkLevel = think.String()
 		}
 		var b bytes.Buffer
 		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	thinkVal := false
 	thinkLevel := ""
 	if think != nil {
-		thinkVal = think.AsBool()
-		thinkLevel = think.AsString()
+		thinkVal = think.Bool()
+		thinkLevel = think.String()
 	}
 	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
 		return "", nil, err
--- a/server/routes.go
+++ b/server/routes.go
@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {

 var useClient2 = experimentEnabled("client2")

+// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
+// reduced context length on some models
+var lowVRAMThreshold uint64 = 20 * format.GibiByte
+
 var mode string = gin.DebugMode

 type Server struct {
-	addr  net.Addr
-	sched *Scheduler
+	addr    net.Addr
+	sched   *Scheduler
+	lowVRAM bool // set in Serve when the summed total VRAM is below lowVRAMThreshold
 }

 func init() {
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}

-	// This model requires a minimum context to function effectively
-	if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+	// This model is much more capable with a larger context, so set that
+	// unless it would penalize performance too much
+	if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}

@@ -198,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 	// Validate Think value: string values currently only allowed for gptoss models
 	if req.Think != nil && req.Think.IsString() && !useHarmony {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
 		return
 	}

@@ -206,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
-	if req.Think != nil && req.Think.AsBool() {
+	if req.Think != nil && req.Think.Bool() {
 		caps = append(caps, model.CapabilityThinking)
 		// TODO(drifkin): consider adding a warning if it's false and the model
 		// doesn't support thinking. It's not strictly required, but it can be a
@@ -281,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}

-		values.Think = req.Think != nil && req.Think.AsBool()
+		values.Think = req.Think != nil && req.Think.Bool()
 		values.ThinkLevel = ""
 		if req.Think != nil {
-			values.ThinkLevel = req.Think.AsString()
+			values.ThinkLevel = req.Think.String()
 		}
 		values.IsThinkSet = req.Think != nil

@@ -310,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	var thinkingState *thinking.Parser
 	if !useHarmony {
 		openingTag, closingTag := thinking.InferTags(m.Template.Template)
-		if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+		if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
 			thinkingState = &thinking.Parser{
 				OpeningTag: openingTag,
 				ClosingTag: closingTag,
@@ -364,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 						*toolName = strings.TrimPrefix(*toolName, "functions.")
 						var args api.ToolCallFunctionArguments
 						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-							ch <- gin.H{"error parsing tool call": err.Error()}
+							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+							ch <- gin.H{"error": errStr}
 							return
 						}

@@ -1382,6 +1390,15 @@ func Serve(ln net.Listener) error {
 	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()

+	var totalVRAM uint64
+	for _, gpu := range gpus {
+		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
+	}
+	if totalVRAM < lowVRAMThreshold {
+		s.lowVRAM = true
+		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+	}
+
 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
 	// otherwise error out quickly
@@ -1530,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
-	if req.Think != nil && req.Think.AsBool() {
+	if req.Think != nil && req.Think.Bool() {
 		caps = append(caps, model.CapabilityThinking)
 	}

@@ -1584,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

 	// Validate Think value: string values currently only allowed for gptoss models
 	if req.Think != nil && req.Think.IsString() && !useHarmony {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
 		return
 	}

@@ -1603,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

 	var thinkingState *thinking.Parser
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+	if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
 		thinkingState = &thinking.Parser{
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
@@ -1655,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 						*toolName = strings.TrimPrefix(*toolName, "functions.")
 						var args api.ToolCallFunctionArguments
 						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-							ch <- gin.H{"error parsing tool call": err.Error()}
+							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+							ch <- gin.H{"error": errStr}
 							return
 						}
 						res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}
--- a/server/sched.go
+++ b/server/sched.go
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-	var estimatedVRAM uint64
-
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
 	}

 	for _, gl := range gpus.ByLibrary() {
-		var ok bool
 		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+		// Note: at present, this orders GPUs by currently free VRAM (descending) and ignores GPU speed in mixed setups
 		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

-		// First attempt to fit the model into a single GPU
-		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCtx * p
-			if !envconfig.SchedSpread() {
-				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+		if !envconfig.SchedSpread() {
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				// Try to pack into as few GPUs as possible, starting from 1 GPU
+				for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+					gpuSubset := sgl[:numGPUs]
+					ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+					if ok {
+						slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+							"model", req.model.ModelPath,
+							"library", sgl[0].Library,
+							"parallel", p,
+							"required", format.HumanBytes2(estimatedVRAM),
+							"gpus", numGPUs)
 						*numParallel = p
-						return []discover.GpuInfo{g}
+						return gpuSubset
 					}
 				}
 			}
-		}
+		} else {
+			// TODO future refinements
+			// - if multiple Libraries, see if any single GPU in any Library will fit
+			// - try subsets of GPUs instead of just falling back to 1 or all in a family

-		// TODO future refinements
-		// - if multiple Libraries, see if any single GPU in any Library will fit
-		// - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-		// Now try all the GPUs
-		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-				*numParallel = p
-				return sgl
+			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+					slog.Info("new model will fit in available VRAM, loading",
+						"model", req.model.ModelPath,
+						"library", sgl[0].Library,
+						"parallel", p,
+						"required", format.HumanBytes2(estimatedVRAM),
+						"gpus", len(sgl))
+					*numParallel = p
+					return sgl
+				}
 			}
 		}
 	}