Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 15:08:27 +00:00)

Commit: Merge branch 'ollama:main' into main

api/types.go (105 lines changed)
@@ -159,49 +159,18 @@ type Options struct {

 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	UseNUMA   bool `json:"numa,omitempty"`
 	NumCtx    int  `json:"num_ctx,omitempty"`
 	NumBatch  int  `json:"num_batch,omitempty"`
 	NumGPU    int  `json:"num_gpu,omitempty"`
 	MainGPU   int  `json:"main_gpu,omitempty"`
 	LowVRAM   bool `json:"low_vram,omitempty"`
 	F16KV     bool `json:"f16_kv,omitempty"`
 	LogitsAll bool `json:"logits_all,omitempty"`
 	VocabOnly bool `json:"vocab_only,omitempty"`
-	UseMMap   TriState `json:"use_mmap,omitempty"`
+	UseMMap   *bool `json:"use_mmap,omitempty"`
 	UseMLock  bool `json:"use_mlock,omitempty"`
 	NumThread int  `json:"num_thread,omitempty"`
-}
-
-type TriState int
-
-const (
-	TriStateUndefined TriState = -1
-	TriStateFalse     TriState = 0
-	TriStateTrue      TriState = 1
-)
-
-func (b *TriState) UnmarshalJSON(data []byte) error {
-	var v bool
-	if err := json.Unmarshal(data, &v); err != nil {
-		return err
-	}
-	if v {
-		*b = TriStateTrue
-	}
-	*b = TriStateFalse
-	return nil
-}
-
-func (b *TriState) MarshalJSON() ([]byte, error) {
-	if *b == TriStateUndefined {
-		return nil, nil
-	}
-	var v bool
-	if *b == TriStateTrue {
-		v = true
-	}
-	return json.Marshal(v)
-}
 }

 // EmbeddingRequest is the request passed to [Client.Embeddings].

@@ -444,19 +413,6 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 			continue
 		}

-		if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
-			val, ok := val.(bool)
-			if !ok {
-				return fmt.Errorf("option %q must be of type boolean", key)
-			}
-			if val {
-				field.SetInt(int64(TriStateTrue))
-			} else {
-				field.SetInt(int64(TriStateFalse))
-			}
-			continue
-		}
-
 		switch field.Kind() {
 		case reflect.Int:
 			switch t := val.(type) {

@@ -503,6 +459,17 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 					slice[i] = str
 				}
 				field.Set(reflect.ValueOf(slice))
+			case reflect.Pointer:
+				var b bool
+				if field.Type() == reflect.TypeOf(&b) {
+					val, ok := val.(bool)
+					if !ok {
+						return fmt.Errorf("option %q must be of type boolean", key)
+					}
+					field.Set(reflect.ValueOf(&val))
+				} else {
+					return fmt.Errorf("unknown type loading config params: %v %v", field.Kind(), field.Type())
+				}
 			default:
 				return fmt.Errorf("unknown type loading config params: %v", field.Kind())
 			}

@@ -545,7 +512,7 @@ func DefaultOptions() Options {
 			LowVRAM:  false,
 			F16KV:    true,
 			UseMLock: false,
-			UseMMap:  TriStateUndefined,
+			UseMMap:  nil,
 			UseNUMA:  false,
 		},
 	}

@@ -615,19 +582,6 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 		} else {
 			field := valueOpts.FieldByName(opt.Name)
 			if field.IsValid() && field.CanSet() {
-				if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
-					boolVal, err := strconv.ParseBool(vals[0])
-					if err != nil {
-						return nil, fmt.Errorf("invalid bool value %s", vals)
-					}
-					if boolVal {
-						out[key] = TriStateTrue
-					} else {
-						out[key] = TriStateFalse
-					}
-					continue
-				}
-
 				switch field.Kind() {
 				case reflect.Float32:
 					floatVal, err := strconv.ParseFloat(vals[0], 32)

@@ -655,6 +609,17 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 				case reflect.Slice:
 					// TODO: only string slices are supported right now
 					out[key] = vals
+				case reflect.Pointer:
+					var b bool
+					if field.Type() == reflect.TypeOf(&b) {
+						boolVal, err := strconv.ParseBool(vals[0])
+						if err != nil {
+							return nil, fmt.Errorf("invalid bool value %s", vals)
+						}
+						out[key] = &boolVal
+					} else {
+						return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
+					}
 				default:
 					return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
 				}
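The net effect of the api/types.go change: the custom TriState enum and its hand-written JSON methods are replaced by an ordinary *bool, where a nil pointer means "unset" and encoding/json handles all three states natively. A minimal self-contained sketch of those semantics (the Runner type here is reduced to the one field under discussion):

package main

import (
	"encoding/json"
	"fmt"
)

// Runner is cut down to the single field this commit changes.
type Runner struct {
	UseMMap *bool `json:"use_mmap,omitempty"`
}

func main() {
	// nil pointer: the field is omitted entirely, i.e. "let the server
	// decide" (the old TriStateUndefined).
	b, _ := json.Marshal(Runner{})
	fmt.Println(string(b)) // {}

	// Explicit false survives omitempty because the pointer is non-nil,
	// which is exactly what TriState needed custom marshalling to achieve.
	f := false
	b, _ = json.Marshal(Runner{UseMMap: &f})
	fmt.Println(string(b)) // {"use_mmap":false}

	var r Runner
	_ = json.Unmarshal([]byte(`{"use_mmap":true}`), &r)
	fmt.Println(r.UseMMap != nil && *r.UseMMap) // true
}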
api/types_test.go
@@ -108,25 +108,27 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
 }

 func TestUseMmapParsingFromJSON(t *testing.T) {
+	tr := true
+	fa := false
 	tests := []struct {
 		name string
 		req  string
-		exp  TriState
+		exp  *bool
 	}{
 		{
 			name: "Undefined",
 			req:  `{ }`,
-			exp:  TriStateUndefined,
+			exp:  nil,
 		},
 		{
 			name: "True",
 			req:  `{ "use_mmap": true }`,
-			exp:  TriStateTrue,
+			exp:  &tr,
 		},
 		{
 			name: "False",
 			req:  `{ "use_mmap": false }`,
-			exp:  TriStateFalse,
+			exp:  &fa,
 		},
 	}

@@ -144,50 +146,52 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
 }

 func TestUseMmapFormatParams(t *testing.T) {
+	tr := true
+	fa := false
 	tests := []struct {
 		name string
 		req  map[string][]string
-		exp  TriState
+		exp  *bool
 		err  error
 	}{
 		{
 			name: "True",
 			req: map[string][]string{
-				"use_mmap": []string{"true"},
+				"use_mmap": {"true"},
 			},
-			exp: TriStateTrue,
+			exp: &tr,
 			err: nil,
 		},
 		{
 			name: "False",
 			req: map[string][]string{
-				"use_mmap": []string{"false"},
+				"use_mmap": {"false"},
 			},
-			exp: TriStateFalse,
+			exp: &fa,
 			err: nil,
 		},
 		{
 			name: "Numeric True",
 			req: map[string][]string{
-				"use_mmap": []string{"1"},
+				"use_mmap": {"1"},
 			},
-			exp: TriStateTrue,
+			exp: &tr,
 			err: nil,
 		},
 		{
 			name: "Numeric False",
 			req: map[string][]string{
-				"use_mmap": []string{"0"},
+				"use_mmap": {"0"},
 			},
-			exp: TriStateFalse,
+			exp: &fa,
 			err: nil,
 		},
 		{
 			name: "invalid string",
 			req: map[string][]string{
-				"use_mmap": []string{"foo"},
+				"use_mmap": {"foo"},
 			},
-			exp: TriStateUndefined,
+			exp: nil,
 			err: fmt.Errorf("invalid bool value [foo]"),
 		},
 	}

@@ -195,11 +199,11 @@ func TestUseMmapFormatParams(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			resp, err := FormatParams(test.req)
-			require.Equal(t, err, test.err)
+			require.Equal(t, test.err, err)
 			respVal, ok := resp["use_mmap"]
-			if test.exp != TriStateUndefined {
+			if test.exp != nil {
 				assert.True(t, ok, "resp: %v", resp)
-				assert.Equal(t, test.exp, respVal)
+				assert.Equal(t, *test.exp, *respVal.(*bool))
 			}
 		})
 	}
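One subtle fix in the last hunk beyond the type change: testify's require.Equal takes the expected value before the actual one, so the old require.Equal(t, err, test.err) had the arguments reversed and would have produced backwards "expected/actual" failure messages. The corrected call reads:

resp, err := FormatParams(test.req)
require.Equal(t, test.err, err) // expected first, then the value under test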
docs/troubleshooting.md
@@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh

 If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/

-## Container fails to run on NVIDIA GPU
+## NVIDIA GPU Discovery

-Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.

-Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
+### Linux NVIDIA Troubleshooting

-- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
-- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+
+Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem:
+
+- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
+- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
 - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
 - Try rebooting
 - Make sure you're running the latest nvidia drivers
envconfig/config.go
@@ -4,12 +4,14 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"math"
 	"net"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
+	"time"
 )

 type OllamaHost struct {

@@ -34,17 +36,17 @@ var (
 	// Set via OLLAMA_HOST in the environment
 	Host *OllamaHost
 	// Set via OLLAMA_KEEP_ALIVE in the environment
-	KeepAlive string
+	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
 	LLMLibrary string
 	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
 	// Set via OLLAMA_MAX_VRAM in the environment
 	MaxVRAM uint64
+	// Set via OLLAMA_MODELS in the environment
+	ModelsDir string
 	// Set via OLLAMA_NOHISTORY in the environment
 	NoHistory bool
 	// Set via OLLAMA_NOPRUNE in the environment

@@ -132,6 +134,7 @@ func init() {
 	NumParallel = 0 // Autoselect
 	MaxRunners = 0  // Autoselect
 	MaxQueuedRequests = 512
+	KeepAlive = 5 * time.Minute

 	LoadConfig()
 }

@@ -266,7 +269,10 @@ func LoadConfig() {
 		}
 	}

-	KeepAlive = clean("OLLAMA_KEEP_ALIVE")
+	ka := clean("OLLAMA_KEEP_ALIVE")
+	if ka != "" {
+		loadKeepAlive(ka)
+	}

 	var err error
 	ModelsDir, err = getModelsDir()

@@ -344,3 +350,24 @@ func getOllamaHost() (*OllamaHost, error) {
 		Port: port,
 	}, nil
 }
+
+func loadKeepAlive(ka string) {
+	v, err := strconv.Atoi(ka)
+	if err != nil {
+		d, err := time.ParseDuration(ka)
+		if err == nil {
+			if d < 0 {
+				KeepAlive = time.Duration(math.MaxInt64)
+			} else {
+				KeepAlive = d
+			}
+		}
+	} else {
+		d := time.Duration(v) * time.Second
+		if d < 0 {
+			KeepAlive = time.Duration(math.MaxInt64)
+		} else {
+			KeepAlive = d
+		}
+	}
+}
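Taken together, OLLAMA_KEEP_ALIVE now accepts either a bare integer (interpreted as seconds) or a Go duration string, with any negative value meaning "keep the model loaded forever". A standalone sketch of those parsing rules (parseKeepAlive is an illustrative name, not part of the envconfig package):

package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

func parseKeepAlive(ka string) time.Duration {
	keep := 5 * time.Minute // the default set in init()
	if v, err := strconv.Atoi(ka); err == nil {
		// bare integers are seconds; negative means "forever"
		if d := time.Duration(v) * time.Second; d < 0 {
			keep = time.Duration(math.MaxInt64)
		} else {
			keep = d
		}
	} else if d, err := time.ParseDuration(ka); err == nil {
		if d < 0 {
			keep = time.Duration(math.MaxInt64)
		} else {
			keep = d
		}
	}
	// unparseable values leave the default in place
	return keep
}

func main() {
	for _, s := range []string{"3", "1h", "-1s", "-1", "bogus"} {
		fmt.Println(s, "=>", parseKeepAlive(s))
	}
}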
envconfig/config_test.go
@@ -2,8 +2,10 @@ package envconfig

 import (
 	"fmt"
+	"math"
 	"net"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"

@@ -23,6 +25,21 @@ func TestConfig(t *testing.T) {
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
 	LoadConfig()
 	require.True(t, FlashAttention)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "")
+	LoadConfig()
+	require.Equal(t, 5*time.Minute, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "3")
+	LoadConfig()
+	require.Equal(t, 3*time.Second, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
+	LoadConfig()
+	require.Equal(t, 1*time.Hour, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
 }

 func TestClientFromEnvironment(t *testing.T) {
gpu/gpu.go (23 lines changed)
@@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList {
 	}()

 	if !bootstrapped {
-		slog.Debug("Detecting GPUs")
+		slog.Info("looking for compatible GPUs")
 		needRefresh = false
 		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t

@@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList {

 		rocmGPUs = AMDGetGPUInfo()
 		bootstrapped = true
+		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
+			slog.Info("no compatible GPUs were discovered")
+		}
 	}

 	// For detected GPUs, load library if not loaded
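With these two slog calls, GPU discovery is visible at the default log level rather than only under debug. On a host with no supported GPU, the server log should then contain lines roughly like the following (exact formatting depends on the slog handler configured; this is an assumption, not captured output):

level=INFO msg="looking for compatible GPUs"
level=INFO msg="no compatible GPUs were discovered"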
@@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
+			// Decide what log level based on the type of error message to help users understand why
+			msg := C.GoString(resp.err)
+			switch resp.cudaErr {
+			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
+				slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
+			case C.CUDA_ERROR_NO_DEVICE:
+				slog.Info("no nvidia devices detected", "library", libPath)
+			case C.CUDA_ERROR_UNKNOWN:
+				slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
+				slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
+			default:
+				if strings.Contains(msg, "wrong ELF class") {
+					slog.Debug("skipping 32bit library", "library", libPath)
+				} else {
+					slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
+				}
+			}
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return int(resp.num_devices), &resp.ch, libPath
gpu/gpu_info_nvcuda.c
@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
+  resp->cudaErr = CUDA_SUCCESS;
   const int buflen = 256;
   char buf[buflen + 1];
   int i;

@@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
             nvcuda_lib_path, msg);
     free(msg);
     resp->err = strdup(buf);
+    resp->cudaErr = -1;
     return;
   }

@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
               msg);
       free(msg);
       resp->err = strdup(buf);
+      resp->cudaErr = -1;
       return;
     }
   }

@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
-    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
-      return;
-    }
-    snprintf(buf, buflen, "nvcuda init failure: %d", ret);
+    snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
     resp->err = strdup(buf);
+    resp->cudaErr = ret;
     return;
   }

@@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->ch.handle = NULL;
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
+    resp->cudaErr = ret;
     return;
   }
 }

@@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

   if (h.handle == NULL) {
-    resp->err = strdup("nvcuda handle isn't initialized");
+    resp->err = strdup("cuda driver library handle isn't initialized");
     return;
   }

   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda device failed to initialize");
+    snprintf(buf, buflen, "cuda driver library device failed to initialize");
     resp->err = strdup(buf);
     return;
   }

@@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
+    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
     resp->err = strdup(buf);
     return;
   }

   ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
+    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
     resp->err = strdup(buf);
     // Best effort on failure...
     (*h.cuCtxDestroy)(ctx);

@@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {

   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
+    LOG(1, "cuda driver library failed to release device context %d", ret);
   }
 }

@@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)

   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device failed to initialize");
+    LOG(1, "cuda driver library device failed to initialize");
     return;
   }

@@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to get device context %d", ret);
+    LOG(1, "cuda driver library failed to get device context %d", ret);
     return;
   }

   ret = (*h.cuMemGetInfo_v2)(free, total);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    LOG(1, "cuda driver library device memory info lookup failure %d", ret);
     // Best effort on failure...
     (*h.cuCtxDestroy)(ctx);
     return;

@@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)

   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
+    LOG(1, "cuda driver library failed to release device context %d", ret);
   }
 }

 void nvcuda_release(nvcuda_handle_t h) {
-  LOG(h.verbose, "releasing nvcuda library\n");
+  LOG(h.verbose, "releasing cuda driver library\n");
   UNLOAD_LIBRARY(h.handle);
   // TODO and other context release logic?
   h.handle = NULL;
gpu/gpu_info_nvcuda.h
@@ -7,9 +7,12 @@
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDA_ERROR_OUT_OF_MEMORY = 2,
   CUDA_ERROR_NOT_INITIALIZED = 3,
   CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  CUDA_ERROR_NO_DEVICE = 100,
+  CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
+  CUDA_ERROR_UNKNOWN = 999,
   // Other values omitted for now...
 } CUresult;

@@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp {
   char *err;  // If err is non-null handle is invalid
   nvcuda_handle_t ch;
   int num_devices;
+  CUresult cudaErr;
 } nvcuda_init_resp_t;

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
llm/ext_server/server.cpp (vendored, 2 lines changed)
@@ -1732,7 +1732,7 @@ struct llama_server_context
                     slot.n_past -= 1;
                 }

-                slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
+                slot.n_prompt_tokens_processed = slot.n_prompt_tokens;

                 if (slot.ga_n != 1)
                 {
llm/payload.go
@@ -38,7 +38,7 @@ func Init() error {
 	}

 	var variants []string
-	for v := range availableServers() {
+	for v := range getAvailableServers() {
 		variants = append(variants, v)
 	}
 	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))

@@ -50,7 +50,7 @@ func Init() error {
 // binary names may contain an optional variant separated by '_'
 // For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
 // Any library without a variant is the lowest common denominator
-func availableServers() map[string]string {
+func getAvailableServers() map[string]string {
 	payloadsDir, err := gpu.PayloadsDir()
 	if err != nil {
 		slog.Error("payload lookup error", "error", err)

@@ -80,7 +80,7 @@ func availableServers() map[string]string {
 // TODO - switch to metadata based mapping
 func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
-	availableServers := availableServers()
+	availableServers := getAvailableServers()
 	requested := info.Library
 	if info.Variant != gpu.CPUCapabilityNone {
 		requested += "_" + info.Variant.String()

@@ -115,27 +115,29 @@ func serversForGpu(info gpu.GpuInfo) []string {
 		servers = append(servers, alt...)
 	}

-	// Load up the best CPU variant if not primary requested
-	if info.Library != "cpu" {
-		variant := gpu.GetCPUCapability()
-		// If no variant, then we fall back to default
-		// If we have a variant, try that if we find an exact match
-		// Attempting to run the wrong CPU instructions will panic the
-		// process
-		if variant != gpu.CPUCapabilityNone {
-			for cmp := range availableServers {
-				if cmp == "cpu_"+variant.String() {
-					servers = append(servers, cmp)
-					break
+	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
+		// Load up the best CPU variant if not primary requested
+		if info.Library != "cpu" {
+			variant := gpu.GetCPUCapability()
+			// If no variant, then we fall back to default
+			// If we have a variant, try that if we find an exact match
+			// Attempting to run the wrong CPU instructions will panic the
+			// process
+			if variant != gpu.CPUCapabilityNone {
+				for cmp := range availableServers {
+					if cmp == "cpu_"+variant.String() {
+						servers = append(servers, cmp)
+						break
+					}
 				}
+			} else {
+				servers = append(servers, "cpu")
 			}
-		} else {
-			servers = append(servers, "cpu")
 		}
-	}

-	if len(servers) == 0 {
-		servers = []string{"cpu"}
+		if len(servers) == 0 {
+			servers = []string{"cpu"}
+		}
 	}

 	return servers

@@ -147,7 +149,7 @@ func serverForCpu() string {
 		return "metal"
 	}
 	variant := gpu.GetCPUCapability()
-	availableServers := availableServers()
+	availableServers := getAvailableServers()
 	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
llm/server.go
@@ -131,7 +131,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}

-	availableServers := availableServers()
+	availableServers := getAvailableServers()
+	if len(availableServers) == 0 {
+		if runtime.GOOS != "windows" {
+			slog.Warn("llama server binary disappeared, reinitializing payloads")
+			err = Init()
+			if err != nil {
+				slog.Warn("failed to reinitialize payloads", "error", err)
+				return nil, err
+			}
+			availableServers = getAvailableServers()
+		} else {
+			return nil, finalErr
+		}
+	}
 	var servers []string
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
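The rename from availableServers to getAvailableServers is not cosmetic: in this function the result is stored in a local variable named availableServers, and in Go that local shadows a same-named function for the rest of the scope, so the retry call above could not have been written without the rename. A small self-contained illustration (the stub map is hypothetical):

package main

import "fmt"

func getAvailableServers() map[string]string {
	return map[string]string{"cpu": "/tmp/ollama/cpu"} // stand-in payload map
}

func main() {
	// Had the function kept its old name, this line would read
	//     availableServers := availableServers()
	// and from here on the identifier names the map, not the function, so
	// the refresh call below would fail to compile.
	availableServers := getAvailableServers()
	availableServers = getAvailableServers() // e.g. re-list after Init()
	fmt.Println(availableServers)
}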
@@ -208,7 +221,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-			opts.UseMMap = api.TriStateFalse
+			opts.UseMMap = new(bool)
+			*opts.UseMMap = false
 		}
 	}

@@ -219,10 +233,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
 	// For CPU loads we want the memory to be allocated, not FS cache
-	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
-		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
-		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
-		opts.UseMMap == api.TriStateFalse {
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == nil) ||
+		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == nil) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == nil) ||
+		(opts.UseMMap != nil && !*opts.UseMMap) {
 		params = append(params, "--no-mmap")
 	}

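Condensed, the --no-mmap rule is: disable mmap when the user explicitly asked for that, or when the option was left unset on a platform or load shape where mmap is known to hurt. A hypothetical distillation of the condition above (the parameters stand in for runtime.GOOS, gpus[0].Library, systemFreeMemory, estimate.TotalSize, and opts.UseMMap; this is a sketch, not the package's API):

func disableMMap(goos, library string, freeMem, totalSize uint64, useMMap *bool) bool {
	unset := useMMap == nil
	explicitlyOff := useMMap != nil && !*useMMap
	return (goos == "windows" && library == "cuda" && unset) ||
		(goos == "linux" && freeMem < totalSize && unset) ||
		(library == "cpu" && unset) ||
		explicitlyOff
}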
llm/status.go
@@ -25,7 +25,7 @@ var errorPrefixes = []string{
 	"CUDA error",
 	"cudaMalloc failed",
 	"\"ERR\"",
-	"architecture",
+	"error loading model",
 }

 func (w *StatusWriter) Write(b []byte) (int, error) {
server/modelpath.go
@@ -103,18 +103,9 @@ func (mp ModelPath) GetShortTagname() string {
 	return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
 }

-// modelsDir returns the value of the OLLAMA_MODELS environment variable or the user's home directory if OLLAMA_MODELS is not set.
-// The models directory is where Ollama stores its model files and manifests.
-func modelsDir() (string, error) {
-	return envconfig.ModelsDir, nil
-}
-
 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
 }

@@ -127,10 +118,7 @@ func (mp ModelPath) BaseURL() *url.URL {
 }

 func GetManifestPath() (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	path := filepath.Join(dir, "manifests")
 	if err := os.MkdirAll(path, 0o755); err != nil {

@@ -141,10 +129,7 @@ func GetManifestPath() (string, error) {
 }

 func GetBlobsPath(digest string) (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	// only accept actual sha256 digests
 	pattern := "^sha256[:-][0-9a-fA-F]{64}$"
server/routes.go
@@ -9,7 +9,6 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
-	"math"
 	"net"
 	"net/http"
 	"net/netip"

@@ -17,7 +16,6 @@ import (
 	"os/signal"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"
 	"syscall"
 	"time"

@@ -56,8 +54,6 @@ func init() {
 	gin.SetMode(mode)
 }

-var defaultSessionDuration = 5 * time.Minute
-
 func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {

@@ -133,14 +129,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:

@@ -320,32 +309,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func getDefaultSessionDuration() time.Duration {
-	if envconfig.KeepAlive != "" {
-		v, err := strconv.Atoi(envconfig.KeepAlive)
-		if err != nil {
-			d, err := time.ParseDuration(envconfig.KeepAlive)
-			if err != nil {
-				return defaultSessionDuration
-			}
-
-			if d < 0 {
-				return time.Duration(math.MaxInt64)
-			}
-
-			return d
-		}
-
-		d := time.Duration(v) * time.Second
-		if d < 0 {
-			return time.Duration(math.MaxInt64)
-		}
-		return d
-	}
-
-	return defaultSessionDuration
-}
-
 func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	var req api.EmbeddingRequest
 	err := c.ShouldBindJSON(&req)

@@ -380,14 +343,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:

@@ -1318,14 +1274,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:
server/sched.go
@@ -24,7 +24,7 @@ type LlmRequest struct {
 	model           *Model
 	opts            api.Options
 	origNumCtx      int // Track the initial ctx request
-	sessionDuration time.Duration
+	sessionDuration *api.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
 	schedAttempts   uint
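Carrying a *api.Duration instead of a bare time.Duration lets the scheduler distinguish "client set keep_alive" from "client said nothing". Assuming the api package types of this commit, a request-side sketch (model name and prompt are illustrative):

keep := api.Duration{Duration: 10 * time.Minute}
req := api.GenerateRequest{
	Model:     "llama3",
	Prompt:    "why is the sky blue?",
	KeepAlive: &keep, // nil here would mean "use OLLAMA_KEEP_ALIVE"
}
_ = req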
@@ -75,7 +75,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 }

 // context must be canceled to decrement ref count and release the runner
-func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
 	if opts.NumCtx < 4 {
 		opts.NumCtx = 4
 	}

@@ -389,7 +389,9 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 		runner.expireTimer.Stop()
 		runner.expireTimer = nil
 	}
-	runner.sessionDuration = pending.sessionDuration
+	if pending.sessionDuration != nil {
+		runner.sessionDuration = pending.sessionDuration.Duration
+	}
 	pending.successCh <- runner
 	go func() {
 		<-pending.ctx.Done()

@@ -402,6 +404,10 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	if numParallel < 1 {
 		numParallel = 1
 	}
+	sessionDuration := envconfig.KeepAlive
+	if req.sessionDuration != nil {
+		sessionDuration = req.sessionDuration.Duration
+	}
 	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp

@@ -419,7 +425,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 		modelPath:       req.model.ModelPath,
 		llama:           llama,
 		Options:         &req.opts,
-		sessionDuration: req.sessionDuration,
+		sessionDuration: sessionDuration,
 		gpus:            gpus,
 		estimatedVRAM:   llama.EstimatedVRAM(),
 		estimatedTotal:  llama.EstimatedTotal(),
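The resulting division of labor: the HTTP handlers pass the client's keep_alive through untouched, and the scheduler alone resolves the default. A one-function sketch of that resolution (an illustrative helper, not the package API):

func effectiveSessionDuration(req *api.Duration, serverDefault time.Duration) time.Duration {
	if req != nil {
		return req.Duration // a client-specified keep_alive wins, even zero
	}
	return serverDefault // otherwise envconfig.KeepAlive (OLLAMA_KEEP_ALIVE)
}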
server/sched_test.go
@@ -44,7 +44,7 @@ func TestLoad(t *testing.T) {
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
-		sessionDuration: 2,
+		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {

@@ -142,7 +142,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		ctx:             scenario.ctx,
 		model:           model,
 		opts:            api.DefaultOptions(),
-		sessionDuration: 5 * time.Millisecond,
+		sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}

@@ -156,18 +156,18 @@ func TestRequests(t *testing.T) {

 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 5 * time.Millisecond
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
-	scenario1b.req.sessionDuration = 0
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}

 	// simple reload of same model
 	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
-	scenario2a.req.sessionDuration = 5 * time.Millisecond
+	scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}

 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)

@@ -318,11 +318,11 @@ func TestGetRunner(t *testing.T) {
 	defer done()

 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
-	scenario1b.req.sessionDuration = 0
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
 	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
-	scenario1c.req.sessionDuration = 0
+	scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
 	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {

@@ -402,7 +402,7 @@ func TestPrematureExpired(t *testing.T) {
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
-	time.Sleep(scenario1a.req.sessionDuration)
+	time.Sleep(scenario1a.req.sessionDuration.Duration)
 	scenario1a.ctxDone()
 	time.Sleep(20 * time.Millisecond)
 	require.LessOrEqual(t, len(s.finishedReqCh), 1)

@@ -423,7 +423,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		ctx:             ctx,
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
-		sessionDuration: 2,
+		sessionDuration: &api.Duration{Duration: 2},
 	}
 	finished := make(chan *LlmRequest)
 	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}

@@ -614,7 +614,7 @@ func TestAlreadyCanceled(t *testing.T) {
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
 	scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
 	s := InitScheduler(ctx)
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req