From 72700279e260694e6df0a24a672f0a6f3e3dc3bf Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 7 May 2024 16:46:15 -0700
Subject: [PATCH 1/5] Detect noexec and report a better error

This will bubble up a much more informative error message if noexec
is preventing us from running the subprocess
---
 llm/server.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index e2402256..b23a7749 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -307,6 +307,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		slog.Debug("subprocess", "environment", s.cmd.Env)
 
 		if err = s.cmd.Start(); err != nil {
+			// Detect permission denied and augment the message about noexec
+			if errors.Is(err, os.ErrPermission) {
+				finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				continue
+			}
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
@@ -382,6 +387,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
+		if s.cmd.ProcessState.ExitCode() == -1 {
+			// Most likely a signal killed it, log some more details to try to help troubleshoot
+			slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+		}
 		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 	}
 

From 486a2c1d947880d22275756f70b96953ee1a2e40 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 8 May 2024 08:47:09 -0700
Subject: [PATCH 2/5] types/model: fix tag case

---
 types/model/name.go      | 12 +++++++-----
 types/model/name_test.go | 14 ++++++++++++--
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/types/model/name.go b/types/model/name.go
index 6d2a187b..b79374c3 100644
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -290,12 +290,14 @@ func (n Name) Filepath() string {
 	if !n.IsFullyQualified() {
 		panic("illegal attempt to get filepath of invalid name")
 	}
-	return strings.ToLower(filepath.Join(
-		n.Host,
-		n.Namespace,
-		n.Model,
+	return filepath.Join(
+		strings.ToLower(filepath.Join(
+			n.Host,
+			n.Namespace,
+			n.Model,
+		)),
 		n.Tag,
-	))
+	)
 }
 
 // LogValue returns a slog.Value that represents the name as a string.
diff --git a/types/model/name_test.go b/types/model/name_test.go
index 19bc2e2d..fb584291 100644
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -19,6 +19,16 @@ func TestParseNameParts(t *testing.T) {
 		wantFilepath    string
 		wantValidDigest bool
 	}{
+		{
+			in: "registry.ollama.ai/library/dolphin-mistral:7b-v2.6-dpo-laser-q6_K",
+			want: Name{
+				Host:      "registry.ollama.ai",
+				Namespace: "library",
+				Model:     "dolphin-mistral",
+				Tag:       "7b-v2.6-dpo-laser-q6_K",
+			},
+			wantFilepath: filepath.Join("registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"),
+		},
 		{
 			in: "scheme://host:port/namespace/model:tag",
 			want: Name{
@@ -266,9 +276,9 @@ func TestFilepathAllocs(t *testing.T) {
 	allocs := testing.AllocsPerRun(1000, func() {
 		n.Filepath()
 	})
-	allowedAllocs := 2.0
+	var allowedAllocs float64 = 3
 	if runtime.GOOS == "windows" {
-		allowedAllocs = 4
+		allowedAllocs = 5
 	}
 	if allocs > allowedAllocs {
 		t.Errorf("allocs = %v; allowed %v", allocs, allowedAllocs)

From b25976aeb8542d56f48bcb97003b256cba0f6237 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 8 May 2024 12:42:48 -0700
Subject: [PATCH 3/5] routes: fix show llava models

---
 server/images.go | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/server/images.go b/server/images.go
index a96db8d1..2be1d366 100644
--- a/server/images.go
+++ b/server/images.go
@@ -68,6 +68,20 @@ func (m *Model) String() string {
 		Args: m.ModelPath,
 	})
 
+	for _, adapter := range m.AdapterPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "adapter",
+			Args: adapter,
+		})
+	}
+
+	for _, projector := range m.ProjectorPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "model",
+			Args: projector,
+		})
+	}
+
 	if m.Template != "" {
 		modelfile.Commands = append(modelfile.Commands, model.Command{
 			Name: "template",
@@ -82,20 +96,6 @@ func (m *Model) String() string {
 		})
 	}
 
-	for _, adapter := range m.AdapterPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "adapter",
-			Args: adapter,
-		})
-	}
-
-	for _, projector := range m.ProjectorPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "projector",
-			Args: projector,
-		})
-	}
-
 	for k, v := range m.Options {
 		switch v := v.(type) {
 		case []any:

From cef45feaa4bb340e59dadf2bc504e2e39f32baa7 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Wed, 8 May 2024 13:14:00 -0700
Subject: [PATCH 4/5] Add preflight OPTIONS handling and update CORS config (#4086)

* Add preflight OPTIONS handling and update CORS config

- Implement early return with HTTP 204 (No Content) for OPTIONS requests in allowedHostsMiddleware to optimize preflight handling.

- Extend CORS configuration to explicitly allow 'Authorization' headers and 'OPTIONS' method when OLLAMA_ORIGINS environment variable is set.
* allow auth, content-type, and user-agent headers

* Update routes.go
---
 server/routes.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/server/routes.go b/server/routes.go
index e0459271..7dfeb513 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -935,6 +935,11 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
 		}
 
 		if allowedHost(host) {
+			if c.Request.Method == "OPTIONS" {
+				c.AbortWithStatus(http.StatusNoContent)
+				return
+			}
+
 			c.Next()
 			return
 		}
@@ -947,6 +952,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
+	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
 	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()

From bee2f4a3b0e5dbf6611a399cd6b8f6b176b9d376 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 4 May 2024 09:15:31 -0700
Subject: [PATCH 5/5] Record GPU usage information

This records more GPU usage information for eventual UX inclusion.
---
 format/bytes.go |  2 ++
 llm/memory.go   | 24 ++++++++++++------------
 llm/server.go   | 34 ++++++++++++++++++++++++++--------
 3 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/format/bytes.go b/format/bytes.go
index 9fdc8bcf..13d8575e 100644
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
diff --git a/llm/memory.go b/llm/memory.go
index 005a15aa..6890b08c 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
diff --git a/llm/server.go b/llm/server.go
index e2402256..d96be9a0 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
 
 	sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
@@ -133,6 +139,10 @@
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
 
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@
 		}
 
 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
 		}
 
 		s.cmd.Env = os.Environ()