From 93f19910c5b78488e8e29a649a467285480a18c8 Mon Sep 17 00:00:00 2001
From: Zander Lewis <wolfthedev@gmail.com>
Date: Sun, 12 May 2024 11:24:21 -0400
Subject: [PATCH 01/18] Update `LlamaScript` to point to new link.

Still used Legacy link.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4a6935f7..034760e1 100644
--- a/README.md
+++ b/README.md
@@ -359,7 +359,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
-- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript)
+- [LlamaScript](https://github.com/Project-Llama/llamascript)
 ### Mobile
 
 - [Enchanted](https://github.com/AugustDev/enchanted)

From 9c76b30d72b76f0ce1fe7f357651ea9985c2cb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9D=A1=E8=A7=89=E5=9E=8B=E5=AD=A6=E6=B8=A3?=
 <59201842+fangtaosong@users.noreply.github.com>
Date: Mon, 13 May 2024 09:21:11 +0800
Subject: [PATCH 02/18] Correct typos. (#4387)

* Correct typos.

* Correct typos.
---
 docs/api.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/api.md b/docs/api.md
index 94cd9c90..0f11c388 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -797,9 +797,9 @@ curl http://localhost:11434/api/show -d '{
 
 ```json
 {
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"",
-  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSSISTANT:",
-  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
+  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSISTANT:",
+  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
   "details": {
     "format": "gguf",
     "family": "llama",

From 91a090a485ef48b6a5265c43bce4d98b49ae8dab Mon Sep 17 00:00:00 2001
From: Josh Yan <joshyan@Joshs-MBP.localdomain>
Date: Mon, 13 May 2024 14:08:22 -0700
Subject: [PATCH 03/18] removed inconsistent punctuation

---
 cmd/cmd.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index fb10e53f..6975899b 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1090,7 +1090,7 @@ func NewCLI() *cobra.Command {
 Environment Variables:
 
     OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
-    OLLAMA_ORIGINS      A comma separated list of allowed origins.
+    OLLAMA_ORIGINS      A comma separated list of allowed origins
     OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
     OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
     OLLAMA_DEBUG        Set to 1 to enable additional debug logging

From 50b9056e09744a2902b161e7db3507b3e108493a Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Fri, 10 May 2024 14:40:37 -0700
Subject: [PATCH 04/18] count memory up to NumGPU

---
 llm/memory.go | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index df7081cf..7c6192ca 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -53,6 +53,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
+	layers := ggml.Tensors().Layers()
+	// add one layer worth of memorr as a buffer
+	memoryMinimum += layers["blk.0"].size()
+
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
@@ -73,13 +77,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
@@ -106,7 +108,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayer += kv / ggml.KV().BlockCount()
 
 		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+		if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
 			memoryRequiredPartial += memoryLayer
 			layerCount++
 		}
@@ -117,7 +119,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryRequiredTotal += memoryLayerOutput
 	}
 
-	if memoryAvailable > memoryRequiredTotal {
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}
@@ -128,10 +130,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		"offload to gpu",
 		slog.Group(
 			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
+			// requested number of layers to offload
+			"requested", opts.NumGPU,
 			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",

From 1d359e737e6342d03553b7a49a9e5f588e9e611f Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 13 May 2024 14:14:10 -0700
Subject: [PATCH 05/18] typo

---
 llm/memory.go | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index 7c6192ca..acc2dd0b 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -54,8 +54,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}
 
 	layers := ggml.Tensors().Layers()
-	// add one layer worth of memorr as a buffer
-	memoryMinimum += layers["blk.0"].size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		memoryMinimum += blk0.size()
+	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
@@ -102,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
 	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			memoryLayer := blk.size()
 
-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
+			// KV is proportional to the number of layers
+			memoryLayer += kv / ggml.KV().BlockCount()
 
-		memoryRequiredTotal += memoryLayer
-		if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
+			memoryRequiredTotal += memoryLayer
+			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+				memoryRequiredPartial += memoryLayer
+				layerCount++
+			}
 		}
 	}
 

From f8464785a66461496fa1291cd68630d8540171d5 Mon Sep 17 00:00:00 2001
From: Josh Yan <joshyan@Joshs-MBP.localdomain>
Date: Mon, 13 May 2024 14:50:52 -0700
Subject: [PATCH 06/18] removed inconsistencies

---
 cmd/cmd.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 6975899b..085fe747 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1091,8 +1091,8 @@ Environment Variables:
 
     OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
     OLLAMA_ORIGINS      A comma separated list of allowed origins
-    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
-    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
+    OLLAMA_MODELS       The path to the models directory (default "~/.ollama/models")
+    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default "5m")
     OLLAMA_DEBUG        Set to 1 to enable additional debug logging
 `)
 

From 6845988807e458e8e0eb406a2739080f52c267bd Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Mon, 13 May 2024 17:17:36 -0700
Subject: [PATCH 07/18] Ollama `ps` command for showing currently loaded models
 (#4327)

---
 api/client.go        |  9 +++++
 api/types.go         |  4 ++-
 cmd/cmd.go           | 75 +++++++++++++++++++++++++++++++++++++++++
 cmd/interactive.go   |  5 +++
 format/time.go       |  4 ++-
 format/time_test.go  | 10 ++++++
 llm/server.go        |  5 +++
 server/routes.go     | 29 ++++++++++++++++
 server/sched.go      | 80 +++++++++++++++++++++++---------------------
 server/sched_test.go | 22 ++++++------
 10 files changed, 193 insertions(+), 50 deletions(-)

diff --git a/api/client.go b/api/client.go
index 5b1fc796..d50b397d 100644
--- a/api/client.go
+++ b/api/client.go
@@ -354,6 +354,15 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	return &lr, nil
 }
 
+// List running models.
+func (c *Client) ListRunning(ctx context.Context) (*ListResponse, error) {
+	var lr ListResponse
+	if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
+		return nil, err
+	}
+	return &lr, nil
+}
+
 // Copy copies a model - creating a model with another name from an existing
 // model.
 func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
diff --git a/api/types.go b/api/types.go
index fcab6fef..4195a7c5 100644
--- a/api/types.go
+++ b/api/types.go
@@ -289,10 +289,12 @@ type ListResponse struct {
 type ModelResponse struct {
 	Name       string       `json:"name"`
 	Model      string       `json:"model"`
-	ModifiedAt time.Time    `json:"modified_at"`
+	ModifiedAt time.Time    `json:"modified_at,omitempty"`
 	Size       int64        `json:"size"`
 	Digest     string       `json:"digest"`
 	Details    ModelDetails `json:"details,omitempty"`
+	ExpiresAt  time.Time    `json:"expires_at,omitempty"`
+	SizeVRAM   int64        `json:"size_vram,omitempty"`
 }
 
 type TokenResponse struct {
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 085fe747..f1429e1c 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -12,6 +12,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"math"
 	"net"
 	"net/http"
 	"os"
@@ -324,6 +325,18 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.Format = format
 
+	keepAlive, err := cmd.Flags().GetString("keepalive")
+	if err != nil {
+		return err
+	}
+	if keepAlive != "" {
+		d, err := time.ParseDuration(keepAlive)
+		if err != nil {
+			return err
+		}
+		opts.KeepAlive = &api.Duration{Duration: d}
+	}
+
 	prompts := args[1:]
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
@@ -496,6 +509,52 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
+func ListRunningHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	models, err := client.ListRunning(cmd.Context())
+	if err != nil {
+		return err
+	}
+
+	var data [][]string
+
+	for _, m := range models.Models {
+		if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
+			var procStr string
+			switch {
+			case m.SizeVRAM == 0:
+				procStr = "100% CPU"
+			case m.SizeVRAM == m.Size:
+				procStr = "100% GPU"
+			case m.SizeVRAM > m.Size || m.Size == 0:
+				procStr = "Unknown"
+			default:
+				sizeCPU := m.Size - m.SizeVRAM
+				cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+				procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
+			}
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+		}
+	}
+
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
+	table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+	table.SetHeaderLine(false)
+	table.SetBorder(false)
+	table.SetNoWhiteSpace(true)
+	table.SetTablePadding("\t")
+	table.AppendBulk(data)
+	table.Render()
+
+	return nil
+}
+
 func DeleteHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -672,6 +731,7 @@ type runOptions struct {
 	Images      []api.ImageData
 	Options     map[string]interface{}
 	MultiModal  bool
+	KeepAlive   *api.Duration
 }
 
 type displayResponseState struct {
@@ -766,6 +826,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		Options:  opts.Options,
 	}
 
+	if opts.KeepAlive != nil {
+		req.KeepAlive = opts.KeepAlive
+	}
+
 	if err := client.Chat(cancelCtx, req, fn); err != nil {
 		if errors.Is(err, context.Canceled) {
 			return nil, nil
@@ -1075,6 +1139,7 @@ func NewCLI() *cobra.Command {
 		RunE:    RunHandler,
 	}
 
+	runCmd.Flags().String("keepalive", "", "Duration to keep a model loaded (e.g. 5m)")
 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
@@ -1123,6 +1188,14 @@ Environment Variables:
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListHandler,
 	}
+
+	psCmd := &cobra.Command{
+		Use:     "ps",
+		Short:   "List running models",
+		PreRunE: checkServerHeartbeat,
+		RunE:    ListRunningHandler,
+	}
+
 	copyCmd := &cobra.Command{
 		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
@@ -1146,6 +1219,7 @@ Environment Variables:
 		pullCmd,
 		pushCmd,
 		listCmd,
+		psCmd,
 		copyCmd,
 		deleteCmd,
 	} {
@@ -1160,6 +1234,7 @@ Environment Variables:
 		pullCmd,
 		pushCmd,
 		listCmd,
+		psCmd,
 		copyCmd,
 		deleteCmd,
 	)
diff --git a/cmd/interactive.go b/cmd/interactive.go
index c294b7b5..21c0a6a9 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -56,6 +56,11 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
 		Model:    opts.Model,
 		Messages: []api.Message{},
 	}
+
+	if opts.KeepAlive != nil {
+		chatReq.KeepAlive = opts.KeepAlive
+	}
+
 	err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
 		p.StopAndClear()
 		if len(opts.Messages) > 0 {
diff --git a/format/time.go b/format/time.go
index 6637c062..74062848 100644
--- a/format/time.go
+++ b/format/time.go
@@ -60,7 +60,9 @@ func humanTime(t time.Time, zeroValue string) string {
 	}
 
 	delta := time.Since(t)
-	if delta < 0 {
+	if int(delta.Hours())/24/365 < -20 {
+		return "Forever"
+	} else if delta < 0 {
 		return humanDuration(-delta) + " from now"
 	}
 
diff --git a/format/time_test.go b/format/time_test.go
index cc6b8930..bd0ba9a8 100644
--- a/format/time_test.go
+++ b/format/time_test.go
@@ -32,4 +32,14 @@ func TestHumanTime(t *testing.T) {
 		v := now.Add(800 * time.Millisecond)
 		assertEqual(t, HumanTime(v, ""), "Less than a second from now")
 	})
+
+	t.Run("time way in the future", func(t *testing.T) {
+		v := now.Add(24 * time.Hour * 365 * 200)
+		assertEqual(t, HumanTime(v, ""), "Forever")
+	})
+
+	t.Run("time way in the future lowercase", func(t *testing.T) {
+		v := now.Add(24 * time.Hour * 365 * 200)
+		assertEqual(t, HumanTimeLower(v, ""), "forever")
+	})
 }
diff --git a/llm/server.go b/llm/server.go
index c9874bb6..87f5b545 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -38,6 +38,7 @@ type LlamaServer interface {
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
 	EstimatedVRAM() uint64
+	EstimatedTotal() uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -955,6 +956,10 @@ func (s *llmServer) EstimatedVRAM() uint64 {
 	return s.estimatedVRAM
 }
 
+func (s *llmServer) EstimatedTotal() uint64 {
+	return s.estimatedTotal
+}
+
 func parseDurationMs(ms float64) time.Duration {
 	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
 	if err != nil {
diff --git a/server/routes.go b/server/routes.go
index 600a30fa..d60a4d08 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -979,6 +979,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/show", s.ShowModelHandler)
 	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
+	r.GET("/api/ps", s.ProcessHandler)
 
 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler)
@@ -1137,6 +1138,34 @@ func streamResponse(c *gin.Context, ch chan any) {
 	})
 }
 
+func (s *Server) ProcessHandler(c *gin.Context) {
+	models := []api.ModelResponse{}
+
+	for _, v := range s.sched.loaded {
+		model := v.model
+		modelDetails := api.ModelDetails{
+			Format:            model.Config.ModelFormat,
+			Family:            model.Config.ModelFamily,
+			Families:          model.Config.ModelFamilies,
+			ParameterSize:     model.Config.ModelType,
+			QuantizationLevel: model.Config.FileType,
+		}
+
+		mr := api.ModelResponse{
+			Model:     model.ShortName,
+			Name:      model.ShortName,
+			Size:      int64(v.estimatedTotal),
+			SizeVRAM:  int64(v.estimatedVRAM),
+			Digest:    model.Digest,
+			Details:   modelDetails,
+			ExpiresAt: v.expiresAt,
+		}
+		models = append(models, mr)
+	}
+
+	c.JSON(http.StatusOK, api.ListResponse{Models: models})
+}
+
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
diff --git a/server/sched.go b/server/sched.go
index eff2b117..198f0aca 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -177,7 +177,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				}
 				// Trigger an expiration to unload once it's done
 				runnerToExpire.refMu.Lock()
-				slog.Debug("resetting model to expire immediately to make room", "model", runnerToExpire.model, "refCount", runnerToExpire.refCount)
+				slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount)
 				if runnerToExpire.expireTimer != nil {
 					runnerToExpire.expireTimer.Stop()
 					runnerToExpire.expireTimer = nil
@@ -190,13 +190,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				// Wait for the unload to happen
 				// Note: at this point we're queueing up all incoming requests, even if they were for
 				// a different model that's loaded and not scheduled to be removed.
-				slog.Debug("waiting for pending requests to complete and unload to occur", "model", runnerToExpire.model)
+				slog.Debug("waiting for pending requests to complete and unload to occur", "modelPath", runnerToExpire.modelPath)
 				select {
 				case <-ctx.Done():
 					slog.Debug("shutting down scheduler pending loop")
 					return
 				case <-s.unloadedCh:
-					slog.Debug("unload completed", "model", runnerToExpire.model)
+					slog.Debug("unload completed", "modelPath", runnerToExpire.modelPath)
 					continue
 				}
 			}
@@ -219,23 +219,23 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 			runner := s.loaded[finished.model.ModelPath]
 			s.loadedMu.Unlock()
 			if runner == nil {
-				slog.Error("finished requeset signal received after model unloaded", "model", finished.model.ModelPath)
+				slog.Error("finished requeset signal received after model unloaded", "modelPath", finished.model.ModelPath)
 				continue
 			}
 			runner.refMu.Lock()
 			runner.refCount--
 			if runner.refCount <= 0 {
 				if runner.sessionDuration <= 0 {
-					slog.Debug("runner with zero duration has gone idle, expiring to unload", "model", runner.model)
+					slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath)
 					if runner.expireTimer != nil {
 						runner.expireTimer.Stop()
 						runner.expireTimer = nil
 					}
 					s.expiredCh <- runner
 				} else if runner.expireTimer == nil {
-					slog.Debug("runner with non-zero duration has gone idle, adding timer", "model", runner.model, "duration", runner.sessionDuration)
+					slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
 					runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() {
-						slog.Debug("timer expired, expiring to unload", "model", runner.model)
+						slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath)
 						runner.refMu.Lock()
 						defer runner.refMu.Unlock()
 						if runner.expireTimer != nil {
@@ -244,19 +244,21 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 						}
 						s.expiredCh <- runner
 					})
+					runner.expiresAt = time.Now().Add(runner.sessionDuration)
 				} else {
-					slog.Debug("runner with non-zero duration has gone idle, resetting timer", "model", runner.model, "duration", runner.sessionDuration)
+					slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
 					runner.expireTimer.Reset(runner.sessionDuration)
+					runner.expiresAt = time.Now().Add(runner.sessionDuration)
 				}
 			}
-			slog.Debug("after processing request finished event", "model", runner.model, "refCount", runner.refCount)
+			slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount)
 			runner.refMu.Unlock()
 		case runner := <-s.expiredCh:
-			slog.Debug("runner expired event received", "model", runner.model)
+			slog.Debug("runner expired event received", "modelPath", runner.modelPath)
 			runner.refMu.Lock()
 			if runner.refCount > 0 {
 				// Shouldn't happen, but safeguard to ensure no leaked runners
-				slog.Debug("expired event with positive ref count, retrying", "model", runner.model, "refCount", runner.refCount)
+				slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
 				go func(runner *runnerRef) {
 					// We can't unload yet, but want to as soon as the current request completes
 					// So queue up another expired event
@@ -268,16 +270,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 			}
 
 			s.loadedMu.Lock()
-			slog.Debug("got lock to unload", "model", runner.model)
+			slog.Debug("got lock to unload", "modelPath", runner.modelPath)
 			finished := runner.waitForVRAMRecovery()
 			runner.unload()
-			delete(s.loaded, runner.model)
+			delete(s.loaded, runner.modelPath)
 			s.loadedMu.Unlock()
-			slog.Debug("runner released", "model", runner.model)
+			slog.Debug("runner released", "modelPath", runner.modelPath)
 			runner.refMu.Unlock()
 
 			<-finished
-			slog.Debug("sending an unloaded event", "model", runner.model)
+			slog.Debug("sending an unloaded event", "modelPath", runner.modelPath)
 			s.unloadedCh <- struct{}{}
 		}
 	}
@@ -316,18 +318,20 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 		req.errCh <- err
 		return
 	}
-	runner := &runnerRef{}
-	runner.model = req.model.ModelPath
-	runner.adapters = req.model.AdapterPaths
-	runner.projectors = req.model.ProjectorPaths
-	runner.llama = llama
-	runner.Options = &req.opts
-	runner.sessionDuration = req.sessionDuration
-	runner.gpus = gpus
-	runner.estimatedVRAM = llama.EstimatedVRAM()
-	runner.loading = true
-	runner.refCount = 1
+	runner := &runnerRef{
+		model:           req.model,
+		modelPath:       req.model.ModelPath,
+		llama:           llama,
+		Options:         &req.opts,
+		sessionDuration: req.sessionDuration,
+		gpus:            gpus,
+		estimatedVRAM:   llama.EstimatedVRAM(),
+		estimatedTotal:  llama.EstimatedTotal(),
+		loading:         true,
+		refCount:        1,
+	}
 	runner.refMu.Lock()
+
 	s.loadedMu.Lock()
 	s.loaded[req.model.ModelPath] = runner
 	slog.Info("loaded runners", "count", len(s.loaded))
@@ -339,7 +343,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 			slog.Error("error loading llama server", "error", err)
 			runner.refCount--
 			req.errCh <- err
-			slog.Debug("triggering expiration for failed load", "model", runner.model)
+			slog.Debug("triggering expiration for failed load", "model", runner.modelPath)
 			s.expiredCh <- runner
 			return
 		}
@@ -408,17 +412,18 @@ type runnerRef struct {
 	refCount uint // prevent unloading if > 0
 	// unloading bool      // set to true when we are trying to unload the runner
 
-	llama         llm.LlamaServer
-	loading       bool            // True only during initial load, then false forever
-	gpus          gpu.GpuInfoList // Recorded at time of provisioning
-	estimatedVRAM uint64
+	llama          llm.LlamaServer
+	loading        bool            // True only during initial load, then false forever
+	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	estimatedVRAM  uint64
+	estimatedTotal uint64
 
 	sessionDuration time.Duration
 	expireTimer     *time.Timer
+	expiresAt       time.Time
 
-	model      string
-	adapters   []string
-	projectors []string
+	model     *Model
+	modelPath string
 	*api.Options
 }
 
@@ -431,9 +436,8 @@ func (runner *runnerRef) unload() {
 	if runner.llama != nil {
 		runner.llama.Close()
 	}
+	runner.model = nil
 	runner.llama = nil
-	runner.adapters = nil
-	runner.projectors = nil
 	runner.Options = nil
 	runner.gpus = nil
 }
@@ -462,8 +466,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 
 	ctx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
-	if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
+	if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(runner.model.ProjectorPaths, req.model.ProjectorPaths) || // have the projectors changed?
 		!reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed?
 		runner.llama.Ping(ctx) != nil {
 		return true
diff --git a/server/sched_test.go b/server/sched_test.go
index 7e4faa61..6a6dd04f 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -164,7 +164,8 @@ func TestRequests(t *testing.T) {
 
 	// simple reload of same model
 	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
-	scenario2a.req.model = scenario1a.req.model
+	tmpModel := *scenario1a.req.model
+	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
 
 	// Multiple loaded models
@@ -496,10 +497,9 @@ func TestNeedsReload(t *testing.T) {
 	llm := &mockLlm{}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
-		adapters:   []string{"adapter1"},
-		projectors: []string{"projector1"},
-		Options:    &do,
-		llama:      llm,
+		model:   &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
+		Options: &do,
+		llama:   llm,
 	}
 	req := &LlmRequest{
 		model: &Model{
@@ -510,10 +510,10 @@ func TestNeedsReload(t *testing.T) {
 	}
 	resp := runner.needsReload(ctx, req)
 	require.True(t, resp)
-	req.model.AdapterPaths = runner.adapters
+	req.model.AdapterPaths = runner.model.AdapterPaths
 	resp = runner.needsReload(ctx, req)
 	require.True(t, resp)
-	req.model.ProjectorPaths = runner.projectors
+	req.model.ProjectorPaths = runner.model.ProjectorPaths
 	runner.loading = true
 	req.opts.NumBatch = 1234
 	resp = runner.needsReload(ctx, req)
@@ -558,11 +558,11 @@ func TestUnloadAllRunners(t *testing.T) {
 func TestUnload(t *testing.T) {
 	llm1 := &mockLlm{}
 	r1 := &runnerRef{llama: llm1}
-	r2 := &runnerRef{adapters: []string{"A"}}
+	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
 	require.True(t, llm1.closeCalled)
 	r2.unload()
-	require.Nil(t, r2.adapters)
+	require.Nil(t, r2.model)
 }
 
 type mockLlm struct {
@@ -578,6 +578,7 @@ type mockLlm struct {
 	closeResp         error
 	closeCalled       bool
 	estimatedVRAM     uint64
+	estimatedTotal    uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
@@ -598,4 +599,5 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }

From f1548ef62d03ffefc6b23c615c1361e4cc2b6bea Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Mon, 13 May 2024 18:01:13 -0700
Subject: [PATCH 08/18] update the FAQ to be more clear about windows env
 variables (#4415)

---
 docs/faq.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/docs/faq.md b/docs/faq.md
index 3fe3da89..22bd4da7 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -80,17 +80,19 @@ If Ollama is run as a systemd service, environment variables should be set using
 
 ### Setting environment variables on Windows
 
-On windows, Ollama inherits your user and system environment variables.
+On Windows, Ollama inherits your user and system environment variables.
 
-1. First Quit Ollama by clicking on it in the task bar
+1. First Quit Ollama by clicking on it in the task bar.
 
-2. Edit system environment variables from the control panel
+2. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
 
-3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
+3. Click on _Edit environment variables for your account_.
 
-4. Click OK/Apply to save
+4. Edit or create a new variable for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
 
-5. Run `ollama` from a new terminal window
+5. Click OK/Apply to save.
+
+6. Start the Ollama application from the Windows Start menu.
 
 
 ## How can I expose Ollama on my network?
@@ -237,4 +239,4 @@ If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` AP
 
 If too many requests are sent to the server, it will respond with a 503 error
 indicating the server is overloaded.  You can adjust how many requests may be
-queue by setting `OLLAMA_MAX_QUEUE`
\ No newline at end of file
+queue by setting `OLLAMA_MAX_QUEUE`

From 7ca71a6b0fd4842e13cf9166f97ac2b58d4f874f Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Mon, 13 May 2024 18:48:28 -0700
Subject: [PATCH 09/18] don't abort when an invalid model name is used in /save
 (#4416)

---
 cmd/interactive.go         | 6 +++++-
 server/routes.go           | 3 ++-
 types/errtypes/errtypes.go | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index 21c0a6a9..c078650a 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -17,6 +17,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
+	"github.com/ollama/ollama/types/errtypes"
 )
 
 type MultilineState int
@@ -281,7 +282,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			fn := func(resp api.ProgressResponse) error { return nil }
 			err = client.Create(cmd.Context(), req, fn)
 			if err != nil {
-				fmt.Println("error: couldn't save model")
+				if strings.Contains(err.Error(), errtypes.InvalidModelNameErrMsg) {
+					fmt.Printf("error: The model name '%s' is invalid\n", args[1])
+					continue
+				}
 				return err
 			}
 			fmt.Printf("Created new model '%s'\n", args[1])
diff --git a/server/routes.go b/server/routes.go
index d60a4d08..aaa461c2 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/envconfig"
+	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -517,7 +518,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 
 	name := model.ParseName(cmp.Or(req.Model, req.Name))
 	if !name.IsValid() {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": errtypes.InvalidModelNameErrMsg})
 		return
 	}
 
diff --git a/types/errtypes/errtypes.go b/types/errtypes/errtypes.go
index e3a18d0b..d3073114 100644
--- a/types/errtypes/errtypes.go
+++ b/types/errtypes/errtypes.go
@@ -7,6 +7,7 @@ import (
 )
 
 const UnknownOllamaKeyErrMsg = "unknown ollama key"
+const InvalidModelNameErrMsg = "invalid model name"
 
 // TODO: This should have a structured response from the API
 type UnknownOllamaKey struct {

From ec231a7923ab213f7bb64f3c2ecbead394a47ef0 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 14 May 2024 09:48:13 -0700
Subject: [PATCH 10/18] Remove VRAM convergence check for windows

The APIs we query are optimistic on free space, and windows pages
VRAM, so we don't have to wait to see reported usage recover on unload
---
 server/sched.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 198f0aca..ceddc526 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log/slog"
 	"reflect"
+	"runtime"
 	"sort"
 	"strings"
 	"sync"
@@ -487,8 +488,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	finished := make(chan interface{}, 1)
 
-	// CPU or Metal don't need checking, so no waiting required
-	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+	// CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
+	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
 		finished <- struct{}{}
 		return finished
 	}

From 798b107f19ed832d33a6816f11363b42888aaed3 Mon Sep 17 00:00:00 2001
From: Ryo Machida <55428929+machimachida@users.noreply.github.com>
Date: Wed, 15 May 2024 03:18:10 +0900
Subject: [PATCH 11/18] Fixed the API endpoint /api/tags when the model list is
 empty. (#4424)

* Fixed the API endpoint /api/tags to return {models: []} instead of {models: null} when the model list is empty.

* Update server/routes.go

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
---
 server/routes.go      | 2 +-
 server/routes_test.go | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/server/routes.go b/server/routes.go
index aaa461c2..123ef9a3 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -725,7 +725,7 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 		return
 	}
 
-	var models []api.ModelResponse
+	models := []api.ModelResponse{}
 	if err := filepath.Walk(manifests, func(path string, info os.FileInfo, _ error) error {
 		if !info.IsDir() {
 			rel, err := filepath.Rel(manifests, path)
diff --git a/server/routes_test.go b/server/routes_test.go
index 896dc27b..e144c957 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -95,6 +95,7 @@ func Test_Routes(t *testing.T) {
 				err = json.Unmarshal(body, &modelList)
 				assert.Nil(t, err)
 
+				assert.NotNil(t, modelList.Models)
 				assert.Equal(t, 0, len(modelList.Models))
 			},
 		},

From a4b8d1f89a73aab6dda01dae584c4e27aa4d632a Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Tue, 14 May 2024 11:38:20 -0700
Subject: [PATCH 12/18] re-add system context (#4435)

---
 cmd/interactive.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index c078650a..1078590c 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -292,6 +292,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			continue
 		case strings.HasPrefix(line, "/clear"):
 			opts.Messages = []api.Message{}
+			if opts.System != "" {
+				newMessage := api.Message{Role: "system", Content: opts.System}
+				opts.Messages = append(opts.Messages, newMessage)
+			}
 			fmt.Println("Cleared session context")
 			continue
 		case strings.HasPrefix(line, "/set"):

From ac145f75ca19ccdebd5277ae4c315d9e67f0b379 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 May 2024 12:41:03 -0700
Subject: [PATCH 13/18] return on part done

---
 server/download.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/download.go b/server/download.go
index 935af9c1..db4d1f4e 100644
--- a/server/download.go
+++ b/server/download.go
@@ -221,7 +221,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 		}
 		defer resp.Body.Close()
 
-		n, err := io.Copy(w, io.TeeReader(resp.Body, part))
+		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size)
 		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
 			// rollback progress
 			b.Completed.Add(-n)

From c344da4c5a09521fce0c825dcef4df1c155b150d Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Tue, 14 May 2024 15:17:04 -0700
Subject: [PATCH 14/18] fix keepalive for non-interactive mode (#4438)

---
 cmd/cmd.go | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index f1429e1c..cae35f51 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -905,14 +905,15 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}
 
 	request := api.GenerateRequest{
-		Model:    opts.Model,
-		Prompt:   opts.Prompt,
-		Context:  generateContext,
-		Images:   opts.Images,
-		Format:   opts.Format,
-		System:   opts.System,
-		Template: opts.Template,
-		Options:  opts.Options,
+		Model:     opts.Model,
+		Prompt:    opts.Prompt,
+		Context:   generateContext,
+		Images:    opts.Images,
+		Format:    opts.Format,
+		System:    opts.System,
+		Template:  opts.Template,
+		Options:   opts.Options,
+		KeepAlive: opts.KeepAlive,
 	}
 
 	if err := client.Generate(ctx, &request, fn); err != nil {

From f2cf97d6f111031a712881eccb5fbe90fac787c7 Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Tue, 14 May 2024 15:34:29 -0700
Subject: [PATCH 15/18] fix typo in modelfile generation (#4439)

---
 server/routes.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/routes.go b/server/routes.go
index 123ef9a3..42b5c910 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -709,7 +709,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	}
 
 	var sb strings.Builder
-	fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
+	fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
 	fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
 	fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
 	fmt.Fprint(&sb, model.String())

From 853ae490e162e8703fde5425fead1ea8da09fdcf Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Wed, 15 May 2024 14:42:57 -0700
Subject: [PATCH 16/18] Sanitize the env var debug log

Only dump env vars we care about in the logs
---
 llm/server.go | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 87f5b545..11969997 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -317,8 +317,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		slog.Info("starting llama server", "cmd", s.cmd.String())
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", s.cmd.Env)
+		if envconfig.Debug {
+			filteredEnv := []string{}
+			for _, ev := range s.cmd.Env {
+				if strings.HasPrefix(ev, "CUDA_") ||
+					strings.HasPrefix(ev, "ROCM_") ||
+					strings.HasPrefix(ev, "HIP_") ||
+					strings.HasPrefix(ev, "HSA_") ||
+					strings.HasPrefix(ev, "GGML_") ||
+					strings.HasPrefix(ev, "PATH=") ||
+					strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+					filteredEnv = append(filteredEnv, ev)
+				}
+			}
+			// Log at debug as the environment is inherited and might contain sensitive information
+			slog.Debug("subprocess", "environment", filteredEnv)
+		}
 
 		if err = s.cmd.Start(); err != nil {
 			// Detect permission denied and augment them essage about noexec

From d1692fd3e0b4a80ff55ba052b430207134df4714 Mon Sep 17 00:00:00 2001
From: Patrick Devine <pdevine@sonic.net>
Date: Wed, 15 May 2024 15:43:16 -0700
Subject: [PATCH 17/18] fix the cpu estimatedTotal memory + get the expiry time
 for loading models (#4461)

---
 llm/server.go    | 1 +
 server/routes.go | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index 11969997..ccb1e419 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -89,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
+		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
diff --git a/server/routes.go b/server/routes.go
index 42b5c910..e991e774 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1161,6 +1161,14 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
+		// The scheduler waits to set expiresAt, so if a model is loading it's
+		// possible that it will be set to the unix epoch. For those cases, just
+		// calculate the time w/ the sessionDuration instead.
+		var epoch time.Time
+		if v.expiresAt == epoch {
+			mr.ExpiresAt = time.Now().Add(v.sessionDuration)
+		}
+
 		models = append(models, mr)
 	}
 

From c48c1d7c4672a2483ddd938825bd5e5bac5dc6f1 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Wed, 15 May 2024 15:56:43 -0700
Subject: [PATCH 18/18] Port cuda/rocm skip build vars to linux

Windows already implements these, carry over to linux.
---
 llm/generate/gen_linux.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 63668bd2..c20a2568 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -156,7 +156,7 @@ if [ -z "${CUDART_LIB_DIR}" ]; then
     CUDART_LIB_DIR="${CUDA_LIB_DIR}"
 fi
 
-if [ -d "${CUDA_LIB_DIR}" ]; then
+if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
@@ -218,7 +218,7 @@ if [ -z "${CLBlast_DIR}" ]; then
     fi
 fi
 
-if [ -d "${ROCM_PATH}" ]; then
+if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
     echo "ROCm libraries detected - building dynamic ROCm library"
     if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)