From 72700279e260694e6df0a24a672f0a6f3e3dc3bf Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 7 May 2024 16:46:15 -0700
Subject: [PATCH 1/5] Detect noexec and report a better error

This will bubble up a much more informative error message if noexec
is preventing us from running the subprocess
---
 llm/server.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index e2402256..b23a7749 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -307,6 +307,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		slog.Debug("subprocess", "environment", s.cmd.Env)
 
 		if err = s.cmd.Start(); err != nil {
+			// Detect permission denied and augment the message about noexec
+			if errors.Is(err, os.ErrPermission) {
+				finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				continue
+			}
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
@@ -382,6 +387,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
+		if s.cmd.ProcessState.ExitCode() == -1 {
+			// Most likely a signal killed it, log some more details to try to help troubleshoot
+			slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+		}
 		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 	}
 

From 486a2c1d947880d22275756f70b96953ee1a2e40 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 8 May 2024 08:47:09 -0700
Subject: [PATCH 2/5] types/model: fix tag case

---
 types/model/name.go      | 12 +++++++-----
 types/model/name_test.go | 14 ++++++++++++--
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/types/model/name.go b/types/model/name.go
index 6d2a187b..b79374c3 100644
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -290,12 +290,14 @@ func (n Name) Filepath() string {
 	if !n.IsFullyQualified() {
 		panic("illegal attempt to get filepath of invalid name")
 	}
-	return strings.ToLower(filepath.Join(
-		n.Host,
-		n.Namespace,
-		n.Model,
+	return filepath.Join(
+		strings.ToLower(filepath.Join(
+			n.Host,
+			n.Namespace,
+			n.Model,
+		)),
 		n.Tag,
-	))
+	)
 }
 
 // LogValue returns a slog.Value that represents the name as a string.
diff --git a/types/model/name_test.go b/types/model/name_test.go
index 19bc2e2d..fb584291 100644
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -19,6 +19,16 @@ func TestParseNameParts(t *testing.T) {
 		wantFilepath    string
 		wantValidDigest bool
 	}{
+		{
+			in: "registry.ollama.ai/library/dolphin-mistral:7b-v2.6-dpo-laser-q6_K",
+			want: Name{
+				Host:      "registry.ollama.ai",
+				Namespace: "library",
+				Model:     "dolphin-mistral",
+				Tag:       "7b-v2.6-dpo-laser-q6_K",
+			},
+			wantFilepath: filepath.Join("registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"),
+		},
 		{
 			in: "scheme://host:port/namespace/model:tag",
 			want: Name{
@@ -266,9 +276,9 @@ func TestFilepathAllocs(t *testing.T) {
 	allocs := testing.AllocsPerRun(1000, func() {
 		n.Filepath()
 	})
-	allowedAllocs := 2.0
+	var allowedAllocs float64 = 3
 	if runtime.GOOS == "windows" {
-		allowedAllocs = 4
+		allowedAllocs = 5
 	}
 	if allocs > allowedAllocs {
 		t.Errorf("allocs = %v; allowed %v", allocs, allowedAllocs)

From b25976aeb8542d56f48bcb97003b256cba0f6237 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 8 May 2024 12:42:48 -0700
Subject: [PATCH 3/5] routes: fix show llava models

---
 server/images.go | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/server/images.go b/server/images.go
index a96db8d1..2be1d366 100644
--- a/server/images.go
+++ b/server/images.go
@@ -68,6 +68,20 @@ func (m *Model) String() string {
 		Args: m.ModelPath,
 	})
 
+	for _, adapter := range m.AdapterPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "adapter",
+			Args: adapter,
+		})
+	}
+
+	for _, projector := range m.ProjectorPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "model",
+			Args: projector,
+		})
+	}
+
 	if m.Template != "" {
 		modelfile.Commands = append(modelfile.Commands, model.Command{
 			Name: "template",
@@ -82,20 +96,6 @@ func (m *Model) String() string {
 		})
 	}
 
-	for _, adapter := range m.AdapterPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "adapter",
-			Args: adapter,
-		})
-	}
-
-	for _, projector := range m.ProjectorPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "projector",
-			Args: projector,
-		})
-	}
-
 	for k, v := range m.Options {
 		switch v := v.(type) {
 		case []any:

From cef45feaa4bb340e59dadf2bc504e2e39f32baa7 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Wed, 8 May 2024 13:14:00 -0700
Subject: [PATCH 4/5] Add preflight OPTIONS handling and update CORS config (#4086)

* Add preflight OPTIONS handling and update CORS config

- Implement early return with HTTP 204 (No Content) for OPTIONS requests in allowedHostsMiddleware to optimize preflight handling.

- Extend CORS configuration to explicitly allow 'Authorization' headers and 'OPTIONS' method when OLLAMA_ORIGINS environment variable is set.
* allow auth, content-type, and user-agent headers

* Update routes.go
---
 server/routes.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/server/routes.go b/server/routes.go
index e0459271..7dfeb513 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -935,6 +935,11 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
 		}
 
 		if allowedHost(host) {
+			if c.Request.Method == "OPTIONS" {
+				c.AbortWithStatus(http.StatusNoContent)
+				return
+			}
+
 			c.Next()
 			return
 		}
@@ -947,6 +952,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
+	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
 	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()

From bee2f4a3b0e5dbf6611a399cd6b8f6b176b9d376 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 4 May 2024 09:15:31 -0700
Subject: [PATCH 5/5] Record GPU usage information

This records more GPU usage information for eventual UX inclusion.
---
 format/bytes.go |  2 ++
 llm/memory.go   | 24 ++++++++++++------------
 llm/server.go   | 34 ++++++++++++++++++++++++++--------
 3 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/format/bytes.go b/format/bytes.go
index 9fdc8bcf..13d8575e 100644
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
diff --git a/llm/memory.go b/llm/memory.go
index 005a15aa..6890b08c 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
diff --git a/llm/server.go b/llm/server.go
index e2402256..d96be9a0 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
 
 	sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
@@ -133,6 +139,10 @@
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
 
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@
 		}
 
 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
 		}
 
 		s.cmd.Env = os.Environ()