From db2ffa79f10ebcb6cd702bfceb3533a97e892409 Mon Sep 17 00:00:00 2001
From: Tim Scheuermann
Date: Fri, 24 May 2024 20:30:42 +0200
Subject: [PATCH 1/5] Fix download retry issue

---
 server/download.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/download.go b/server/download.go
index db4d1f4e..5a735abf 100644
--- a/server/download.go
+++ b/server/download.go
@@ -221,7 +221,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 	}
 	defer resp.Body.Close()

-	n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size)
+	n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed)
 	if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
 		// rollback progress
 		b.Completed.Add(-n)
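
The one-line change above is easy to misread, so here is the arithmetic spelled out: a chunk that failed mid-transfer is retried with part.Completed bytes already written, and the resumed response body only carries the tail of the chunk, so asking io.CopyN for the full part.Size reads past the end of the body. A minimal, self-contained sketch of the resume math (the part struct below is a simplification for illustration, not ollama's actual blobDownloadPart):

// Sketch of the resume arithmetic behind PATCH 1/5; simplified types.
package main

import "fmt"

type part struct {
	Size      int64 // total bytes this chunk covers
	Completed int64 // bytes already written by a previous attempt
}

// remaining reports how many bytes a retried request still needs to copy.
// Passing part.Size here instead, as the old code did, asks io.CopyN for
// bytes the resumed response body no longer contains.
func remaining(p part) int64 {
	return p.Size - p.Completed
}

func main() {
	p := part{Size: 64 << 20, Completed: 24 << 20} // 64 MiB chunk, 24 MiB done
	fmt.Println(remaining(p))                      // 41943040: copy only the unfinished tail
}
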
"The host:port or base URL of the Ollama server (e.g. http://localhost:11434)"} - ollamaNoHistoryEnv := EnvironmentVar{"OLLAMA_NOHISTORY", "Disable readline history"} - envs := []EnvironmentVar{ollamaHostEnv} + envVars := envconfig.AsMap() + + envs := []envconfig.EnvVar{envVars["OLLAMA_HOST"]} for _, cmd := range []*cobra.Command{ createCmd, @@ -1247,10 +1234,24 @@ Environment Variables: psCmd, copyCmd, deleteCmd, + serveCmd, } { switch cmd { case runCmd: - appendEnvDocs(cmd, []EnvironmentVar{ollamaHostEnv, ollamaNoHistoryEnv}) + appendEnvDocs(cmd, []envconfig.EnvVar{envVars["OLLAMA_HOST"], envVars["OLLAMA_NOHISTORY"]}) + case serveCmd: + appendEnvDocs(cmd, []envconfig.EnvVar{ + envVars["OLLAMA_DEBUG"], + envVars["OLLAMA_HOST"], + envVars["OLLAMA_KEEP_ALIVE"], + envVars["OLLAMA_MAX_LOADED_MODELS"], + envVars["OLLAMA_MAX_QUEUE"], + envVars["OLLAMA_MODELS"], + envVars["OLLAMA_NUM_PARALLEL"], + envVars["OLLAMA_NOPRUNE"], + envVars["OLLAMA_ORIGINS"], + envVars["OLLAMA_TMPDIR"], + }) default: appendEnvDocs(cmd, envs) } diff --git a/cmd/interactive.go b/cmd/interactive.go index 0a31efb5..c055df0e 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -15,6 +15,7 @@ import ( "golang.org/x/exp/slices" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/readline" "github.com/ollama/ollama/types/errtypes" @@ -183,7 +184,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { return err } - if os.Getenv("OLLAMA_NOHISTORY") != "" { + if envconfig.NoHistory { scanner.HistoryDisable() } diff --git a/server/envconfig/config.go b/envconfig/config.go similarity index 66% rename from server/envconfig/config.go rename to envconfig/config.go index ae7d89b2..2304c4f0 100644 --- a/server/envconfig/config.go +++ b/envconfig/config.go @@ -15,6 +15,10 @@ var ( AllowOrigins []string // Set via OLLAMA_DEBUG in the environment Debug bool + // Experimental flash attention + FlashAttention bool + // Set via OLLAMA_KEEP_ALIVE in the environment + KeepAlive string // Set via OLLAMA_LLM_LIBRARY in the environment LLMLibrary string // Set via OLLAMA_MAX_LOADED_MODELS in the environment @@ -23,6 +27,8 @@ var ( MaxQueuedRequests int // Set via OLLAMA_MAX_VRAM in the environment MaxVRAM uint64 + // Set via OLLAMA_NOHISTORY in the environment + NoHistory bool // Set via OLLAMA_NOPRUNE in the environment NoPrune bool // Set via OLLAMA_NUM_PARALLEL in the environment @@ -31,26 +37,42 @@ var ( RunnersDir string // Set via OLLAMA_TMPDIR in the environment TmpDir string - // Experimental flash attention - FlashAttention bool ) -func AsMap() map[string]string { - return map[string]string{ - "OLLAMA_ORIGINS": fmt.Sprintf("%v", AllowOrigins), - "OLLAMA_DEBUG": fmt.Sprintf("%v", Debug), - "OLLAMA_LLM_LIBRARY": fmt.Sprintf("%v", LLMLibrary), - "OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners), - "OLLAMA_MAX_QUEUE": fmt.Sprintf("%v", MaxQueuedRequests), - "OLLAMA_MAX_VRAM": fmt.Sprintf("%v", MaxVRAM), - "OLLAMA_NOPRUNE": fmt.Sprintf("%v", NoPrune), - "OLLAMA_NUM_PARALLEL": fmt.Sprintf("%v", NumParallel), - "OLLAMA_RUNNERS_DIR": fmt.Sprintf("%v", RunnersDir), - "OLLAMA_TMPDIR": fmt.Sprintf("%v", TmpDir), - "OLLAMA_FLASH_ATTENTION": fmt.Sprintf("%v", FlashAttention), +type EnvVar struct { + Name string + Value any + Description string +} + +func AsMap() map[string]EnvVar { + return map[string]EnvVar{ + "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. 
OLLAMA_DEBUG=1)"}, + "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, + "OLLAMA_HOST": {"OLLAMA_HOST", "", "IP Address for the ollama server (default 127.0.0.1:11434)"}, + "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, + "OLLAMA_LLM_LIBRARY": {"OLLAMA_ORIGINS", LLMLibrary, ""}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models (default 1)"}, + "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, + "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, ""}, + "OLLAMA_MODELS": {"OLLAMA_MODELS", "", "The path to the models directory"}, + "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, + "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"}, + "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, + "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, ""}, + "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, } } +func Values() map[string]string { + vals := make(map[string]string) + for k, v := range AsMap() { + vals[k] = fmt.Sprintf("%v", v.Value) + } + return vals +} + var defaultAllowOrigins = []string{ "localhost", "127.0.0.1", @@ -147,6 +169,10 @@ func LoadConfig() { } } + if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" { + NoHistory = true + } + if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" { NoPrune = true } @@ -181,4 +207,6 @@ func LoadConfig() { MaxQueuedRequests = p } } + + KeepAlive = clean("OLLAMA_KEEP_ALIVE") } diff --git a/server/envconfig/config_test.go b/envconfig/config_test.go similarity index 100% rename from server/envconfig/config_test.go rename to envconfig/config_test.go diff --git a/gpu/assets.go b/gpu/assets.go index 911a6977..e3fbe47c 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -13,7 +13,7 @@ import ( "syscall" "time" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" ) var ( diff --git a/gpu/gpu.go b/gpu/gpu.go index 781e23df..74160b60 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -21,7 +21,7 @@ import ( "unsafe" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" ) type handles struct { diff --git a/llm/memory.go b/llm/memory.go index acc2dd0b..ff64baf1 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -7,7 +7,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" ) // This algorithm looks for a complete fit to determine if we need to unload other models diff --git a/llm/server.go b/llm/server.go index 384d31ca..36fc727c 100644 --- a/llm/server.go +++ b/llm/server.go @@ -26,7 +26,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" ) type LlamaServer interface { diff --git a/server/images.go b/server/images.go index 520c899b..cf6edf95 100644 --- a/server/images.go +++ b/server/images.go @@ -28,7 +28,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/llm" 
"github.com/ollama/ollama/parser" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" diff --git a/server/routes.go b/server/routes.go index 4b3239e1..4ac284a4 100644 --- a/server/routes.go +++ b/server/routes.go @@ -26,11 +26,11 @@ import ( "golang.org/x/exp/slices" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" - "github.com/ollama/ollama/server/envconfig" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" @@ -315,10 +315,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { } func getDefaultSessionDuration() time.Duration { - if t, exists := os.LookupEnv("OLLAMA_KEEP_ALIVE"); exists { - v, err := strconv.Atoi(t) + if envconfig.KeepAlive != "" { + v, err := strconv.Atoi(envconfig.KeepAlive) if err != nil { - d, err := time.ParseDuration(t) + d, err := time.ParseDuration(envconfig.KeepAlive) if err != nil { return defaultSessionDuration } @@ -1025,7 +1025,7 @@ func Serve(ln net.Listener) error { level = slog.LevelDebug } - slog.Info("server config", "env", envconfig.AsMap()) + slog.Info("server config", "env", envconfig.Values()) handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ Level: level, AddSource: true, diff --git a/server/routes_test.go b/server/routes_test.go index a48819fe..37671d0c 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -209,14 +209,14 @@ func Test_Routes(t *testing.T) { }, } + t.Setenv("OLLAMA_MODELS", t.TempDir()) + s := &Server{} router := s.GenerateRoutes() httpSrv := httptest.NewServer(router) t.Cleanup(httpSrv.Close) - t.Setenv("OLLAMA_MODELS", t.TempDir()) - for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { u := httpSrv.URL + tc.Path diff --git a/server/sched.go b/server/sched.go index 8b97e354..8c72177f 100644 --- a/server/sched.go +++ b/server/sched.go @@ -16,7 +16,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" "golang.org/x/exp/slices" ) diff --git a/server/sched_test.go b/server/sched_test.go index addc1ad8..3ee1b989 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -15,7 +15,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/server/envconfig" + "github.com/ollama/ollama/envconfig" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) From 8f440d579aad22faf191ef2f7e9b38b4f614e070 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 24 May 2024 16:01:37 -0700 Subject: [PATCH 3/5] fix q5_0, q5_1 --- llm/ggml.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/ggml.go b/llm/ggml.go index 9b6da425..3127eefc 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -127,7 +127,7 @@ func (t Tensor) blockSize() uint64 { switch t.Kind { case 0, 1, 24, 25, 26, 27, 28, 31: // F32, F16, I8, I16, I32, I64, F64, BF16 return 1 - case 2, 3, 8, 9, 20: // Q4_0, Q4_1, Q8_0, Q8_1, IQ4_NL + case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL return 32 default: // All others return 256 From d51f15257c24e3954d828bde3f25348c7561f440 Mon Sep 17 00:00:00 2001 From: 
From c4209d6d21dab00bb02713757e729a2449082b1f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 25 May 2024 09:23:28 -0700
Subject: [PATCH 5/5] Report better warning on client closed abort of load

If the client closes the connection before we finish loading the model,
we abort the load; let's make the log message clearer to help users
understand this failure mode.
---
 llm/server.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/server.go b/llm/server.go
index 384d31ca..5ca9411e 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -528,7 +528,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	for {
 		select {
 		case <-ctx.Done():
-			slog.Info("context expired before server started")
+			slog.Warn("client connection closed before server finished loading, aborting load")
 			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
 		case err := <-s.done:
 			msg := ""
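
The hunk above shows the shape of the wait loop: the load is polled inside a select that also watches the request context and the runner's exit channel. Because the context is tied to the client's connection, a client that disconnects mid-load cancels the context, and the old "context expired" wording made that look like a server-side failure. A condensed, self-contained sketch of such a loop (simplified names; not the actual llmServer implementation, whose status handling after s.done fires is elided here):

// Condensed sketch of a WaitUntilRunning-style loop; illustrative only.
package main

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"
)

func waitUntilRunning(ctx context.Context, done <-chan error, ready func() bool) error {
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// A canceled request context usually means the client closed
			// the connection before the model finished loading.
			slog.Warn("client connection closed before server finished loading, aborting load")
			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
		case err := <-done:
			return fmt.Errorf("llama runner terminated: %w", err)
		case <-ticker.C:
			if ready() {
				return nil // server came up before the client went away
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Millisecond)
	defer cancel()
	err := waitUntilRunning(ctx, make(chan error), func() bool { return false })
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true
}
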