diff --git a/envconfig/config.go b/envconfig/config.go
index 868813ae..7fc01887 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -185,8 +185,6 @@ var (
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
-	// Enable the new memory estimation logic
-	NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
 )
 
 func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_NEW_ESTIMATES":   {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
 
 		// Informational
 		"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
diff --git a/llm/server.go b/llm/server.go
index a22ae972..5caf1987 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -162,11 +162,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}
 
-	newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
-	if newEstimates {
-		slog.Info("enabling new memory estimates")
-	}
-
 	// Verify the requested context size is <= the model training size
 	trainCtx := f.KV().ContextLength()
 	if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -434,7 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}()
 
-	if newEstimates {
+	if textProcessor != nil {
 		return &ollamaServer{llmServer: s}, nil
 	} else {
 		return &llamaServer{llmServer: s, ggml: f}, nil
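
Net effect of the `llm/server.go` change: runner selection no longer consults an environment flag. As the diff's usage of `MultiUserCache()` and `NewEngine()` suggests, `envconfig.Bool` (like `Uint` and `String`) returns a closure evaluated at call time, which is why the removed flag was invoked as `envconfig.NewMemoryEstimates()`; with it gone, the presence of a text processor alone decides whether the new-engine path (and its new memory estimates) is used. Below is a minimal, self-contained sketch of that post-change selection pattern — the `TextProcessor`, `Server`, and `pickServer` names here are illustrative stand-ins, not the actual types from `llm/server.go`:

```go
package main

import "fmt"

// TextProcessor is an illustrative stand-in for the real type in llm/server.go.
type TextProcessor interface {
	Decode(tokens []int32) (string, error)
}

// Server is a hypothetical common interface for the two runner wrappers.
type Server interface {
	Kind() string
}

type ollamaServer struct{} // new engine path: new memory estimates
type llamaServer struct{}  // legacy path: GGML-based estimates

func (ollamaServer) Kind() string { return "ollamaServer (new estimates)" }
func (llamaServer) Kind() string  { return "llamaServer (legacy estimates)" }

// pickServer mirrors the post-change logic: no environment flag is
// consulted; a non-nil text processor alone selects the new-engine path.
func pickServer(tp TextProcessor) Server {
	if tp != nil {
		return ollamaServer{}
	}
	return llamaServer{}
}

func main() {
	// A model loaded without a text processor keeps the legacy runner.
	fmt.Println(pickServer(nil).Kind()) // llamaServer (legacy estimates)
}
```

The design consequence matches the `envconfig` half of the diff: since the decision is now structural (which engine loaded the model) rather than opt-in, the `OLLAMA_NEW_ESTIMATES` variable and its `AsMap` entry have no remaining readers and can be deleted outright.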