From 40b8fdbdcacb41b9cf42869051df765f66750036 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 3 Apr 2025 10:25:23 -0700 Subject: [PATCH 1/7] arange --- kvcache/causal_test.go | 11 +++++++++++ ml/backend.go | 3 +++ ml/backend/ggml/ggml.go | 26 ++++++++++++++++++++++++++ model/models/gemma3/model_vision.go | 11 +---------- model/models/mllama/model.go | 11 +---------- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 07bc788b..af01bb6f 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { return out, nil } +func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { + s := make([]float32, 0, int((stop-start)/step)) + for i := start; i < stop; i += step { + s = append(s, i) + } + + out, _ := c.FromFloatSlice(s, len(s)) + out.(*testTensor).dtype = dtype + return out +} + func (c *testContext) Input() ml.Context { return c } func (c *testContext) Layer(int) ml.Context { return c } diff --git a/ml/backend.go b/ml/backend.go index b2a83cfd..70c2fd8e 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -95,6 +95,9 @@ type Context interface { FromFloatSlice(s []float32, shape ...int) (Tensor, error) FromIntSlice(s []int32, shape ...int) (Tensor, error) + // Arange creates a 1D tensor with values in the half-open interval [start, stop), increasing by step. + Arange(start, stop, step float32, dtype DType) Tensor + Forward(...Tensor) Context Compute(...Tensor) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 94fc87a3..c486b747 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { return t, nil } +func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { + switch dtype { + case ml.DTypeF32: + // ggml_arange creates a float32 tensor + return &Tensor{ + b: c.b, + t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)), + } + case ml.DTypeI32: + // ggml_cast does not support float32 to int32 conversion + arange := make([]int32, 0, int((stop-start)/step)) + for i := start; i < stop; i += step { + arange = append(arange, int32(i)) + } + + t, err := c.Input().FromIntSlice(arange, len(arange)) + if err != nil { + panic(err) + } + + return t + default: + panic("unsupported dtype for arange") + } +} + func (c *Context) Close() { if c != nil { for _, b := range *c.allocatedBuffers { diff --git a/model/models/gemma3/model_vision.go b/model/models/gemma3/model_vision.go index 636a363d..8b1a8eb0 100644 --- a/model/models/gemma3/model_vision.go +++ b/model/models/gemma3/model_vision.go @@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) - positions := make([]int32, numPatches) - for i := range positions { - positions[i] = int32(i) - } - - positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions)) - if err != nil { - panic(err) - } - + positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32) hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs)) for _, layer := range m.Layers { diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index e53eb184..a0fc6b69 100644 --- a/model/models/mllama/model.go +++ 
b/model/models/mllama/model.go @@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - positions := make([]int32, 1601) - for i := range positions { - positions[i] = int32(i) - } - - positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions)) - if err != nil { - return nil, err - } - + positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32) crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio) return m.Projector.Forward(ctx, crossAttentionStates), nil } From 4e535e618846ffb00a2a6714c07847d6d2951453 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Fri, 18 Apr 2025 18:12:28 -0700 Subject: [PATCH 2/7] server/internal/registry: make pull send errors with Error field (#10326) Previously, the pull handler would send an error message in the Status field, which prevented the client from using the message as a signal to stop. In the case of the "run" command, it would follow the pull with a "show", which would print a nearly identical "not found" message for unresolved models. Fixes #10307 --- server/internal/registry/server.go | 12 +++++++----- server/internal/registry/server_test.go | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/server/internal/registry/server.go b/server/internal/registry/server.go index bd5f7dcd..af26fe1d 100644 --- a/server/internal/registry/server.go +++ b/server/internal/registry/server.go @@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error { } type progressUpdateJSON struct { + Error string `json:"error,omitempty,omitzero"` Status string `json:"status,omitempty,omitzero"` Digest blob.Digest `json:"digest,omitempty,omitzero"` Total int64 `json:"total,omitempty,omitzero"` @@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error { case err := <-done: flushProgress() if err != nil { - var status string if errors.Is(err, ollama.ErrModelNotFound) { - status = fmt.Sprintf("error: model %q not found", p.model()) + return &serverError{ + Status: 404, + Code: "not_found", + Message: fmt.Sprintf("model %q not found", p.model()), + } } else { - status = fmt.Sprintf("error: %v", err) + return err } - enc.Encode(progressUpdateJSON{Status: status}) - return nil } // Emulate old client pull progress (for now): diff --git a/server/internal/registry/server_test.go b/server/internal/registry/server_test.go index 61b57f11..15d8d828 100644 --- a/server/internal/registry/server_test.go +++ b/server/internal/registry/server_test.go @@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) { got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`) checkResponse(got, ` - {"status":"error: model \"unknown\" not found"} + {"code":"not_found","error":"model \"unknown\" not found"} `) got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`) @@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) { got = s.send(t, "POST", "/api/pull", `{"model": "://"}`) checkResponse(got, ` - {"status":"error: invalid or missing name: \"\""} + {"code":"bad_request","error":"invalid or missing name: \"\""} `) // Non-streaming pulls From 88738b357bcd25eea860b59bf7de2f6b94cfc352 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 18 Apr 2025 16:32:48 -0700 Subject: [PATCH 3/7] create tempdir in models directory the models directory should have plenty of storage, and this also ensures there's no cross-device copy --- server/create.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/create.go 
b/server/create.go index 4294554b..014e9916 100644 --- a/server/create.go +++ b/server/create.go @@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string { } func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) { - tmpDir, err := os.MkdirTemp("", "ollama-safetensors") + tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors") if err != nil { return nil, err } From 08065216425ba73828805756118f26b61cd03f28 Mon Sep 17 00:00:00 2001 From: greengrass821 Date: Mon, 21 Apr 2025 03:51:48 +0530 Subject: [PATCH 4/7] cmd: add support for escaping ~ in filepath (#10339) Co-authored-by: tooth paste --- cmd/interactive.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/interactive.go b/cmd/interactive.go index d85510d4..82a3bfcb 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string { "\\\\", "\\", // Escaped backslash "\\*", "*", // Escaped asterisk "\\?", "?", // Escaped question mark + "\\~", "~", // Escaped tilde ).Replace(fp) } From 2eb1fb3231063365408155d2fffce9d62ad3c5ee Mon Sep 17 00:00:00 2001 From: Richard Shiue <71320345+richardshiue@users.noreply.github.com> Date: Mon, 21 Apr 2025 06:38:06 +0800 Subject: [PATCH 5/7] readme: add AppFlowy to community integrations (#10335) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f14c8155..30019aeb 100644 --- a/README.md +++ b/README.md @@ -291,7 +291,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file) - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui) - [Ollamac](https://github.com/kevinhermawan/Ollamac) -- [big-AGI](https://github.com/enricoros/big-AGI) +- [big-AGI](https://github.com/enricoros/big-AGI) - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core) - [Amica](https://github.com/semperai/amica) - [chatd](https://github.com/BruceMacD/chatd) @@ -398,6 +398,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).) - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama) +- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable) ### Cloud From 424f648632c925ce14a75018c4dcab395e035993 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Tue, 22 Apr 2025 16:33:24 -0700 Subject: [PATCH 6/7] increase default context length to 4096 (#10364) * increase default context length to 4096 We lower the default numParallel from 4 to 2 and use these "savings" to double the default context length from 2048 to 4096. We're memory neutral in cases when we previously would've used numParallel == 4, but we add the following mitigation to handle some cases where we would have previously fallen back to 1x2048 due to low VRAM: we decide between 2048 and 4096 using a runtime check, choosing 2048 if we're on a one GPU system with total VRAM of <= 4 GB. 
We purposefully don't check the available VRAM because we don't want the context window size to change unexpectedly based on the available VRAM. We plan on making the default even larger, but this is a relatively low-risk change we can make to quickly double it. * fix tests add an explicit context length so they don't get truncated. The code that converts -1 from being a signal for doing a runtime check isn't running as part of these tests. * tweak small gpu message * clarify context length default also make it actually show up in `ollama serve --help` --- cmd/cmd.go | 1 + docs/faq.md | 6 +++--- envconfig/config.go | 18 ++++++++++++++++-- envconfig/config_test.go | 4 ++-- server/routes_generate_test.go | 9 +++++++++ server/sched.go | 22 +++++++++++++++++----- server/sched_test.go | 1 + 7 files changed, 49 insertions(+), 12 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 79ff87ac..befe578d 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_LOAD_TIMEOUT"], + envVars["OLLAMA_CONTEXT_LENGTH"], }) default: appendEnvDocs(cmd, envs) diff --git a/docs/faq.md b/docs/faq.md index f418da47..327afc6e 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md). ## How can I specify the context window size? -By default, Ollama uses a context window size of 2048 tokens. +By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens. This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: @@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve To change this when using `ollama run`, use `/set parameter`: ```shell -/set parameter num_ctx 4096 +/set parameter num_ctx 8192 ``` When using the API, specify the `num_ctx` parameter: @@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{ "model": "llama3.2", "prompt": "Why is the sky blue?", "options": { - "num_ctx": 4096 + "num_ctx": 8192 } }' ``` diff --git a/envconfig/config.go b/envconfig/config.go index fc702198..fcb0a694 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -169,7 +169,7 @@ var ( // Enable the new Ollama engine NewEngine = Bool("OLLAMA_NEW_ENGINE") // ContextLength sets the default context length - ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048) + ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1) ) func String(s string) func() string { @@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 { } } +func Int64(key string, defaultValue int64) func() int64 { + return func() int64 { + if s := Var(key); s != "" { + if n, err := strconv.ParseInt(s, 10, 64); err != nil { + slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue) + } else { + return n + } + } + + return defaultValue + } +} + // Set aside VRAM per GPU var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0) @@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, - "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), 
"Context length to use unless otherwise specified (default: 2048)"}, + "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, // Informational diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 5694eb8a..72bfb4df 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -278,8 +278,8 @@ func TestVar(t *testing.T) { } func TestContextLength(t *testing.T) { - cases := map[string]uint{ - "": 2048, + cases := map[string]int64{ + "": -1, "4096": 4096, } diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 56121d41..dd77b574 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Hello!"}, }, Stream: &stream, + Options: map[string]any{ + "num_ctx": 1024, + }, }) if w.Code != http.StatusOK { @@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Hello!"}, }, Stream: &stream, + Options: map[string]any{ + "num_ctx": 1024, + }, }) if w.Code != http.StatusOK { @@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Help me write tests."}, }, Stream: &stream, + Options: map[string]any{ + "num_ctx": 1024, + }, }) if w.Code != http.StatusOK { diff --git a/server/sched.go b/server/sched.go index f3978796..d5b19fbf 100644 --- a/server/sched.go +++ b/server/sched.go @@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3 // Default automatic value for parallel setting // Model will still need to fit in VRAM. If this setting won't fit // we'll back off down to 1 to try to get it to fit -var defaultParallel = 4 +var defaultParallel = 2 var ErrMaxQueue = errors.New("server busy, please try again. 
maximum pending requests exceeded") @@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) { - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - req := &LlmRequest{ ctx: c, model: model, @@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) { }() } +const ( + defaultContextLength = 4096 + smallGpuContextLength = 2048 +) + func (s *Scheduler) processPending(ctx context.Context) { for { select { @@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) { gpus = s.getGpuFn() } + if pending.origNumCtx == -1 { + if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 { + slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength) + pending.opts.NumCtx = smallGpuContextLength + pending.origNumCtx = smallGpuContextLength + } else { + pending.opts.NumCtx = defaultContextLength + pending.origNumCtx = defaultContextLength + } + } + if envconfig.MaxRunners() <= 0 { // No user specified MaxRunners, so figure out what automatic setting to use // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs diff --git a/server/sched_test.go b/server/sched_test.go index 274e18ce..1b620329 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est successCh: make(chan *runnerRef, 1), errCh: make(chan error, 1), } + b.req.opts.NumCtx = 4096 b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}} return b } From 40b10eee6d62a32578ca7e884fb73d4c8bc644a0 Mon Sep 17 00:00:00 2001 From: Adrien Duermael Date: Wed, 23 Apr 2025 20:13:51 -0700 Subject: [PATCH 7/7] api: fix ImageData struct comment to expect raw image bytes (#10386) --- api/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/types.go b/api/types.go index 53a9593e..7d8b6e53 100644 --- a/api/types.go +++ b/api/types.go @@ -76,7 +76,7 @@ type GenerateRequest struct { // this request. KeepAlive *Duration `json:"keep_alive,omitempty"` - // Images is an optional list of base64-encoded images accompanying this + // Images is an optional list of raw image bytes accompanying this // request, for multimodal models. Images []ImageData `json:"images,omitempty"`
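
Usage note for PATCH 7/7: the comment change is easy to misread, so here is a minimal sketch of what it means for callers of the Go client. Callers pass raw image bytes into ImageData; because ImageData is backed by []byte, encoding/json base64-encodes it when the request is serialized, so the wire format is unchanged. This sketch assumes the public client in github.com/ollama/ollama/api; the model name and file path are placeholders, not part of the patch.

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// Raw bytes from disk -- no manual base64 step; json.Marshal performs the
	// []byte -> base64 conversion when the request is sent over the wire.
	img, err := os.ReadFile("photo.png") // placeholder path
	if err != nil {
		panic(err)
	}

	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}

	req := &api.GenerateRequest{
		Model:  "llava", // placeholder multimodal model
		Prompt: "Describe this image.",
		Images: []api.ImageData{img},
	}

	// Stream the response tokens as they arrive.
	err = client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
		fmt.Print(r.Response)
		return nil
	})
	if err != nil {
		panic(err)
	}
}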
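
Usage note for PATCH 2/7: a minimal sketch of how a client could consume the new error field and stop the pull instead of treating the message as just another status line. The /api/pull endpoint and the JSON field names come from the diff and test above; the pullProgress struct, the hard-coded URL, and the model name are illustrative assumptions.

package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// pullProgress mirrors the wire format of progressUpdateJSON in the diff above.
type pullProgress struct {
	Error     string `json:"error"`
	Status    string `json:"status"`
	Total     int64  `json:"total"`
	Completed int64  `json:"completed"`
}

func main() {
	// Assumes a local server; the model name is illustrative.
	resp, err := http.Post("http://localhost:11434/api/pull", "application/json",
		strings.NewReader(`{"model": "unknown"}`))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		var p pullProgress
		if err := json.Unmarshal(sc.Bytes(), &p); err != nil {
			panic(err)
		}
		if p.Error != "" {
			// The error field is the stop signal; before this patch the same
			// text arrived in status and the client had no reliable way to stop.
			fmt.Println("pull failed:", p.Error)
			return
		}
		fmt.Printf("%s %d/%d\n", p.Status, p.Completed, p.Total)
	}
	if err := sc.Err(); err != nil {
		panic(err)
	}
}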