Merge branch 'ollama:main' into main

2025-12-21 22:33:56 +00:00 · 2025-04-24 20:05:18 +08:00
parent d9472e31b7 40b10eee6d
commit e82cdb5f24
18 changed files with 105 additions and 42 deletions
--- a/README.md
+++ b/README.md
@@ -313,7 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI) 
+- [big-AGI](https://github.com/enricoros/big-AGI)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
@@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
 - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
+- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)

 ### Cloud

--- a/api/types.go
+++ b/api/types.go
@@ -76,7 +76,7 @@ type GenerateRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

-	// Images is an optional list of base64-encoded images accompanying this
+	// Images is an optional list of raw image bytes accompanying this
 	// request, for multimodal models.
 	Images []ImageData `json:"images,omitempty"`

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string {
 		"\\\\", "\\", // Escaped backslash
 		"\\*", "*", // Escaped asterisk
 		"\\?", "?", // Escaped question mark
+		"\\~", "~", // Escaped tilde
 	).Replace(fp)
 }

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 2048 tokens. 
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens. 

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 

@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:

 ```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
 ```

 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "options": {
-    "num_ctx": 4096
+    "num_ctx": 8192
  }
 }'
 ```
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
 )

 func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }

+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
+		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

 		// Informational
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }

 func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
 		"4096": 4096,
 	}

--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 	return out, nil
 }

+func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	s := make([]float32, 0, int((stop-start)/step))
+	for i := start; i < stop; i += step {
+		s = append(s, i)
+	}
+
+	out, _ := c.FromFloatSlice(s, len(s))
+	out.(*testTensor).dtype = dtype
+	return out
+}
+
 func (c *testContext) Input() ml.Context    { return c }
 func (c *testContext) Layer(int) ml.Context { return c }

--- a/ml/backend.go
+++ b/ml/backend.go
@@ -95,6 +95,9 @@ type Context interface {
 	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
 	FromIntSlice(s []int32, shape ...int) (Tensor, error)

+	// Arange creates a 1D tensor with values within the half-open interval [start, stop), increasing by step.
+	Arange(start, stop, step float32, dtype DType) Tensor
+
 	Forward(...Tensor) Context
 	Compute(...Tensor)

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 	return t, nil
 }

+func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	switch dtype {
+	case ml.DTypeF32:
+		// ggml_arange creates a float32 tensor
+		return &Tensor{
+			b: c.b,
+			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
+		}
+	case ml.DTypeI32:
+		// ggml_cast does not support float32 to int32 conversion
+		arange := make([]int32, 0, int((stop-start)/step))
+		for i := start; i < stop; i += step {
+			arange = append(arange, int32(i))
+		}
+
+		t, err := c.Input().FromIntSlice(arange, len(arange))
+		if err != nil {
+			panic(err)
+		}
+
+		return t
+	default:
+		panic("unsupported dtype for arange")
+	}
+}
+
 func (c *Context) Close() {
 	if c != nil {
 		for _, b := range *c.allocatedBuffers {
--- a/model/models/gemma3/model_vision.go
+++ b/model/models/gemma3/model_vision.go
@@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	positions := make([]int32, numPatches)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		panic(err)
-	}
-
+	positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)
 	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))

 	for _, layer := range m.Layers {
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	positions := make([]int32, 1601)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		return nil, err
-	}
-
+	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
 	return m.Projector.Forward(ctx, crossAttentionStates), nil
 }
--- a/server/create.go
+++ b/server/create.go
@@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
 }

 func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
-	tmpDir, err := os.MkdirTemp("", "ollama-safetensors")
+	tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors")
 	if err != nil {
 		return nil, err
 	}
--- a/server/internal/registry/server.go
+++ b/server/internal/registry/server.go
@@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
 }

 type progressUpdateJSON struct {
+	Error     string      `json:"error,omitempty,omitzero"`
 	Status    string      `json:"status,omitempty,omitzero"`
 	Digest    blob.Digest `json:"digest,omitempty,omitzero"`
 	Total     int64       `json:"total,omitempty,omitzero"`
@@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 		case err := <-done:
 			flushProgress()
 			if err != nil {
-				var status string
 				if errors.Is(err, ollama.ErrModelNotFound) {
-					status = fmt.Sprintf("error: model %q not found", p.model())
+					return &serverError{
+						Status:  404,
+						Code:    "not_found",
+						Message: fmt.Sprintf("model %q not found", p.model()),
+					}
 				} else {
-					status = fmt.Sprintf("error: %v", err)
+					return err
 				}
-				enc.Encode(progressUpdateJSON{Status: status})
-				return nil
 			}

 			// Emulate old client pull progress (for now):
--- a/server/internal/registry/server_test.go
+++ b/server/internal/registry/server_test.go
@@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) {

 	got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`)
 	checkResponse(got, `
-		{"status":"error: model \"unknown\" not found"}
+		{"code":"not_found","error":"model \"unknown\" not found"}
 	`)

 	got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`)
@@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) {

 	got = s.send(t, "POST", "/api/pull", `{"model": "://"}`)
 	checkResponse(got, `
-		{"status":"error: invalid or missing name: \"\""}
+		{"code":"bad_request","error":"invalid or missing name: \"\""}
 	`)

 	// Non-streaming pulls
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Help me write tests."},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
--- a/server/sched.go
+++ b/server/sched.go
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM.  If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2

 var ErrMaxQueue = errors.New("server busy, please try again.  maximum pending requests exceeded")

@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {

 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }

+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)
+
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						gpus = s.getGpuFn()
 					}

+					if pending.origNumCtx == -1 {
+						if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+							slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+							pending.opts.NumCtx = smallGpuContextLength
+							pending.origNumCtx = smallGpuContextLength
+						} else {
+							pending.opts.NumCtx = defaultContextLength
+							pending.origNumCtx = defaultContextLength
+						}
+					}
+
 					if envconfig.MaxRunners() <= 0 {
 						// No user specified MaxRunners, so figure out what automatic setting to use
 						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
+	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }