Merge branch 'ollama:main' into main

Author: likelovewant
Date: 2025-04-24 20:05:18 +08:00 (committed by GitHub)
18 changed files with 105 additions and 42 deletions

View File

@@ -313,7 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
@@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
+- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
### Cloud

View File

@@ -76,7 +76,7 @@ type GenerateRequest struct {
	// this request.
	KeepAlive *Duration `json:"keep_alive,omitempty"`
-	// Images is an optional list of base64-encoded images accompanying this
+	// Images is an optional list of raw image bytes accompanying this
	// request, for multimodal models.
	Images []ImageData `json:"images,omitempty"`

View File

@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"], envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
}) })
default: default:
appendEnvDocs(cmd, envs) appendEnvDocs(cmd, envs)

View File

@@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string {
"\\\\", "\\", // Escaped backslash "\\\\", "\\", // Escaped backslash
"\\*", "*", // Escaped asterisk "\\*", "*", // Escaped asterisk
"\\?", "?", // Escaped question mark "\\?", "?", // Escaped question mark
"\\~", "~", // Escaped tilde
).Replace(fp) ).Replace(fp)
} }
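
For context, the replacer above maps shell-escaped characters in a completed file path back to their literal form, and this change adds the `\~` pair. A minimal, self-contained sketch of the same pattern (the function name and example path are illustrative, not the repository's code):

```go
package main

import (
	"fmt"
	"strings"
)

// unescapePath is an illustrative stand-in for the replacer shown above:
// each escaped character is mapped back to its literal form, now including "~".
func unescapePath(fp string) string {
	return strings.NewReplacer(
		"\\\\", "\\", // escaped backslash
		"\\*", "*", // escaped asterisk
		"\\?", "?", // escaped question mark
		"\\~", "~", // escaped tilde
	).Replace(fp)
}

func main() {
	fmt.Println(unescapePath(`\~/models/llava\?.gguf`)) // ~/models/llava?.gguf
}
```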

View File

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?

-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:

```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
```

When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2", "model": "llama3.2",
"prompt": "Why is the sky blue?", "prompt": "Why is the sky blue?",
"options": { "options": {
"num_ctx": 4096 "num_ctx": 8192
} }
}' }'
``` ```
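
If it helps, here is the same request as the curl example, sketched with Go's standard library only (no Ollama client package is assumed; the server address is the default localhost:11434):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Mirrors the FAQ's curl example: request an 8K context via options.num_ctx.
	body := []byte(`{"model":"llama3.2","prompt":"Why is the sky blue?","options":{"num_ctx":8192}}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```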

View File

@@ -169,7 +169,7 @@ var (
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)

func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
	}
}

+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+		return defaultValue
+	}
+}

// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational // Informational
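
To make the new default concrete: ContextLength now returns -1 when OLLAMA_CONTEXT_LENGTH is unset, and the scheduler later maps -1 to 4096 (or 2048 on a small GPU). Below is a standalone sketch of that lookup pattern, using os.Getenv in place of the package's Var helper; the names are illustrative, not the package's API:

```go
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

// int64Env mirrors the Int64 helper above: parse the variable when set,
// fall back to the default when missing or unparsable.
func int64Env(key string, defaultValue int64) func() int64 {
	return func() int64 {
		if s := os.Getenv(key); s != "" {
			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return n
			}
		}
		return defaultValue
	}
}

func main() {
	contextLength := int64Env("OLLAMA_CONTEXT_LENGTH", -1)
	fmt.Println(contextLength()) // -1 when unset: the scheduler then picks 4096 (or 2048 on a small GPU)

	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(contextLength()) // 8192
}
```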

View File

@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
}

func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
		"4096": 4096,
	}

View File

@@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return out, nil
}

+func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	s := make([]float32, 0, int((stop-start)/step))
+	for i := start; i < stop; i += step {
+		s = append(s, i)
+	}
+	out, _ := c.FromFloatSlice(s, len(s))
+	out.(*testTensor).dtype = dtype
+	return out
+}

func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }

View File

@@ -95,6 +95,9 @@ type Context interface {
	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
+	Arange(start, stop, step float32, dtype DType) Tensor
	Forward(...Tensor) Context
	Compute(...Tensor)

View File

@@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return t, nil
}

+func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	switch dtype {
+	case ml.DTypeF32:
+		// ggml_arange creates a float32 tensor
+		return &Tensor{
+			b: c.b,
+			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
+		}
+	case ml.DTypeI32:
+		// ggml_cast does not support float32 to int32 conversion
+		arange := make([]int32, 0, int((stop-start)/step))
+		for i := start; i < stop; i += step {
+			arange = append(arange, int32(i))
+		}
+		t, err := c.Input().FromIntSlice(arange, len(arange))
+		if err != nil {
+			panic(err)
+		}
+		return t
+	default:
+		panic("unsupported dtype for arange")
+	}
+}

func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
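
Both the test context and the ggml backend build the integer variant with the same loop, so Arange(start, stop, step, ...) yields the half-open sequence start, start+step, ..., stopping before stop. A tensor-free sketch of that sequence (the helper name is illustrative):

```go
package main

import "fmt"

// arangeI32 mirrors the loop used for the DTypeI32 branch above:
// values in [start, stop), stepping by step.
func arangeI32(start, stop, step float32) []int32 {
	s := make([]int32, 0, int((stop-start)/step))
	for i := start; i < stop; i += step {
		s = append(s, int32(i))
	}
	return s
}

func main() {
	fmt.Println(arangeI32(0, 5, 1)) // [0 1 2 3 4]
	// e.g. position IDs for the 1601-patch vision input further below: 0..1600
	fmt.Println(len(arangeI32(0, 1601, 1))) // 1601
}
```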

View File

@@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	positions := make([]int32, numPatches)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		panic(err)
-	}
+	positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)

	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))

	for _, layer := range m.Layers {

View File

@@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
		return nil, err
	}

-	positions := make([]int32, 1601)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		return nil, err
-	}
+	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)

	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
	return m.Projector.Forward(ctx, crossAttentionStates), nil
}

View File

@@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
}

func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
-	tmpDir, err := os.MkdirTemp("", "ollama-safetensors")
+	tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors")
	if err != nil {
		return nil, err
	}

View File

@@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
}

type progressUpdateJSON struct {
+	Error  string      `json:"error,omitempty,omitzero"`
	Status string      `json:"status,omitempty,omitzero"`
	Digest blob.Digest `json:"digest,omitempty,omitzero"`
	Total  int64       `json:"total,omitempty,omitzero"`
@@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
	case err := <-done:
		flushProgress()
		if err != nil {
-			var status string
			if errors.Is(err, ollama.ErrModelNotFound) {
-				status = fmt.Sprintf("error: model %q not found", p.model())
+				return &serverError{
+					Status:  404,
+					Code:    "not_found",
+					Message: fmt.Sprintf("model %q not found", p.model()),
+				}
			} else {
-				status = fmt.Sprintf("error: %v", err)
+				return err
			}
-			enc.Encode(progressUpdateJSON{Status: status})
-			return nil
		}

		// Emulate old client pull progress (for now):
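
For API clients, the visible effect is that a failed pull now surfaces as a structured error (HTTP 404 with code "not_found" for a missing model) rather than a progress update whose status begins with "error:". A hedged sketch of the new body shape, with field names taken from the updated test expectations below (the struct here is illustrative, not the server's internal type):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// pullError mirrors the JSON the updated tests expect for a failed pull.
type pullError struct {
	Code  string `json:"code"`
	Error string `json:"error"`
}

func main() {
	// Old behavior: a progress update such as {"status":"error: model \"unknown\" not found"}.
	// New behavior (per the tests below): an HTTP 404 whose body looks like this.
	b, _ := json.Marshal(pullError{Code: "not_found", Error: `model "unknown" not found`})
	fmt.Println(string(b)) // {"code":"not_found","error":"model \"unknown\" not found"}
}
```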

View File

@@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) {
	got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`)
	checkResponse(got, `
-		{"status":"error: model \"unknown\" not found"}
+		{"code":"not_found","error":"model \"unknown\" not found"}
	`)

	got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`)
@@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) {
	got = s.send(t, "POST", "/api/pull", `{"model": "://"}`)
	checkResponse(got, `
-		{"status":"error: invalid or missing name: \"\""}
+		{"code":"bad_request","error":"invalid or missing name: \"\""}
	`)

	// Non-streaming pulls

View File

@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"}, {Role: "user", Content: "Hello!"},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"}, {Role: "user", Content: "Hello!"},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."}, {Role: "user", Content: "Help me write tests."},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {

View File

@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
// Model will still need to fit in VRAM. If this setting won't fit
// we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2

var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
	req := &LlmRequest{
		ctx:   c,
		model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
	}()
}

+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)

func (s *Scheduler) processPending(ctx context.Context) {
	for {
		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
				gpus = s.getGpuFn()
			}

+			if pending.origNumCtx == -1 {
+				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+					pending.opts.NumCtx = smallGpuContextLength
+					pending.origNumCtx = smallGpuContextLength
+				} else {
+					pending.opts.NumCtx = defaultContextLength
+					pending.origNumCtx = defaultContextLength
+				}
+			}

			if envconfig.MaxRunners() <= 0 {
				// No user specified MaxRunners, so figure out what automatic setting to use
				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
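
The selection logic added above reads as: if the request never set a context length (the -1 sentinel) and the machine has exactly one non-CPU GPU with at most 4 GiB of total memory, default to 2048, otherwise 4096. A standalone sketch of that decision with simplified types (gpuInfo and resolveNumCtx are illustrative, not the scheduler's actual types):

```go
package main

import "fmt"

// gpuInfo is a simplified stand-in for the GPU description the scheduler sees.
type gpuInfo struct {
	Library     string // "cpu", "cuda", "rocm", ...
	TotalMemory uint64 // bytes
}

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// resolveNumCtx mirrors the branch added to processPending: requested == -1
// means "no explicit num_ctx", so pick a default based on the GPU inventory.
func resolveNumCtx(requested int, gpus []gpuInfo) int {
	if requested != -1 {
		return requested
	}
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	smallGPU := []gpuInfo{{Library: "cuda", TotalMemory: 4 << 30}}
	bigGPU := []gpuInfo{{Library: "cuda", TotalMemory: 24 << 30}}
	fmt.Println(resolveNumCtx(-1, smallGPU)) // 2048
	fmt.Println(resolveNumCtx(-1, bigGPU))   // 4096
	fmt.Println(resolveNumCtx(8192, bigGPU)) // 8192: explicit settings win
}
```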

View File

@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
		successCh: make(chan *runnerRef, 1),
		errCh:     make(chan error, 1),
	}
+	b.req.opts.NumCtx = 4096
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}