Merge branch 'ollama:main' into main

Author: likelovewant
Date: 2025-04-24 20:05:18 +08:00 (committed by GitHub)
18 changed files with 105 additions and 42 deletions

View File

@@ -313,7 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
@@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
+- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
### Cloud

View File

@@ -76,7 +76,7 @@ type GenerateRequest struct {
	// this request.
	KeepAlive *Duration `json:"keep_alive,omitempty"`
-	// Images is an optional list of base64-encoded images accompanying this
+	// Images is an optional list of raw image bytes accompanying this
	// request, for multimodal models.
	Images []ImageData `json:"images,omitempty"`

View File

@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"], envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
}) })
default: default:
appendEnvDocs(cmd, envs) appendEnvDocs(cmd, envs)

View File

@@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string {
"\\\\", "\\", // Escaped backslash "\\\\", "\\", // Escaped backslash
"\\*", "*", // Escaped asterisk "\\*", "*", // Escaped asterisk
"\\?", "?", // Escaped question mark "\\?", "?", // Escaped question mark
"\\~", "~", // Escaped tilde
).Replace(fp) ).Replace(fp)
} }
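
For context, the replacer above maps shell-escaped characters in a completed file path back to their literal form, and this change adds the `\~` pair. A minimal, self-contained sketch of the same pattern (the function name and example path are illustrative, not the repository's code):

```go
package main

import (
	"fmt"
	"strings"
)

// unescapePath is an illustrative stand-in for the replacer shown above:
// each escaped character is mapped back to its literal form, now including "~".
func unescapePath(fp string) string {
	return strings.NewReplacer(
		"\\\\", "\\", // escaped backslash
		"\\*", "*", // escaped asterisk
		"\\?", "?", // escaped question mark
		"\\~", "~", // escaped tilde
	).Replace(fp)
}

func main() {
	fmt.Println(unescapePath(`\~/models/llava\?.gguf`)) // ~/models/llava?.gguf
}
```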

View File

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?

-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:

```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
```

When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2", "model": "llama3.2",
"prompt": "Why is the sky blue?", "prompt": "Why is the sky blue?",
"options": { "options": {
"num_ctx": 4096 "num_ctx": 8192
} }
}' }'
``` ```
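
If it helps, here is the same request as the curl example, sketched with Go's standard library only (no Ollama client package is assumed; the server address is the default localhost:11434):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Mirrors the FAQ's curl example: request an 8K context via options.num_ctx.
	body := []byte(`{"model":"llama3.2","prompt":"Why is the sky blue?","options":{"num_ctx":8192}}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```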

View File

@@ -169,7 +169,7 @@ var (
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)

func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
	}
}

+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+		return defaultValue
+	}
+}

// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational // Informational
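
To make the new default concrete: ContextLength now returns -1 when OLLAMA_CONTEXT_LENGTH is unset, and the scheduler later maps -1 to 4096 (or 2048 on a small GPU). Below is a standalone sketch of that lookup pattern, using os.Getenv in place of the package's Var helper; the names are illustrative, not the package's API:

```go
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

// int64Env mirrors the Int64 helper above: parse the variable when set,
// fall back to the default when missing or unparsable.
func int64Env(key string, defaultValue int64) func() int64 {
	return func() int64 {
		if s := os.Getenv(key); s != "" {
			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return n
			}
		}
		return defaultValue
	}
}

func main() {
	contextLength := int64Env("OLLAMA_CONTEXT_LENGTH", -1)
	fmt.Println(contextLength()) // -1 when unset: the scheduler then picks 4096 (or 2048 on a small GPU)

	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(contextLength()) // 8192
}
```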

View File

@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
}

func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
		"4096": 4096,
	}

View File

@@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return out, nil
}

+func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	s := make([]float32, 0, int((stop-start)/step))
+	for i := start; i < stop; i += step {
+		s = append(s, i)
+	}
+	out, _ := c.FromFloatSlice(s, len(s))
+	out.(*testTensor).dtype = dtype
+	return out
+}

func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }

View File

@@ -95,6 +95,9 @@ type Context interface {
	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
+	Arange(start, stop, step float32, dtype DType) Tensor
	Forward(...Tensor) Context
	Compute(...Tensor)

View File

@@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return t, nil
}

+func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	switch dtype {
+	case ml.DTypeF32:
+		// ggml_arange creates a float32 tensor
+		return &Tensor{
+			b: c.b,
+			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
+		}
+	case ml.DTypeI32:
+		// ggml_cast does not support float32 to int32 conversion
+		arange := make([]int32, 0, int((stop-start)/step))
+		for i := start; i < stop; i += step {
+			arange = append(arange, int32(i))
+		}
+		t, err := c.Input().FromIntSlice(arange, len(arange))
+		if err != nil {
+			panic(err)
+		}
+		return t
+	default:
+		panic("unsupported dtype for arange")
+	}
+}

func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
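
Both the test context and the ggml backend build the integer variant with the same loop, so Arange(start, stop, step, ...) yields the half-open sequence start, start+step, ..., stopping before stop. A tensor-free sketch of that sequence (the helper name is illustrative):

```go
package main

import "fmt"

// arangeI32 mirrors the loop used for the DTypeI32 branch above:
// values in [start, stop), stepping by step.
func arangeI32(start, stop, step float32) []int32 {
	s := make([]int32, 0, int((stop-start)/step))
	for i := start; i < stop; i += step {
		s = append(s, int32(i))
	}
	return s
}

func main() {
	fmt.Println(arangeI32(0, 5, 1)) // [0 1 2 3 4]
	// e.g. position IDs for the 1601-patch vision input further below: 0..1600
	fmt.Println(len(arangeI32(0, 1601, 1))) // 1601
}
```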

View File

@@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	positions := make([]int32, numPatches)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		panic(err)
-	}
+	positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)

	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))

	for _, layer := range m.Layers {

View File

@@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
		return nil, err
	}

-	positions := make([]int32, 1601)
-	for i := range positions {
-		positions[i] = int32(i)
-	}
-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		return nil, err
-	}
+	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)

	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
	return m.Projector.Forward(ctx, crossAttentionStates), nil
}

View File

@@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
}

func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
-	tmpDir, err := os.MkdirTemp("", "ollama-safetensors")
+	tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors")
	if err != nil {
		return nil, err
	}

View File

@@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
}

type progressUpdateJSON struct {
+	Error  string      `json:"error,omitempty,omitzero"`
	Status string      `json:"status,omitempty,omitzero"`
	Digest blob.Digest `json:"digest,omitempty,omitzero"`
	Total  int64       `json:"total,omitempty,omitzero"`
@@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
	case err := <-done:
		flushProgress()
		if err != nil {
-			var status string
			if errors.Is(err, ollama.ErrModelNotFound) {
-				status = fmt.Sprintf("error: model %q not found", p.model())
+				return &serverError{
+					Status:  404,
+					Code:    "not_found",
+					Message: fmt.Sprintf("model %q not found", p.model()),
+				}
			} else {
-				status = fmt.Sprintf("error: %v", err)
+				return err
			}
-			enc.Encode(progressUpdateJSON{Status: status})
-			return nil
		}

		// Emulate old client pull progress (for now):
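
For API clients, the visible effect is that a failed pull now surfaces as a structured error (HTTP 404 with code "not_found" for a missing model) rather than a progress update whose status begins with "error:". A hedged sketch of the new body shape, with field names taken from the updated test expectations below (the struct here is illustrative, not the server's internal type):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// pullError mirrors the JSON the updated tests expect for a failed pull.
type pullError struct {
	Code  string `json:"code"`
	Error string `json:"error"`
}

func main() {
	// Old behavior: a progress update such as {"status":"error: model \"unknown\" not found"}.
	// New behavior (per the tests below): an HTTP 404 whose body looks like this.
	b, _ := json.Marshal(pullError{Code: "not_found", Error: `model "unknown" not found`})
	fmt.Println(string(b)) // {"code":"not_found","error":"model \"unknown\" not found"}
}
```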

View File

@@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) {
	got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`)
	checkResponse(got, `
-		{"status":"error: model \"unknown\" not found"}
+		{"code":"not_found","error":"model \"unknown\" not found"}
	`)

	got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`)
@@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) {
	got = s.send(t, "POST", "/api/pull", `{"model": "://"}`)
	checkResponse(got, `
-		{"status":"error: invalid or missing name: \"\""}
+		{"code":"bad_request","error":"invalid or missing name: \"\""}
	`)

	// Non-streaming pulls

View File

@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"}, {Role: "user", Content: "Hello!"},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"}, {Role: "user", Content: "Hello!"},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."}, {Role: "user", Content: "Help me write tests."},
}, },
Stream: &stream, Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
}) })
if w.Code != http.StatusOK { if w.Code != http.StatusOK {

View File

@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
// Model will still need to fit in VRAM. If this setting won't fit
// we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2

var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
	req := &LlmRequest{
		ctx:   c,
		model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
	}()
}

+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)

func (s *Scheduler) processPending(ctx context.Context) {
	for {
		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
				gpus = s.getGpuFn()
			}

+			if pending.origNumCtx == -1 {
+				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+					pending.opts.NumCtx = smallGpuContextLength
+					pending.origNumCtx = smallGpuContextLength
+				} else {
+					pending.opts.NumCtx = defaultContextLength
+					pending.origNumCtx = defaultContextLength
+				}
+			}

			if envconfig.MaxRunners() <= 0 {
				// No user specified MaxRunners, so figure out what automatic setting to use
				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
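
The selection logic added above reads as: if the request never set a context length (the -1 sentinel) and the machine has exactly one non-CPU GPU with at most 4 GiB of total memory, default to 2048, otherwise 4096. A standalone sketch of that decision with simplified types (gpuInfo and resolveNumCtx are illustrative, not the scheduler's actual types):

```go
package main

import "fmt"

// gpuInfo is a simplified stand-in for the GPU description the scheduler sees.
type gpuInfo struct {
	Library     string // "cpu", "cuda", "rocm", ...
	TotalMemory uint64 // bytes
}

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// resolveNumCtx mirrors the branch added to processPending: requested == -1
// means "no explicit num_ctx", so pick a default based on the GPU inventory.
func resolveNumCtx(requested int, gpus []gpuInfo) int {
	if requested != -1 {
		return requested
	}
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	smallGPU := []gpuInfo{{Library: "cuda", TotalMemory: 4 << 30}}
	bigGPU := []gpuInfo{{Library: "cuda", TotalMemory: 24 << 30}}
	fmt.Println(resolveNumCtx(-1, smallGPU)) // 2048
	fmt.Println(resolveNumCtx(-1, bigGPU))   // 4096
	fmt.Println(resolveNumCtx(8192, bigGPU)) // 8192: explicit settings win
}
```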

View File

@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
		successCh: make(chan *runnerRef, 1),
		errCh:     make(chan error, 1),
	}
+	b.req.opts.NumCtx = 4096
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}