Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2025-04-24 20:05:18 +08:00
committed by GitHub
18 changed files with 105 additions and 42 deletions

View File

@@ -313,7 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
@@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
### Cloud

View File

@@ -76,7 +76,7 @@ type GenerateRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Images is an optional list of base64-encoded images accompanying this
// Images is an optional list of raw image bytes accompanying this
// request, for multimodal models.
Images []ImageData `json:"images,omitempty"`

View File

@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
})
default:
appendEnvDocs(cmd, envs)

View File

@@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string {
"\\\\", "\\", // Escaped backslash
"\\*", "*", // Escaped asterisk
"\\?", "?", // Escaped question mark
"\\~", "~", // Escaped tilde
).Replace(fp)
}

View File

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 2048 tokens.
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:
```shell
/set parameter num_ctx 4096
/set parameter num_ctx 8192
```
When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
"num_ctx": 8192
}
}'
```

View File

@@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)
func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}
func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}
return defaultValue
}
}
// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational

View File

@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
}
func TestContextLength(t *testing.T) {
cases := map[string]uint{
"": 2048,
cases := map[string]int64{
"": -1,
"4096": 4096,
}

View File

@@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return out, nil
}
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
s := make([]float32, 0, int((stop-start)/step))
for i := start; i < stop; i += step {
s = append(s, i)
}
out, _ := c.FromFloatSlice(s, len(s))
out.(*testTensor).dtype = dtype
return out
}
func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }

View File

@@ -95,6 +95,9 @@ type Context interface {
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor
Forward(...Tensor) Context
Compute(...Tensor)

View File

@@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return t, nil
}
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
switch dtype {
case ml.DTypeF32:
// ggml_arange creates a float32 tensor
return &Tensor{
b: c.b,
t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
}
case ml.DTypeI32:
// ggml_cast does not support float32 to int32 conversion
arange := make([]int32, 0, int((stop-start)/step))
for i := start; i < stop; i += step {
arange = append(arange, int32(i))
}
t, err := c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}
return t
default:
panic("unsupported dtype for arange")
}
}
func (c *Context) Close() {
if c != nil {
for _, b := range *c.allocatedBuffers {

View File

@@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
positions := make([]int32, numPatches)
for i := range positions {
positions[i] = int32(i)
}
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
if err != nil {
panic(err)
}
positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)
hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))
for _, layer := range m.Layers {

View File

@@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}
positions := make([]int32, 1601)
for i := range positions {
positions[i] = int32(i)
}
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
if err != nil {
return nil, err
}
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
return m.Projector.Forward(ctx, crossAttentionStates), nil
}

View File

@@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
}
func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
tmpDir, err := os.MkdirTemp("", "ollama-safetensors")
tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors")
if err != nil {
return nil, err
}

View File

@@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
}
type progressUpdateJSON struct {
Error string `json:"error,omitempty,omitzero"`
Status string `json:"status,omitempty,omitzero"`
Digest blob.Digest `json:"digest,omitempty,omitzero"`
Total int64 `json:"total,omitempty,omitzero"`
@@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
case err := <-done:
flushProgress()
if err != nil {
var status string
if errors.Is(err, ollama.ErrModelNotFound) {
status = fmt.Sprintf("error: model %q not found", p.model())
return &serverError{
Status: 404,
Code: "not_found",
Message: fmt.Sprintf("model %q not found", p.model()),
}
} else {
status = fmt.Sprintf("error: %v", err)
return err
}
enc.Encode(progressUpdateJSON{Status: status})
return nil
}
// Emulate old client pull progress (for now):

View File

@@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) {
got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`)
checkResponse(got, `
{"status":"error: model \"unknown\" not found"}
{"code":"not_found","error":"model \"unknown\" not found"}
`)
got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`)
@@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) {
got = s.send(t, "POST", "/api/pull", `{"model": "://"}`)
checkResponse(got, `
{"status":"error: invalid or missing name: \"\""}
{"code":"bad_request","error":"invalid or missing name: \"\""}
`)
// Non-streaming pulls

View File

@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {

View File

@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
// Model will still need to fit in VRAM. If this setting won't fit
// we'll back off down to 1 to try to get it to fit
var defaultParallel = 4
var defaultParallel = 2
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
req := &LlmRequest{
ctx: c,
model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
}()
}
const (
defaultContextLength = 4096
smallGpuContextLength = 2048
)
func (s *Scheduler) processPending(ctx context.Context) {
for {
select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}
if pending.origNumCtx == -1 {
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
pending.opts.NumCtx = smallGpuContextLength
pending.origNumCtx = smallGpuContextLength
} else {
pending.opts.NumCtx = defaultContextLength
pending.origNumCtx = defaultContextLength
}
}
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs

View File

@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.req.opts.NumCtx = 4096
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}