Mirror of https://github.com/likelovewant/ollama-for-amd.git

Merge branch 'ollama:main' into main
@@ -313,7 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [big-AGI](https://github.com/enricoros/big-AGI)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
@@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)

### Cloud
@@ -76,7 +76,7 @@ type GenerateRequest struct {
    // this request.
    KeepAlive *Duration `json:"keep_alive,omitempty"`

    // Images is an optional list of base64-encoded images accompanying this
    // Images is an optional list of raw image bytes accompanying this
    // request, for multimodal models.
    Images []ImageData `json:"images,omitempty"`
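A side note on this comment change: because `ImageData` is a `[]byte` alias, Go's JSON encoder still base64-encodes the bytes on the wire, so callers can simply hand over raw file contents. Below is a minimal sketch (not part of the diff) using the public `github.com/ollama/ollama/api` client; the model name `llava` and the file name are placeholders.

```go
package main

import (
	"context"
	"log"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// Raw bytes straight from disk; no manual base64 step needed.
	img, err := os.ReadFile("photo.png")
	if err != nil {
		log.Fatal(err)
	}

	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llava",
		Prompt: "Describe this image.",
		Images: []api.ImageData{img},
	}
	// Print each streamed chunk as it arrives.
	err = client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
		os.Stdout.WriteString(r.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```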
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
                envVars["OLLAMA_LLM_LIBRARY"],
                envVars["OLLAMA_GPU_OVERHEAD"],
                envVars["OLLAMA_LOAD_TIMEOUT"],
                envVars["OLLAMA_CONTEXT_LENGTH"],
            })
        default:
            appendEnvDocs(cmd, envs)
@@ -503,6 +503,7 @@ func normalizeFilePath(fp string) string {
        "\\\\", "\\", // Escaped backslash
        "\\*", "*", // Escaped asterisk
        "\\?", "?", // Escaped question mark
        "\\~", "~", // Escaped tilde
    ).Replace(fp)
}
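For context, `normalizeFilePath` is a `strings.Replacer` that turns shell-style escapes back into literal path characters. A small standalone sketch of the same idea, using only the escape pairs visible in this hunk (the full function handles more pairs, and which pair this commit adds is not visible here):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	unescape := strings.NewReplacer(
		"\\\\", "\\", // escaped backslash
		"\\*", "*", // escaped asterisk
		"\\?", "?", // escaped question mark
		"\\~", "~", // escaped tilde
	)
	// Prints: C:\models\my~file?.gguf
	fmt.Println(unescape.Replace(`C:\\models\\my\~file\?.gguf`))
}
```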
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

## How can I specify the context window size?

By default, Ollama uses a context window size of 2048 tokens.
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve

To change this when using `ollama run`, use `/set parameter`:

```shell
/set parameter num_ctx 4096
/set parameter num_ctx 8192
```

When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "options": {
    "num_ctx": 4096
    "num_ctx": 8192
  }
}'
```
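A rough Go equivalent of the curl example above, in case a client-side sketch helps; it assumes the server is listening on the default address and uses `stream: false` so a single JSON object comes back:

```go
package main

import (
	"bytes"
	"encoding/json"
	"io"
	"log"
	"net/http"
	"os"
)

func main() {
	// num_ctx rides along in the "options" object, just like the curl example.
	body, _ := json.Marshal(map[string]any{
		"model":  "llama3.2",
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{
			"num_ctx": 8192,
		},
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	io.Copy(os.Stdout, resp.Body)
}
```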
@@ -169,7 +169,7 @@ var (
    // Enable the new Ollama engine
    NewEngine = Bool("OLLAMA_NEW_ENGINE")
    // ContextLength sets the default context length
    ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
    ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)

func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
    }
}

func Int64(key string, defaultValue int64) func() int64 {
    return func() int64 {
        if s := Var(key); s != "" {
            if n, err := strconv.ParseInt(s, 10, 64); err != nil {
                slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
            } else {
                return n
            }
        }

        return defaultValue
    }
}

// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
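To make the new `-1` default concrete: a standalone sketch of the same read-an-int64-env-var pattern, where `-1` means "unset" and the scheduler (see the hunk further down) later substitutes 4096, or 2048 on a small GPU. The helper name `int64Env` here is illustrative, not the package's exported API.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// int64Env reads an integer environment variable, falling back to def when
// the variable is empty or unparsable.
func int64Env(key string, def int64) int64 {
	if s := os.Getenv(key); s != "" {
		if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			return n
		}
	}
	return def
}

func main() {
	ctxLen := int64Env("OLLAMA_CONTEXT_LENGTH", -1)
	if ctxLen == -1 {
		fmt.Println("unset: the scheduler will pick 4096, or 2048 on a small GPU")
	} else {
		fmt.Println("context length:", ctxLen)
	}
}
```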
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
        "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
        "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
        "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
        "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
        "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
        "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

        // Informational
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
}

func TestContextLength(t *testing.T) {
    cases := map[string]uint{
        "": 2048,
    cases := map[string]int64{
        "": -1,
        "4096": 4096,
    }
@@ -424,6 +424,17 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
    return out, nil
}

func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
    s := make([]float32, 0, int((stop-start)/step))
    for i := start; i < stop; i += step {
        s = append(s, i)
    }

    out, _ := c.FromFloatSlice(s, len(s))
    out.(*testTensor).dtype = dtype
    return out
}

func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }
@@ -95,6 +95,9 @@ type Context interface {
    FromFloatSlice(s []float32, shape ...int) (Tensor, error)
    FromIntSlice(s []int32, shape ...int) (Tensor, error)

    // Arange creates a 1D tensor with values in the half-open interval [start, stop), increased by step.
    Arange(start, stop, step float32, dtype DType) Tensor

    Forward(...Tensor) Context
    Compute(...Tensor)
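A tiny reference sketch of the `Arange` semantics as the test context above and the ggml backend below produce them: values in the half-open interval [start, stop), spaced by `step`.

```go
package main

import "fmt"

// arange returns start, start+step, ... up to but excluding stop.
func arange(start, stop, step float32) []float32 {
	out := make([]float32, 0, int((stop-start)/step))
	for v := start; v < stop; v += step {
		out = append(out, v)
	}
	return out
}

func main() {
	fmt.Println(arange(0, 5, 1)) // [0 1 2 3 4]
}
```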
@@ -696,6 +696,32 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
    return t, nil
}

func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
    switch dtype {
    case ml.DTypeF32:
        // ggml_arange creates a float32 tensor
        return &Tensor{
            b: c.b,
            t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
        }
    case ml.DTypeI32:
        // ggml_cast does not support float32 to int32 conversion
        arange := make([]int32, 0, int((stop-start)/step))
        for i := start; i < stop; i += step {
            arange = append(arange, int32(i))
        }

        t, err := c.Input().FromIntSlice(arange, len(arange))
        if err != nil {
            panic(err)
        }

        return t
    default:
        panic("unsupported dtype for arange")
    }
}

func (c *Context) Close() {
    if c != nil {
        for _, b := range *c.allocatedBuffers {
@@ -92,16 +92,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
    hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
    hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

    positions := make([]int32, numPatches)
    for i := range positions {
        positions[i] = int32(i)
    }

    positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
    if err != nil {
        panic(err)
    }

    positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)
    hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))

    for _, layer := range m.Layers {
@@ -93,16 +93,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
        return nil, err
    }

    positions := make([]int32, 1601)
    for i := range positions {
        positions[i] = int32(i)
    }

    positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
    if err != nil {
        return nil, err
    }

    positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
    crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
    return m.Projector.Forward(ctx, crossAttentionStates), nil
}
@@ -225,7 +225,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
}

func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, isAdapter bool, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
    tmpDir, err := os.MkdirTemp("", "ollama-safetensors")
    tmpDir, err := os.MkdirTemp(envconfig.Models(), "ollama-safetensors")
    if err != nil {
        return nil, err
    }
@@ -244,6 +244,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
}

type progressUpdateJSON struct {
    Error  string      `json:"error,omitempty,omitzero"`
    Status string      `json:"status,omitempty,omitzero"`
    Digest blob.Digest `json:"digest,omitempty,omitzero"`
    Total  int64       `json:"total,omitempty,omitzero"`
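Worth noting: `omitzero` is the struct tag option added to `encoding/json` in Go 1.24 (to the best of my knowledge); together with `omitempty` it keeps each progress line down to the fields that are actually set. A toy illustration, with a hypothetical `update` type rather than the server's:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type update struct {
	Status string `json:"status,omitempty,omitzero"`
	Total  int64  `json:"total,omitempty,omitzero"`
}

func main() {
	b, _ := json.Marshal(update{Status: "pulling"})
	fmt.Println(string(b)) // {"status":"pulling"} — the zero Total is omitted
}
```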
@@ -348,14 +349,15 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
    case err := <-done:
        flushProgress()
        if err != nil {
            var status string
            if errors.Is(err, ollama.ErrModelNotFound) {
                status = fmt.Sprintf("error: model %q not found", p.model())
                return &serverError{
                    Status: 404,
                    Code: "not_found",
                    Message: fmt.Sprintf("model %q not found", p.model()),
                }
            } else {
                status = fmt.Sprintf("error: %v", err)
                return err
            }
            enc.Encode(progressUpdateJSON{Status: status})
            return nil
        }

        // Emulate old client pull progress (for now):
@@ -221,7 +221,7 @@ func TestServerPull(t *testing.T) {

    got = s.send(t, "POST", "/api/pull", `{"model": "unknown"}`)
    checkResponse(got, `
        {"status":"error: model \"unknown\" not found"}
        {"code":"not_found","error":"model \"unknown\" not found"}
    `)

    got = s.send(t, "DELETE", "/api/pull", `{"model": "smol"}`)
@@ -235,7 +235,7 @@ func TestServerPull(t *testing.T) {

    got = s.send(t, "POST", "/api/pull", `{"model": "://"}`)
    checkResponse(got, `
        {"status":"error: invalid or missing name: \"\""}
        {"code":"bad_request","error":"invalid or missing name: \"\""}
    `)

    // Non-streaming pulls
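A hedged sketch of what a client might observe after this change when pulling an unknown model, assuming the request reaches the new registry handler shown above: a 404 with a structured `code`/`error` body rather than a streamed `{"status":"error: ..."}` line. The `pullError` type is mine, not the server's.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

type pullError struct {
	Code  string `json:"code"`
	Error string `json:"error"`
}

func main() {
	resp, err := http.Post("http://localhost:11434/api/pull", "application/json",
		bytes.NewBufferString(`{"model": "unknown"}`))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		var e pullError
		if err := json.NewDecoder(resp.Body).Decode(&e); err == nil {
			fmt.Println(e.Code, "-", e.Error) // e.g. not_found - model "unknown" not found
		}
	}
}
```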
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
        {Role: "user", Content: "Hello!"},
    },
    Stream: &stream,
    Options: map[string]any{
        "num_ctx": 1024,
    },
})

if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
        {Role: "user", Content: "Hello!"},
    },
    Stream: &stream,
    Options: map[string]any{
        "num_ctx": 1024,
    },
})

if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
        {Role: "user", Content: "Help me write tests."},
    },
    Stream: &stream,
    Options: map[string]any{
        "num_ctx": 1024,
    },
})

if w.Code != http.StatusOK {
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3

// Default automatic value for parallel setting
// Model will still need to fit in VRAM. If this setting won't fit
// we'll back off down to 1 to try to get it to fit
var defaultParallel = 4
var defaultParallel = 2

var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {

// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
    if opts.NumCtx < 4 {
        opts.NumCtx = 4
    }

    req := &LlmRequest{
        ctx: c,
        model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
    }()
}

const (
    defaultContextLength  = 4096
    smallGpuContextLength = 2048
)

func (s *Scheduler) processPending(ctx context.Context) {
    for {
        select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
                gpus = s.getGpuFn()
            }

            if pending.origNumCtx == -1 {
                if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
                    slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
                    pending.opts.NumCtx = smallGpuContextLength
                    pending.origNumCtx = smallGpuContextLength
                } else {
                    pending.opts.NumCtx = defaultContextLength
                    pending.origNumCtx = defaultContextLength
                }
            }

            if envconfig.MaxRunners() <= 0 {
                // No user specified MaxRunners, so figure out what automatic setting to use
                // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
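An illustrative sketch (not the scheduler's actual code) of the default-selection rule this hunk implements: an explicit `OLLAMA_CONTEXT_LENGTH` wins; otherwise the default is 4096, dropping to 2048 when there is a single non-CPU GPU with at most 4 GiB of VRAM. The `gpu` type and `defaultNumCtx` helper below are made up for the example.

```go
package main

import "fmt"

type gpu struct {
	library     string
	totalMemory uint64
}

// defaultNumCtx mirrors the rule above: requested == -1 means "unset".
func defaultNumCtx(requested int64, gpus []gpu) int {
	if requested != -1 {
		return int(requested)
	}
	if len(gpus) == 1 && gpus[0].library != "cpu" && gpus[0].totalMemory <= 4096*1024*1024 {
		return 2048
	}
	return 4096
}

func main() {
	fmt.Println(defaultNumCtx(-1, []gpu{{library: "cuda", totalMemory: 4 << 30}}))  // 2048
	fmt.Println(defaultNumCtx(-1, []gpu{{library: "cuda", totalMemory: 24 << 30}})) // 4096
	fmt.Println(defaultNumCtx(8192, nil))                                           // 8192
}
```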
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
        successCh: make(chan *runnerRef, 1),
        errCh:     make(chan error, 1),
    }
    b.req.opts.NumCtx = 4096
    b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
    return b
}