From 8a75e9ee151933d71182d31125cec0fb821d0183 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 7 Aug 2025 02:33:09 +0800 Subject: [PATCH 01/17] Update downloading to pulling in api.md (#11170) update api.md to make it consist with code. https://github.com/ollama/ollama/blob/main/server/download.go#L447 --- docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 683db357..f11d59ed 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co ```json { - "status": "downloading digestname", + "status": "pulling digestname", "digest": "digestname", "total": 2142590208, "completed": 241970 From fa8be9e35ce88ef28cd59062f1c8b647a8261bfc Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 6 Aug 2025 13:31:22 -0700 Subject: [PATCH 02/17] clean up debugging (#11756) --- ml/backend/ggml/ggml.go | 56 ----------------------------------------- 1 file changed, 56 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 15c210dc..36fa5907 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -239,12 +239,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor { for _, bt := range bts { if _, ok := ctxs[bt]; !ok { - // slog.Info("XXX before ggml_init") ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors), no_alloc: true, }) - // slog.Info("XXX after ggml_init") } targets[t.source.Name] = append(targets[t.source.Name], t.target) @@ -543,8 +541,6 @@ func (b *Backend) NewContextSize(n int) ml.Context { var allocatedBuffers []*C.struct_ggml_backend_buffer - // slog.Info("XXX before ggml_init") - // defer slog.Info("XXX after ggml_init") return &Context{ b: b, maxGraphNodes: n, @@ -1407,55 +1403,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor { return t } - -// TODO - DRY this out with New if possible -func newTestBackend(size int) *Backend { - var cpus []*C.struct_ggml_backend_device - for _, d := range devices() { - switch C.ggml_backend_dev_type(d) { - case C.GGML_BACKEND_DEVICE_TYPE_CPU: - if len(cpus) == 0 { - // only the first cpu device should be used - cpus = append(cpus, d) - break - } - } - } - var schedBackends []*C.struct_ggml_backend - var schedBufts []*C.struct_ggml_backend_buffer_type - b := C.ggml_backend_dev_init(cpus[0], nil) - bt := C.ggml_backend_get_default_buffer_type(b) - C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU()))) - // C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING - schedBackends = append(schedBackends, b) - schedBufts = append(schedBufts, bt) - return &Backend{ - meta: nil, - sched: C.ggml_backend_sched_new( - (*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])), - (*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])), - C.int(len(schedBackends)), - C.size_t(max(8192, size)), - false, - false, - ), - input: bt, - maxGraphNodes: max(8192, size), - schedBackends: schedBackends, - schedBufts: schedBufts, - } -} - -func newTestContext(b *Backend, n int) *Context { - n = max(8192, n) - // slog.Info("XXX before ggml_init") - // defer slog.Info("XXX after ggml_init") - return &Context{ - b: b, - maxGraphNodes: n, - ctx: C.ggml_init(C.struct_ggml_init_params{ - mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), 
false), - no_alloc: true, - }), - } -} From 203c137810846865f6358b25c8937bc4b55dfda4 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Wed, 6 Aug 2025 15:50:30 -0700 Subject: [PATCH 03/17] openai: allow for content _and_ tool calls in the same message Previously our OpenAI chat completions compat layer assumed that tool calls and content would never be provided together, but this is not a correct assumption. Content is only optional when tool calls are present, but tool calls and content can be provided together Fixes: https://github.com/ollama/ollama/issues/11704 --- openai/openai.go | 29 ++++++++++++++++++++++++++++- openai/openai_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/openai/openai.go b/openai/openai.go index d065de8f..95486ef9 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -403,7 +403,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { for _, msg := range r.Messages { switch content := msg.Content.(type) { case string: - messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning}) + toolCalls, err := fromCompletionToolCall(msg.ToolCalls) + if err != nil { + return nil, err + } + messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls}) case []any: for _, c := range content { data, ok := c.(map[string]any) @@ -454,7 +458,17 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { return nil, errors.New("invalid message format") } } + // since we might have added multiple messages above, if we have tools + // calls we'll add them to the last message + if len(messages) > 0 && len(msg.ToolCalls) > 0 { + toolCalls, err := fromCompletionToolCall(msg.ToolCalls) + if err != nil { + return nil, err + } + messages[len(messages)-1].ToolCalls = toolCalls + } default: + // content is only optional if tool calls are present if msg.ToolCalls == nil { return nil, fmt.Errorf("invalid message content type: %T", content) } @@ -549,6 +563,19 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { }, nil } +func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) { + apiToolCalls := make([]api.ToolCall, len(toolCalls)) + for i, tc := range toolCalls { + apiToolCalls[i].Function.Name = tc.Function.Name + err := json.Unmarshal([]byte(tc.Function.Arguments), &apiToolCalls[i].Function.Arguments) + if err != nil { + return nil, errors.New("invalid tool call arguments") + } + } + + return apiToolCalls, nil +} + func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { options := make(map[string]any) diff --git a/openai/openai_test.go b/openai/openai_test.go index 471b4737..96a94f52 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -235,6 +235,45 @@ func TestChatMiddleware(t *testing.T) { Stream: &False, }, }, + { + name: "chat handler with tools and content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: 
"assistant", + Content: "Let's see what the weather is like in Paris", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, { name: "chat handler with streaming tools", body: `{ From 44bc36d06301bbc23ea3cd4af935e24cfb945f33 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Wed, 6 Aug 2025 16:55:57 -0700 Subject: [PATCH 04/17] docs: update the faq (#11760) --- docs/faq.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index a6ad6f6e..900ffba4 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md). ## How can I specify the context window size? -By default, Ollama uses a context window size of 4096 tokens. +By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens. -This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: +This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: ```shell OLLAMA_CONTEXT_LENGTH=8192 ollama serve @@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{ }' ``` +Setting the context length higher may cause the model to not be able to fit onto the GPU which make the model run more slowly. + ## How can I tell if my model was loaded onto the GPU? Use the `ollama ps` command to see what models are currently loaded into memory. @@ -57,8 +59,8 @@ ollama ps > **Output**: > > ``` -> NAME ID SIZE PROCESSOR UNTIL -> llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now +> NAME ID SIZE PROCESSOR CONTEXT UNTIL +> gpt-oss:20b 05afbac4bad6 16 GB 100% GPU 8192 4 minutes from now > ``` The `Processor` column will show which memory the model was loaded in to: @@ -148,9 +150,11 @@ docker build -t ollama-with-ca . docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca ``` -## Does Ollama send my prompts and answers back to ollama.com? +## Does Ollama send my prompts and responses back to ollama.com? -No. Ollama runs locally, and conversation data does not leave your machine. +If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored. + +If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine. ## How can I expose Ollama on my network? @@ -345,4 +349,4 @@ Ollama for Windows and macOS register as a login item during installation. 
You - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove **MacOS Ventura (v13) and later** -- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable. \ No newline at end of file +- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable. From 759dd78dd600ebf751ff922939256d4f1ec2394d Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Wed, 6 Aug 2025 17:00:24 -0700 Subject: [PATCH 05/17] openai: when converting role=tool messages, propagate the tool name Added support for converting both `name` and `tool_call_id` fields, which different clients might provide. `name` is a legacy field from the OpenAI completions API. For `tool_call_id` we inspect previous messages and look for a matching tool call ID and grab its name Issue: https://github.com/ollama/ollama/issues/11704 --- openai/openai.go | 36 +++++++++++++++--- openai/openai_test.go | 88 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 5 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 95486ef9..17ef6e82 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -34,10 +34,12 @@ type ErrorResponse struct { } type Message struct { - Role string `json:"role"` - Content any `json:"content"` - Reasoning string `json:"reasoning,omitempty"` - ToolCalls []ToolCall `json:"tool_calls,omitempty"` + Role string `json:"role"` + Content any `json:"content"` + Reasoning string `json:"reasoning,omitempty"` + ToolCalls []ToolCall `json:"tool_calls,omitempty"` + Name string `json:"name,omitempty"` + ToolCallID string `json:"tool_call_id,omitempty"` } type Choice struct { @@ -401,13 +403,20 @@ func toModel(r api.ShowResponse, m string) Model { func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var messages []api.Message for _, msg := range r.Messages { + toolName := "" + if strings.ToLower(msg.Role) == "tool" { + toolName = msg.Name + if toolName == "" && msg.ToolCallID != "" { + toolName = nameFromToolCallID(r.Messages, msg.ToolCallID) + } + } switch content := msg.Content.(type) { case string: toolCalls, err := fromCompletionToolCall(msg.ToolCalls) if err != nil { return nil, err } - messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls}) + messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName}) case []any: for _, c := range content { data, ok := c.(map[string]any) @@ -466,6 +475,9 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { return nil, err } messages[len(messages)-1].ToolCalls = toolCalls + if toolName != "" { + messages[len(messages)-1].ToolName = toolName + } } default: // content is only optional if tool calls are present @@ -563,6 +575,20 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { }, nil } +func nameFromToolCallID(messages []Message, toolCallID string) string { + // iterate backwards to be more resilient to duplicate tool call IDs (this + // follows "last one wins") + for i := len(messages) - 1; i >= 0; i-- { + msg := messages[i] + for _, tc := range msg.ToolCalls { + if tc.ID == toolCallID { + return tc.Function.Name + } + } + } + return "" +} + func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) { apiToolCalls := 
make([]api.ToolCall, len(toolCalls)) for i, tc := range toolCalls { diff --git a/openai/openai_test.go b/openai/openai_test.go index 96a94f52..83057135 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -274,6 +274,94 @@ func TestChatMiddleware(t *testing.T) { Stream: &False, }, }, + { + name: "tool response with call ID", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}, + {"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + { + Role: "tool", + Content: "The weather in Paris is 20 degrees Celsius", + ToolName: "get_current_weather", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "tool response with name", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}, + {"role": "tool", "name": "get_current_weather", "content": "The weather in Paris is 20 degrees Celsius"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + { + Role: "tool", + Content: "The weather in Paris is 20 degrees Celsius", + ToolName: "get_current_weather", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, { name: "chat handler with streaming tools", body: `{ From 735c41f9ca38fd2507c3e8e93efe6bbb94455a6f Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Wed, 6 Aug 2025 18:54:20 -0700 Subject: [PATCH 06/17] openai: always provide reasoning We were missing passing along thinking if content was nil (as opposed to empty string) Also added a test for content not being passed, which was the real cause of , since with the way `Content` is typed, not passing it and empty string are distinct --- openai/openai.go | 3 +- openai/openai_test.go | 77 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/openai/openai.go b/openai/openai.go index 17ef6e82..50fdb81e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -478,6 +478,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { if toolName != "" { messages[len(messages)-1].ToolName = toolName } + messages[len(messages)-1].Thinking = msg.Reasoning } default: // content is only optional if tool calls are present @@ -493,7 +494,7 @@ func fromChatRequest(r 
ChatCompletionRequest) (*api.ChatRequest, error) { return nil, errors.New("invalid tool call arguments") } } - messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls}) + messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls}) } } diff --git a/openai/openai_test.go b/openai/openai_test.go index 83057135..0d7f016b 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -274,6 +274,83 @@ func TestChatMiddleware(t *testing.T) { Stream: &False, }, }, + { + name: "chat handler with tools and empty content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "chat handler with tools and thinking content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + Thinking: "Let's see what the weather is like in Paris", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, { name: "tool response with call ID", body: `{ From f2e9c9aff5f59b21a5d9a9668408732b3de01e20 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 7 Aug 2025 13:49:26 -0700 Subject: [PATCH 07/17] server: Reduce gpt-oss context length for small VRAM GPUs gpt-oss works best with a context length of at least 8k. However, for GPUs with limited amount of VRAM, there is a significant performance hit to this increased context. 
In these cases, we switch to the Ollama default of 4k --- server/routes.go | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/server/routes.go b/server/routes.go index 991e9200..acefea31 100644 --- a/server/routes.go +++ b/server/routes.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" @@ -50,11 +51,16 @@ func experimentEnabled(name string) bool { var useClient2 = experimentEnabled("client2") +// Low VRAM mode is based on the sum of total VRAM (not free) and triggers +// reduced context length on some models +var lowVRAMThreshold uint64 = 20 * format.GibiByte + var mode string = gin.DebugMode type Server struct { - addr net.Addr - sched *Scheduler + addr net.Addr + sched *Scheduler + lowVRAM bool } func init() { @@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } - // This model requires a minimum context to function effectively - if slices.Contains(model.Config.ModelFamilies, "gptoss") { + // This model is much more capable with a larger context, so set that + // unless it would penalize performance too much + if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") { opts.NumCtx = max(opts.NumCtx, 8192) } @@ -1382,6 +1389,15 @@ func Serve(ln net.Listener) error { gpus := discover.GetGPUInfo() gpus.LogDetails() + var totalVRAM uint64 + for _, gpu := range gpus { + totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead() + } + if totalVRAM < lowVRAMThreshold { + s.lowVRAM = true + slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold)) + } + err = srvr.Serve(ln) // If server is closed from the signal handler, wait for the ctx to be done // otherwise error out quickly From 114c3f22657750cfb57f70c4a0d6e7389fb7a9fe Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 7 Aug 2025 15:06:57 -0700 Subject: [PATCH 08/17] tests: add integration coverage for oss-gpt (#11696) Also wires up support to override the default "smol" model --- integration/utils_test.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/integration/utils_test.go b/integration/utils_test.go index 3d726123..727825a4 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -28,7 +28,7 @@ import ( "github.com/stretchr/testify/require" ) -const ( +var ( smol = "llama3.2:1b" ) @@ -37,6 +37,7 @@ var ( // Note: add newer models at the top of the list to test them first ollamaEngineChatModels = []string{ + "gpt-oss:20b", "gemma3n:e2b", "mistral-small3.2:latest", "deepseek-r1:1.5b", @@ -126,6 +127,7 @@ var ( "gemma3n", "glm4", "goliath", + "gpt-oss:20b", "granite-code", "granite3-dense", "granite3-guardian", @@ -255,8 +257,13 @@ var ( } ) -func Init() { +func init() { lifecycle.InitLogging() + custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL") + if custom != "" { + slog.Info("setting smol test model to " + custom) + smol = custom + } } func FindPort() string { From d7f4f788d1c0f51919b417a2a1e6cb5dc7553773 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 6 Aug 2025 11:39:08 -0700 Subject: [PATCH 09/17] ggml: Use GGML's typedef'ed pointer types For many backend data structures, GGML defines a typedef of a pointer type and returns these from functions. 
In most cases, CGo understands that these are interchangable but some parts of Go (such as generics) think they are two different types. We should prefer the form that GGML uses. --- ml/backend/ggml/ggml.go | 48 ++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 36fa5907..a0bca1c8 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -33,9 +33,9 @@ import ( "golang.org/x/sync/errgroup" ) -func devices() []*C.struct_ggml_backend_device { +func devices() []C.ggml_backend_dev_t { ggml.OnceLoad() - ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count()) + ds := make([]C.ggml_backend_dev_t, C.ggml_backend_dev_count()) for i := range ds { ds[i] = C.ggml_backend_dev_get(C.size_t(i)) } @@ -53,23 +53,23 @@ type Backend struct { // to the name that is used by the model definition tensorLoadTargets map[string][]string - sched *C.struct_ggml_backend_sched - schedBackends []*C.struct_ggml_backend - schedBufts []*C.struct_ggml_backend_buffer_type + sched C.ggml_backend_sched_t + schedBackends []C.ggml_backend_t + schedBufts []C.ggml_backend_buffer_type_t tensors map[string]*C.struct_ggml_tensor // input is the backend used for inputs - input *C.struct_ggml_backend_buffer_type + input C.ggml_backend_buffer_type_t // layers is the backend used for repeating layers - layers map[int]*C.struct_ggml_backend_buffer_type + layers map[int]C.ggml_backend_buffer_type_t // requiredMemory is the cumulative memory allocations needed by the backend requiredMemory *ml.BackendMemory // btDeviceMemory maps from a buffer type to the memory allocations associated with that device - btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory + btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory flashAttention bool @@ -100,14 +100,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ) var requiredMemory ml.BackendMemory - btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory) + btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory) type deviceBufferType struct { - d *C.struct_ggml_backend_device - bts []*C.struct_ggml_backend_buffer_type + d C.ggml_backend_dev_t + bts []C.ggml_backend_buffer_type_t } - var cpus, accels, gpus []*C.struct_ggml_backend_device + var cpus, accels, gpus []C.ggml_backend_dev_t for _, d := range devices() { switch C.ggml_backend_dev_type(d) { case C.GGML_BACKEND_DEVICE_TYPE_CPU: @@ -149,7 +149,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { bt := C.ggml_backend_dev_buffer_type(d) gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ d: d, - bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...), + bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...), }) btDeviceMemory[bt] = &requiredMemory.GPUs[i] requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) @@ -235,8 +235,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { targets := make(map[string][]string) // contexts are shared by tensors of the same buffer type - ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context) - createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor { + ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context) + createTensor := func(t tensor, bts 
[]C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor { for _, bt := range bts { if _, ok := ctxs[bt]; !ok { ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ @@ -330,7 +330,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } // allocate buffers for each context - bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs)) + bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs)) for bt, c := range ctxs { if C.ggml_get_first_tensor(c) == nil { continue @@ -388,11 +388,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } // map devices to backend buffer types so new tensors can be assigned to the correct device - deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type) + deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t) // create backends and buffer types used for the compute graph scheduler - var schedBackends []*C.struct_ggml_backend - var schedBufts []*C.struct_ggml_backend_buffer_type + var schedBackends []C.ggml_backend_t + var schedBufts []C.ggml_backend_buffer_type_t for _, d := range append(gpus, append(accels, cpus...)...) { b := C.ggml_backend_dev_init(d, nil) bt := C.ggml_backend_get_default_buffer_type(b) @@ -426,8 +426,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { schedBackends: schedBackends, schedBufts: schedBufts, input: deviceBufferTypes[input.d], - layers: func() map[int]*C.struct_ggml_backend_buffer_type { - m := make(map[int]*C.struct_ggml_backend_buffer_type) + layers: func() map[int]C.ggml_backend_buffer_type_t { + m := make(map[int]C.ggml_backend_buffer_type_t) for i, layer := range layers { m[i] = deviceBufferTypes[layer.d] } @@ -539,7 +539,7 @@ func (b *Backend) NewContextSize(n int) ml.Context { panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes)) } - var allocatedBuffers []*C.struct_ggml_backend_buffer + var allocatedBuffers []C.ggml_backend_buffer_t return &Context{ b: b, @@ -568,11 +568,11 @@ type Context struct { graph *C.struct_ggml_cgraph // buft is the buffer type used for new tensors - buft *C.struct_ggml_backend_buffer_type + buft C.ggml_backend_buffer_type_t // allocatedBuffers are buffers for tensors that we have allocated in this context // so that we can free them when we close the context - allocatedBuffers *[]*C.struct_ggml_backend_buffer + allocatedBuffers *[]C.ggml_backend_buffer_t // maxGraphNodes is the maximum allowed number of graph nodes in this context maxGraphNodes int From 756c78cfc77902130385e14b5d4a35a99d06497b Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 17 Apr 2025 17:12:01 -0700 Subject: [PATCH 10/17] ggml: Support closing backends In order to iteratively find the best memory allocation, we need to be able to free backend memory so we can try again. 
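
A minimal, self-contained Go sketch of the retry-on-OOM pattern this enables follows. The names here (`backend`, `tryInit`, `loadWithRetry`, `errNoMem`) are illustrative stand-ins, not the real Ollama APIs; in the actual change, `Close()` is added to the `ml.Backend` interface and the runner calls it (together with `InputCache.Close()`) when initialization fails with `ml.ErrNoMem`, as shown in the diff below.

```go
package main

import (
	"errors"
	"fmt"
)

// errNoMem stands in for ml.ErrNoMem.
var errNoMem = errors.New("insufficient memory")

// backend stands in for ml.Backend; Close is the method this patch adds.
type backend struct{ gpuLayers int }

// Close frees whatever the (possibly partial) initialization allocated.
func (b *backend) Close() { fmt.Printf("freed backend (%d GPU layers)\n", b.gpuLayers) }

// tryInit pretends that only configurations with at most 8 GPU layers fit.
func tryInit(gpuLayers int) (*backend, error) {
	b := &backend{gpuLayers: gpuLayers}
	if gpuLayers > 8 {
		return b, errNoMem
	}
	return b, nil
}

// loadWithRetry frees the failed attempt with Close and retries with a
// smaller allocation until the model fits or no configuration is left.
func loadWithRetry(maxGPULayers int) (*backend, error) {
	for layers := maxGPULayers; layers >= 0; layers-- {
		b, err := tryInit(layers)
		if err == nil {
			return b, nil
		}
		if errors.Is(err, errNoMem) {
			b.Close()
			continue
		}
		return nil, err
	}
	return nil, errors.New("model does not fit in memory")
}

func main() {
	b, err := loadWithRetry(16)
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	defer b.Close()
	fmt.Printf("loaded with %d GPU layers\n", b.gpuLayers)
}
```
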
--- ml/backend.go | 3 ++ ml/backend/ggml/ggml.go | 79 ++++++++++++++++++++++++----------- runner/ollamarunner/cache.go | 4 ++ runner/ollamarunner/runner.go | 9 ++++ 4 files changed, 71 insertions(+), 24 deletions(-) diff --git a/ml/backend.go b/ml/backend.go index fcb7db5e..6e76d32d 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -15,6 +15,9 @@ import ( ) type Backend interface { + // Close frees all memory associated with this backend + Close() + Load(ctx context.Context, progress func(float32)) error // BackendMemory returns the memory allocations that were made for this model diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index a0bca1c8..aa241e9b 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -19,6 +19,7 @@ import ( "slices" "strconv" "strings" + "sync" "sync/atomic" "unicode" "unsafe" @@ -33,15 +34,33 @@ import ( "golang.org/x/sync/errgroup" ) -func devices() []C.ggml_backend_dev_t { - ggml.OnceLoad() - ds := make([]C.ggml_backend_dev_t, C.ggml_backend_dev_count()) - for i := range ds { - ds[i] = C.ggml_backend_dev_get(C.size_t(i)) - } +var ( + cpus, accels, gpus []C.ggml_backend_dev_t + backends map[C.ggml_backend_dev_t]C.ggml_backend_t +) - return ds -} +var initDevices = sync.OnceFunc(func() { + ggml.OnceLoad() + + backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t) + for i := range C.ggml_backend_dev_count() { + d := C.ggml_backend_dev_get(i) + + switch C.ggml_backend_dev_type(d) { + case C.GGML_BACKEND_DEVICE_TYPE_CPU: + if len(cpus) == 0 { + // only the first cpu device should be used + cpus = append(cpus, d) + } + case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: + accels = append(accels, d) + case C.GGML_BACKEND_DEVICE_TYPE_GPU: + gpus = append(gpus, d) + } + + backends[d] = C.ggml_backend_dev_init(d, nil) + } +}) type Backend struct { // modelPath is the location of the model data @@ -75,6 +94,9 @@ type Backend struct { // maxGraphNodes is the maximum allowed number of graph nodes in this scheduler maxGraphNodes int + + // weightBuffers are the GGML contexts and buffers for allocating weights + weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t } func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { @@ -99,6 +121,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { "num_key_values", len(meta.KV()), ) + initDevices() + var requiredMemory ml.BackendMemory btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory) @@ -107,21 +131,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { bts []C.ggml_backend_buffer_type_t } - var cpus, accels, gpus []C.ggml_backend_dev_t - for _, d := range devices() { - switch C.ggml_backend_dev_type(d) { - case C.GGML_BACKEND_DEVICE_TYPE_CPU: - if len(cpus) == 0 { - // only the first cpu device should be used - cpus = append(cpus, d) - } - case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: - accels = append(accels, d) - case C.GGML_BACKEND_DEVICE_TYPE_GPU: - gpus = append(gpus, d) - } - } - blocks := int(meta.KV().BlockCount()) // create list of buffer types for the cpu @@ -348,6 +357,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } if b == nil { + for _, b := range bbs { + C.ggml_backend_buffer_free(b) + } + + for _, ctx := range ctxs { + C.ggml_free(ctx) + } + panic(ml.ErrNoMem{BackendMemory: requiredMemory}) } @@ -394,7 +411,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { var schedBackends []C.ggml_backend_t var schedBufts []C.ggml_backend_buffer_type_t for _, 
d := range append(gpus, append(accels, cpus...)...) { - b := C.ggml_backend_dev_init(d, nil) + b := backends[d] bt := C.ggml_backend_get_default_buffer_type(b) deviceBufferTypes[d] = bt @@ -436,6 +453,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { requiredMemory: &requiredMemory, btDeviceMemory: btDeviceMemory, maxGraphNodes: maxGraphNodes, + weightBuffers: bbs, }, nil } @@ -443,6 +461,19 @@ func init() { ml.RegisterBackend("ggml", New) } +func (b *Backend) Close() { + if b == nil { + return + } + + for ctx, b := range b.weightBuffers { + C.ggml_backend_buffer_free(b) + C.ggml_free(ctx) + } + + C.ggml_backend_sched_free(b.sched) +} + func (b *Backend) Load(ctx context.Context, progress func(float32)) error { var doneBytes atomic.Uint64 totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 43880a41..8c8a29d8 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -70,6 +70,10 @@ func kvCacheTypeFromStr(s string) ml.DType { } func (c *InputCache) Close() { + if c == nil { + return + } + c.cache.Close() } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a7a889f1..cebe30de 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -877,6 +877,15 @@ func (s *Server) load( ) { err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache) if err != nil { + var noMem ml.ErrNoMem + if errors.As(err, &noMem) { + // We can't yet handle this but in the future we will + s.cache.Close() + if s.model != nil { + s.model.Backend().Close() + } + } + panic(err) } From 79f6376f5b4d1a27254ae2c34188bbf9bd2087da Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 23 Jul 2025 14:18:24 -0700 Subject: [PATCH 11/17] ggml: No-alloc mode Callers can set a backend buffer type to be no-alloc, meaning that it does not allocate memory for tensors or operations. This can be used for calculating memory requirements. Tensors and graphs must be recreated with no-alloc set to false before loading data. Defaults to false for newly created backend buffer types. --- llama/patches/0026-ggml-No-alloc-mode.patch | 99 ++++++++++++++++++++ ml/backend/ggml/ggml/include/ggml-backend.h | 1 + ml/backend/ggml/ggml/src/ggml-backend-impl.h | 2 + ml/backend/ggml/ggml/src/ggml-backend.cpp | 19 +++- 4 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 llama/patches/0026-ggml-No-alloc-mode.patch diff --git a/llama/patches/0026-ggml-No-alloc-mode.patch b/llama/patches/0026-ggml-No-alloc-mode.patch new file mode 100644 index 00000000..2a8dd07e --- /dev/null +++ b/llama/patches/0026-ggml-No-alloc-mode.patch @@ -0,0 +1,99 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Wed, 23 Jul 2025 11:58:49 -0700 +Subject: [PATCH] ggml: No-alloc mode + +Callers can set a backend buffer type to be no-alloc, meaning that +it does not allocate memory for tensors or operations. This can +be used for calculating memory requirements. Tensors and graphs +must be recreated with no-alloc set to false before loading data. + +Defaults to false for newly created backend buffer types. 
+--- + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-backend-impl.h | 2 ++ + ggml/src/ggml-backend.cpp | 19 ++++++++++++++++++- + 3 files changed, 21 insertions(+), 1 deletion(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 48839339..3903c3cb 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -35,6 +35,7 @@ extern "C" { + // + + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); ++ GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index c36c12d6..81749a5a 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -32,6 +32,7 @@ extern "C" { + struct ggml_backend_buffer_type_i iface; + ggml_backend_dev_t device; + void * context; ++ bool no_alloc; + }; + + // +@@ -63,6 +64,7 @@ extern "C" { + void * context; + size_t size; + enum ggml_backend_buffer_usage usage; ++ bool no_alloc; + }; + + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index be335e8c..84928bc3 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); + } + ++void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) { ++ buft->no_alloc = !alloc; ++} ++ + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + if (size == 0) { + // return a dummy buffer for zero-sized allocations + return ggml_backend_buffer_init(buft, {}, NULL, 0); + } + ++ if (buft->no_alloc) { ++ ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size); ++ buf->no_alloc = true; ++ return buf; ++ } ++ + return buft->iface.alloc_buffer(buft, size); + } + +@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init( + /* .buft = */ buft, + /* .context = */ context, + /* .size = */ size, +- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY ++ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY, ++ /* .no_alloc = */ false + }; + + return buffer; +@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return NULL; + } + ++ // If we aren't allocating memory, return a placeholder non-NULL pointer ++ // that meets alignment requirements ++ if (buffer->no_alloc) { ++ return (void *)ggml_backend_buffer_get_alignment(buffer); ++ } ++ + void * base = buffer->iface.get_base(buffer); + + GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 48839339..3903c3cb 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -35,6 +35,7 @@ extern "C" { // GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API size_t 
ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index c36c12d6..81749a5a 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -32,6 +32,7 @@ extern "C" { struct ggml_backend_buffer_type_i iface; ggml_backend_dev_t device; void * context; + bool no_alloc; }; // @@ -63,6 +64,7 @@ extern "C" { void * context; size_t size; enum ggml_backend_buffer_usage usage; + bool no_alloc; }; GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index be335e8c..84928bc3 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } +void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) { + buft->no_alloc = !alloc; +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { if (size == 0) { // return a dummy buffer for zero-sized allocations return ggml_backend_buffer_init(buft, {}, NULL, 0); } + if (buft->no_alloc) { + ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size); + buf->no_alloc = true; + return buf; + } + return buft->iface.alloc_buffer(buft, size); } @@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, - /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY, + /* .no_alloc = */ false }; return buffer; @@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return NULL; } + // If we aren't allocating memory, return a placeholder non-NULL pointer + // that meets alignment requirements + if (buffer->no_alloc) { + return (void *)ggml_backend_buffer_get_alignment(buffer); + } + void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); From 2c776f07805022221e7640a643777818528d0a27 Mon Sep 17 00:00:00 2001 From: Michael Vorburger Date: Sun, 10 Aug 2025 03:12:30 +0200 Subject: [PATCH 12/17] CONTRIBUTING: Explicitly note docs:... as a good example (#11755) --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 455e7c69..10190e04 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,6 +66,7 @@ Examples: llm/backend/mlx: support the llama architecture CONTRIBUTING: provide clarity on good commit messages, and bad + docs: simplify manual installation with shorter curl commands Bad Examples: From ea7657b54a000b9cf381e6e83463f50aaa40a161 Mon Sep 17 00:00:00 2001 From: Daniel Andersen Date: Mon, 11 Aug 2025 22:59:38 +0200 Subject: [PATCH 13/17] sched: Add support for grouping GPUs (#10678) This patch modifies Ollama to allow grouping GPUs to memory-fit to the requested model, instead of the former algorithm of using one GPU distributing over all available GPUs. Benefits: - Lower amount of (PCIe-)bus communication between GPUs - especially when they are not very high speed - Allowing unallocated GPUs to get into power-saving mode. 
- Significantly reduce VRAM allocation when using more than 2 GPUs in a system - Due to the reduced memory allocation, you can run more models simultaneously. --- server/sched.go | 58 +++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/server/sched.go b/server/sched.go index 2842bb3a..40e6e5f7 100644 --- a/server/sched.go +++ b/server/sched.go @@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool { // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust // opts.NumCtx accordingly func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { - var estimatedVRAM uint64 - var numParallelToTry []int if *numParallel <= 0 { // If no specific parallel setting was provided, try larger then smaller, always end with 1 @@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn } for _, gl := range gpus.ByLibrary() { - var ok bool sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...) // TODO - potentially sort by performance capability, existing models loaded, etc. // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them - // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups + // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) - // First attempt to fit the model into a single GPU - for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCtx * p - if !envconfig.SchedSpread() { - for _, g := range sgl { - if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { - slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) + if !envconfig.SchedSpread() { + for _, p := range numParallelToTry { + req.opts.NumCtx = req.origNumCtx * p + // Try to pack into as few GPUs as possible, starting from 1 GPU + for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ { + gpuSubset := sgl[:numGPUs] + ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p) + + if ok { + slog.Info("new model will fit in available VRAM across minimum required GPUs, loading", + "model", req.model.ModelPath, + "library", sgl[0].Library, + "parallel", p, + "required", format.HumanBytes2(estimatedVRAM), + "gpus", numGPUs) *numParallel = p - return []discover.GpuInfo{g} + return gpuSubset } } } - } + } else { + // TODO future refinements + // - if multiple Libraries, see if any single GPU in any Library will fit + // - try subsets of GPUs instead of just falling back to 1 or all in a family - // TODO future refinements - // - if multiple Libraries, see if any single GPU in any Library will fit - // - try subsets of GPUs instead of just falling back to 1 or all in a family - - // Now try all the GPUs - for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCtx * p - if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { - slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, 
"required", format.HumanBytes2(estimatedVRAM)) - *numParallel = p - return sgl + // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set) + for _, p := range numParallelToTry { + req.opts.NumCtx = req.origNumCtx * p + if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { + slog.Info("new model will fit in available VRAM, loading", + "model", req.model.ModelPath, + "library", sgl[0].Library, + "parallel", p, + "required", format.HumanBytes2(estimatedVRAM), + "gpus", len(sgl)) + *numParallel = p + return sgl + } } } } From ee04dbba51a4299b6ff4bb19f758eeacbf2b35d8 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Mon, 11 Aug 2025 14:09:13 -0700 Subject: [PATCH 14/17] server: fix error when parsing bad harmony tool calls Thanks @moll for reporting! Fixes: #11781 --- server/routes.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/routes.go b/server/routes.go index 991e9200..8d5ca12d 100644 --- a/server/routes.go +++ b/server/routes.go @@ -364,7 +364,8 @@ func (s *Server) GenerateHandler(c *gin.Context) { *toolName = strings.TrimPrefix(*toolName, "functions.") var args api.ToolCallFunctionArguments if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - ch <- gin.H{"error parsing tool call": err.Error()} + errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) + ch <- gin.H{"error": errStr} return } @@ -1655,7 +1656,8 @@ func (s *Server) ChatHandler(c *gin.Context) { *toolName = strings.TrimPrefix(*toolName, "functions.") var args api.ToolCallFunctionArguments if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - ch <- gin.H{"error parsing tool call": err.Error()} + errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) + ch <- gin.H{"error": errStr} return } res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}} From 8f4ec9ab289fd2a1f96384926a7f7bfd888d4ef9 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 11 Aug 2025 14:45:45 -0700 Subject: [PATCH 15/17] discover: CPU supports flash attention We already run flash attention on CPUs in cases where we have partial offloading but were disabling it if running on pure CPU, which is unnecessary. 
--- discover/types.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/discover/types.go b/discover/types.go index c5212d94..13a030fd 100644 --- a/discover/types.go +++ b/discover/types.go @@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int { // For each GPU, check if it does NOT support flash attention func (l GpuInfoList) FlashAttentionSupported() bool { for _, gpu := range l { - supportsFA := gpu.Library == "metal" || + supportsFA := gpu.Library == "cpu" || + gpu.Library == "metal" || (gpu.Library == "cuda" && gpu.DriverMajor >= 7) || gpu.Library == "rocm" From d0cf6c82811c2268a396888347ff95087a618d56 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 12 Aug 2025 11:02:01 -0700 Subject: [PATCH 16/17] fix(openai): handle reasoning_effort (#11868) --- api/types.go | 8 ++++---- openai/openai.go | 11 +++++++---- server/prompt.go | 8 ++++---- server/routes.go | 16 ++++++++-------- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/api/types.go b/api/types.go index 0f99de18..0309ebbe 100644 --- a/api/types.go +++ b/api/types.go @@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool { return ok } -// AsBool returns the value as a bool (true if enabled in any way) -func (t *ThinkValue) AsBool() bool { +// Bool returns the value as a bool (true if enabled in any way) +func (t *ThinkValue) Bool() bool { if t == nil || t.Value == nil { return false } @@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool { } } -// AsString returns the value as a string -func (t *ThinkValue) AsString() string { +// String returns the value as a string +func (t *ThinkValue) String() string { if t == nil || t.Value == nil { return "" } diff --git a/openai/openai.go b/openai/openai.go index 50fdb81e..13b9c425 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -103,6 +103,7 @@ type ChatCompletionRequest struct { ResponseFormat *ResponseFormat `json:"response_format"` Tools []api.Tool `json:"tools"` Reasoning *Reasoning `json:"reasoning,omitempty"` + ReasoningEffort *string `json:"reasoning_effort,omitempty"` } type ChatCompletion struct { @@ -541,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { options["top_p"] = 1.0 } - if r.Reasoning != nil { - options["reasoning"] = *r.Reasoning.Effort - } - var format json.RawMessage if r.ResponseFormat != nil { switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) { @@ -560,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var think *api.ThinkValue if r.Reasoning != nil { + options["reasoning"] = *r.Reasoning.Effort think = &api.ThinkValue{ Value: *r.Reasoning.Effort, } + } else if r.ReasoningEffort != nil { + options["reasoning"] = *r.ReasoningEffort + think = &api.ThinkValue{ + Value: *r.ReasoningEffort, + } } return &api.ChatRequest{ diff --git a/server/prompt.go b/server/prompt.go index 5d6c3e27..f1d8020e 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. thinkVal := false thinkLevel := "" if think != nil { - thinkVal = think.AsBool() - thinkLevel = think.AsString() + thinkVal = think.Bool() + thinkLevel = think.String() } var b bytes.Buffer if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil { @@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
thinkVal := false thinkLevel := "" if think != nil { - thinkVal = think.AsBool() - thinkLevel = think.AsString() + thinkVal = think.Bool() + thinkLevel = think.String() } if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil { return "", nil, err diff --git a/server/routes.go b/server/routes.go index d8d1e301..3c044cd0 100644 --- a/server/routes.go +++ b/server/routes.go @@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { // Validate Think value: string values currently only allowed for gptoss models if req.Think != nil && req.Think.IsString() && !useHarmony { - c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())}) + c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())}) return } @@ -213,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { if req.Suffix != "" { caps = append(caps, model.CapabilityInsert) } - if req.Think != nil && req.Think.AsBool() { + if req.Think != nil && req.Think.Bool() { caps = append(caps, model.CapabilityThinking) // TODO(drifkin): consider adding a warning if it's false and the model // doesn't support thinking. It's not strictly required, but it can be a @@ -288,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) } - values.Think = req.Think != nil && req.Think.AsBool() + values.Think = req.Think != nil && req.Think.Bool() values.ThinkLevel = "" if req.Think != nil { - values.ThinkLevel = req.Think.AsString() + values.ThinkLevel = req.Think.String() } values.IsThinkSet = req.Think != nil @@ -317,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { var thinkingState *thinking.Parser if !useHarmony { openingTag, closingTag := thinking.InferTags(m.Template.Template) - if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" { + if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" { thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, @@ -1547,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) { if len(req.Tools) > 0 { caps = append(caps, model.CapabilityTools) } - if req.Think != nil && req.Think.AsBool() { + if req.Think != nil && req.Think.Bool() { caps = append(caps, model.CapabilityThinking) } @@ -1601,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) { // Validate Think value: string values currently only allowed for gptoss models if req.Think != nil && req.Think.IsString() && !useHarmony { - c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())}) + c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())}) return } @@ -1620,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) { var thinkingState *thinking.Parser openingTag, closingTag := thinking.InferTags(m.Template.Template) - if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" { + if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" { thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, From a343ae53a4fa0f75a37e1653df3b61a9ed1b843d Mon Sep 17 00:00:00 2001 From: Jesse 
Gross Date: Mon, 11 Aug 2025 17:01:07 -0700 Subject: [PATCH 17/17] ggml: Use ordinal IDs for AMD GPUs on Linux when UUID is unavailable Some AMD GPUs do not provide UUIDs and report only "XX". In these cases, we should use the ordinal ID as an alternate identifier. This is the same as we always need to do on Windows for AMD. In addition, this prints out the ID for each GPU when enumerating them for easier debugging in the future. --- .../patches/0017-ggml-Export-GPU-UUIDs.patch | 136 ++++++++++++------ .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 92 +++++++----- 2 files changed, 148 insertions(+), 80 deletions(-) diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch index b7d56b0d..2bd938a3 100644 --- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch +++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch @@ -7,12 +7,12 @@ This enables matching up devices and information reported by the backend with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). --- ggml/include/ggml-backend.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++ + ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++--- ggml/src/ggml-metal/ggml-metal.m | 1 + - 3 files changed, 41 insertions(+) + 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 74e46716..48839339 100644 +index 74e467163..48839339d 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -152,6 +152,7 @@ extern "C" { @@ -24,10 +24,93 @@ index 74e46716..48839339 100644 size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..d6960174 100644 +index cb0d8528d..1492368de 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { +@@ -173,6 +173,51 @@ static int ggml_cuda_parse_id(char devName[]) { + } + #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + ++static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { ++ char id[64]; ++ ++ #if !defined(GGML_USE_HIP) ++ snprintf(id, sizeof(id), ++ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ (unsigned char)prop.uuid.bytes[0], ++ (unsigned char)prop.uuid.bytes[1], ++ (unsigned char)prop.uuid.bytes[2], ++ (unsigned char)prop.uuid.bytes[3], ++ (unsigned char)prop.uuid.bytes[4], ++ (unsigned char)prop.uuid.bytes[5], ++ (unsigned char)prop.uuid.bytes[6], ++ (unsigned char)prop.uuid.bytes[7], ++ (unsigned char)prop.uuid.bytes[8], ++ (unsigned char)prop.uuid.bytes[9], ++ (unsigned char)prop.uuid.bytes[10], ++ (unsigned char)prop.uuid.bytes[11], ++ (unsigned char)prop.uuid.bytes[12], ++ (unsigned char)prop.uuid.bytes[13], ++ (unsigned char)prop.uuid.bytes[14], ++ (unsigned char)prop.uuid.bytes[15] ++ ); ++ #else ++ #ifdef _WIN32 ++ snprintf(id, sizeof(id), "%d", device_num); ++ #else ++ try { ++ std::string uuid = std::string(prop.uuid.bytes, 16); ++ ++ size_t pos = 0; ++ unsigned long long v = stoull(uuid, &pos, 16); ++ if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) ++ throw std::invalid_argument("invalid uuid"); ++ ++ snprintf(id, sizeof(id), "GPU-%016llx", v); ++ } catch (const std::exception &e) { ++ snprintf(id, sizeof(id), "%d", device_num); ++ } ++ #endif ++ #endif ++ ++ return id; ++} ++ + static ggml_cuda_device_info ggml_cuda_init() { + 
#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: +@@ -261,22 +306,24 @@ static ggml_cuda_device_info ggml_cuda_init() { + info.devices[id].cc += prop.minor * 0x10; + } + } +- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n", ++ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n", + id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, +- device_vmm ? "yes" : "no", prop.warpSize); ++ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str()); + #elif defined(GGML_USE_MUSA) + // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs. + info.devices[id].warp_size = 32; + info.devices[id].smpbo = prop.sharedMemPerBlockOptin; + info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100; + info.devices[id].cc += prop.minor * 0x10; +- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", +- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); ++ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", ++ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ++ ggml_cuda_parse_uuid(prop, id).c_str()); + #else + info.devices[id].smpbo = prop.sharedMemPerBlockOptin; + info.devices[id].cc = 100*prop.major + 10*prop.minor; +- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", +- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); ++ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", ++ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ++ ggml_cuda_parse_uuid(prop, id).c_str()); + #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + } + +@@ -2884,6 +2931,7 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; @@ -35,7 +118,7 @@ index cb0d8528..d6960174 100644 }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t +@@ -2896,6 +2944,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } @@ -47,7 +130,7 @@ index cb0d8528..d6960174 100644 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); -@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -2910,6 +2963,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); @@ -55,47 +138,16 @@ index cb0d8528..d6960174 100644 props->type = ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); -@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -3457,6 +3511,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { + cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; ++ dev_ctx->id = ggml_cuda_parse_uuid(prop, i); -+ #if !defined(GGML_USE_HIP) -+ char id[64]; -+ snprintf(id, sizeof(id), -+ 
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", -+ (unsigned char)prop.uuid.bytes[0], -+ (unsigned char)prop.uuid.bytes[1], -+ (unsigned char)prop.uuid.bytes[2], -+ (unsigned char)prop.uuid.bytes[3], -+ (unsigned char)prop.uuid.bytes[4], -+ (unsigned char)prop.uuid.bytes[5], -+ (unsigned char)prop.uuid.bytes[6], -+ (unsigned char)prop.uuid.bytes[7], -+ (unsigned char)prop.uuid.bytes[8], -+ (unsigned char)prop.uuid.bytes[9], -+ (unsigned char)prop.uuid.bytes[10], -+ (unsigned char)prop.uuid.bytes[11], -+ (unsigned char)prop.uuid.bytes[12], -+ (unsigned char)prop.uuid.bytes[13], -+ (unsigned char)prop.uuid.bytes[14], -+ (unsigned char)prop.uuid.bytes[15] -+ ); -+ dev_ctx->id = id; -+ #else -+ #ifdef _WIN32 -+ char id[16]; -+ snprintf(id, sizeof(id), "%d", i); -+ dev_ctx->id = id; -+ #else -+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16); -+ #endif -+ #endif -+ ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..a9eeebc6 100644 +index 1b56f858c..a9eeebc6a 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 080e7467..496973ad 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -175,6 +175,51 @@ static int ggml_cuda_parse_id(char devName[]) { } #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { + char id[64]; + + #if !defined(GGML_USE_HIP) + snprintf(id, sizeof(id), + "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + (unsigned char)prop.uuid.bytes[0], + (unsigned char)prop.uuid.bytes[1], + (unsigned char)prop.uuid.bytes[2], + (unsigned char)prop.uuid.bytes[3], + (unsigned char)prop.uuid.bytes[4], + (unsigned char)prop.uuid.bytes[5], + (unsigned char)prop.uuid.bytes[6], + (unsigned char)prop.uuid.bytes[7], + (unsigned char)prop.uuid.bytes[8], + (unsigned char)prop.uuid.bytes[9], + (unsigned char)prop.uuid.bytes[10], + (unsigned char)prop.uuid.bytes[11], + (unsigned char)prop.uuid.bytes[12], + (unsigned char)prop.uuid.bytes[13], + (unsigned char)prop.uuid.bytes[14], + (unsigned char)prop.uuid.bytes[15] + ); + #else + #ifdef _WIN32 + snprintf(id, sizeof(id), "%d", device_num); + #else + try { + std::string uuid = std::string(prop.uuid.bytes, 16); + + size_t pos = 0; + unsigned long long v = stoull(uuid, &pos, 16); + if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) + throw std::invalid_argument("invalid uuid"); + + snprintf(id, sizeof(id), "GPU-%016llx", v); + } catch (const std::exception &e) { + snprintf(id, sizeof(id), "%d", device_num); + } + #endif + #endif + + return id; +} + static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -263,22 +308,24 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].cc += prop.minor * 0x10; } } - GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n", + GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n", id, prop.name, prop.gcnArchName, info.devices[id].cc & 
0xffff, - device_vmm ? "yes" : "no", prop.warpSize); + device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str()); #elif defined(GGML_USE_MUSA) // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs. info.devices[id].warp_size = 32; info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100; info.devices[id].cc += prop.minor * 0x10; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + ggml_cuda_parse_uuid(prop, id).c_str()); #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + ggml_cuda_parse_uuid(prop, id).c_str()); #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } @@ -3475,38 +3522,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; - - #if !defined(GGML_USE_HIP) - char id[64]; - snprintf(id, sizeof(id), - "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - (unsigned char)prop.uuid.bytes[0], - (unsigned char)prop.uuid.bytes[1], - (unsigned char)prop.uuid.bytes[2], - (unsigned char)prop.uuid.bytes[3], - (unsigned char)prop.uuid.bytes[4], - (unsigned char)prop.uuid.bytes[5], - (unsigned char)prop.uuid.bytes[6], - (unsigned char)prop.uuid.bytes[7], - (unsigned char)prop.uuid.bytes[8], - (unsigned char)prop.uuid.bytes[9], - (unsigned char)prop.uuid.bytes[10], - (unsigned char)prop.uuid.bytes[11], - (unsigned char)prop.uuid.bytes[12], - (unsigned char)prop.uuid.bytes[13], - (unsigned char)prop.uuid.bytes[14], - (unsigned char)prop.uuid.bytes[15] - ); - dev_ctx->id = id; - #else - #ifdef _WIN32 - char id[16]; - snprintf(id, sizeof(id), "%d", i); - dev_ctx->id = id; - #else - dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16); - #endif - #endif + dev_ctx->id = ggml_cuda_parse_uuid(prop, i); ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface,