diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 455e7c69..10190e04 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -66,6 +66,7 @@ Examples:
 llm/backend/mlx: support the llama architecture
 CONTRIBUTING: provide clarity on good commit messages, and bad
+ docs: simplify manual installation with shorter curl commands
 Bad Examples:
diff --git a/api/types.go b/api/types.go
index 0f99de18..0309ebbe 100644
--- a/api/types.go
+++ b/api/types.go
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
 return ok
 }
-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
 if t == nil || t.Value == nil {
 return false
 }
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
 }
 }
-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
 if t == nil || t.Value == nil {
 return ""
 }
diff --git a/discover/types.go b/discover/types.go
index c5212d94..13a030fd 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
 for _, gpu := range l {
- supportsFA := gpu.Library == "metal" ||
+ supportsFA := gpu.Library == "cpu" ||
+ gpu.Library == "metal" ||
 (gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
 gpu.Library == "rocm"
diff --git a/docs/api.md b/docs/api.md
index 683db357..f11d59ed 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co
 ```json
 {
- "status": "downloading digestname",
+ "status": "pulling digestname",
 "digest": "digestname",
 "total": 2142590208,
 "completed": 241970
 }
 ```
diff --git a/docs/faq.md b/docs/faq.md
index a6ad6f6e..900ffba4 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).
 ## How can I specify the context window size?
-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 ```
@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```
+Setting the context length higher may prevent the model from fitting on the GPU, which will make it run more slowly.
+
 ## How can I tell if my model was loaded onto the GPU?
 Use the `ollama ps` command to see what models are currently loaded into memory.
 ```shell
 ollama ps
 ```
 > **Output**:
 >
 > ```
-> NAME ID SIZE PROCESSOR UNTIL
-> llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
+> NAME ID SIZE PROCESSOR CONTEXT UNTIL
+> gpt-oss:20b 05afbac4bad6 16 GB 100% GPU 8192 4 minutes from now
 > ```
 The `Processor` column will show which memory the model was loaded in to:
@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca ``` -## Does Ollama send my prompts and answers back to ollama.com? +## Does Ollama send my prompts and responses back to ollama.com? -No. Ollama runs locally, and conversation data does not leave your machine. +If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored. + +If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine. ## How can I expose Ollama on my network? @@ -345,4 +349,4 @@ Ollama for Windows and macOS register as a login item during installation. You - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove **MacOS Ventura (v13) and later** -- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable. \ No newline at end of file +- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable. diff --git a/integration/utils_test.go b/integration/utils_test.go index 3d726123..727825a4 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -28,7 +28,7 @@ import ( "github.com/stretchr/testify/require" ) -const ( +var ( smol = "llama3.2:1b" ) @@ -37,6 +37,7 @@ var ( // Note: add newer models at the top of the list to test them first ollamaEngineChatModels = []string{ + "gpt-oss:20b", "gemma3n:e2b", "mistral-small3.2:latest", "deepseek-r1:1.5b", @@ -126,6 +127,7 @@ var ( "gemma3n", "glm4", "goliath", + "gpt-oss:20b", "granite-code", "granite3-dense", "granite3-guardian", @@ -255,8 +257,13 @@ var ( } ) -func Init() { +func init() { lifecycle.InitLogging() + custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL") + if custom != "" { + slog.Info("setting smol test model to " + custom) + smol = custom + } } func FindPort() string { diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch index b7d56b0d..2bd938a3 100644 --- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch +++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch @@ -7,12 +7,12 @@ This enables matching up devices and information reported by the backend with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). 
--- ggml/include/ggml-backend.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++ + ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++--- ggml/src/ggml-metal/ggml-metal.m | 1 + - 3 files changed, 41 insertions(+) + 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 74e46716..48839339 100644 +index 74e467163..48839339d 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -152,6 +152,7 @@ extern "C" { @@ -24,10 +24,93 @@ index 74e46716..48839339 100644 size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..d6960174 100644 +index cb0d8528d..1492368de 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { +@@ -173,6 +173,51 @@ static int ggml_cuda_parse_id(char devName[]) { + } + #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + ++static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { ++ char id[64]; ++ ++ #if !defined(GGML_USE_HIP) ++ snprintf(id, sizeof(id), ++ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ (unsigned char)prop.uuid.bytes[0], ++ (unsigned char)prop.uuid.bytes[1], ++ (unsigned char)prop.uuid.bytes[2], ++ (unsigned char)prop.uuid.bytes[3], ++ (unsigned char)prop.uuid.bytes[4], ++ (unsigned char)prop.uuid.bytes[5], ++ (unsigned char)prop.uuid.bytes[6], ++ (unsigned char)prop.uuid.bytes[7], ++ (unsigned char)prop.uuid.bytes[8], ++ (unsigned char)prop.uuid.bytes[9], ++ (unsigned char)prop.uuid.bytes[10], ++ (unsigned char)prop.uuid.bytes[11], ++ (unsigned char)prop.uuid.bytes[12], ++ (unsigned char)prop.uuid.bytes[13], ++ (unsigned char)prop.uuid.bytes[14], ++ (unsigned char)prop.uuid.bytes[15] ++ ); ++ #else ++ #ifdef _WIN32 ++ snprintf(id, sizeof(id), "%d", device_num); ++ #else ++ try { ++ std::string uuid = std::string(prop.uuid.bytes, 16); ++ ++ size_t pos = 0; ++ unsigned long long v = stoull(uuid, &pos, 16); ++ if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) ++ throw std::invalid_argument("invalid uuid"); ++ ++ snprintf(id, sizeof(id), "GPU-%016llx", v); ++ } catch (const std::exception &e) { ++ snprintf(id, sizeof(id), "%d", device_num); ++ } ++ #endif ++ #endif ++ ++ return id; ++} ++ + static ggml_cuda_device_info ggml_cuda_init() { + #ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: +@@ -261,22 +306,24 @@ static ggml_cuda_device_info ggml_cuda_init() { + info.devices[id].cc += prop.minor * 0x10; + } + } +- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n", ++ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n", + id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, +- device_vmm ? "yes" : "no", prop.warpSize); ++ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str()); + #elif defined(GGML_USE_MUSA) + // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs. + info.devices[id].warp_size = 32; + info.devices[id].smpbo = prop.sharedMemPerBlockOptin; + info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100; + info.devices[id].cc += prop.minor * 0x10; +- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", +- id, prop.name, prop.major, prop.minor, device_vmm ? 
"yes" : "no"); ++ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", ++ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ++ ggml_cuda_parse_uuid(prop, id).c_str()); + #else + info.devices[id].smpbo = prop.sharedMemPerBlockOptin; + info.devices[id].cc = 100*prop.major + 10*prop.minor; +- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", +- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); ++ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", ++ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ++ ggml_cuda_parse_uuid(prop, id).c_str()); + #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + } + +@@ -2884,6 +2931,7 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; @@ -35,7 +118,7 @@ index cb0d8528..d6960174 100644 }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t +@@ -2896,6 +2944,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } @@ -47,7 +130,7 @@ index cb0d8528..d6960174 100644 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); -@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -2910,6 +2963,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); @@ -55,47 +138,16 @@ index cb0d8528..d6960174 100644 props->type = ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); -@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -3457,6 +3511,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { + cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; ++ dev_ctx->id = ggml_cuda_parse_uuid(prop, i); -+ #if !defined(GGML_USE_HIP) -+ char id[64]; -+ snprintf(id, sizeof(id), -+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", -+ (unsigned char)prop.uuid.bytes[0], -+ (unsigned char)prop.uuid.bytes[1], -+ (unsigned char)prop.uuid.bytes[2], -+ (unsigned char)prop.uuid.bytes[3], -+ (unsigned char)prop.uuid.bytes[4], -+ (unsigned char)prop.uuid.bytes[5], -+ (unsigned char)prop.uuid.bytes[6], -+ (unsigned char)prop.uuid.bytes[7], -+ (unsigned char)prop.uuid.bytes[8], -+ (unsigned char)prop.uuid.bytes[9], -+ (unsigned char)prop.uuid.bytes[10], -+ (unsigned char)prop.uuid.bytes[11], -+ (unsigned char)prop.uuid.bytes[12], -+ (unsigned char)prop.uuid.bytes[13], -+ (unsigned char)prop.uuid.bytes[14], -+ (unsigned char)prop.uuid.bytes[15] -+ ); -+ dev_ctx->id = id; -+ #else -+ #ifdef _WIN32 -+ char id[16]; -+ snprintf(id, sizeof(id), "%d", i); -+ dev_ctx->id = id; -+ #else -+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16); -+ #endif -+ #endif -+ ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, diff --git 
a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..a9eeebc6 100644 +index 1b56f858c..a9eeebc6a 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen diff --git a/llama/patches/0026-ggml-No-alloc-mode.patch b/llama/patches/0026-ggml-No-alloc-mode.patch new file mode 100644 index 00000000..2a8dd07e --- /dev/null +++ b/llama/patches/0026-ggml-No-alloc-mode.patch @@ -0,0 +1,99 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Wed, 23 Jul 2025 11:58:49 -0700 +Subject: [PATCH] ggml: No-alloc mode + +Callers can set a backend buffer type to be no-alloc, meaning that +it does not allocate memory for tensors or operations. This can +be used for calculating memory requirements. Tensors and graphs +must be recreated with no-alloc set to false before loading data. + +Defaults to false for newly created backend buffer types. +--- + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-backend-impl.h | 2 ++ + ggml/src/ggml-backend.cpp | 19 ++++++++++++++++++- + 3 files changed, 21 insertions(+), 1 deletion(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 48839339..3903c3cb 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -35,6 +35,7 @@ extern "C" { + // + + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); ++ GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index c36c12d6..81749a5a 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -32,6 +32,7 @@ extern "C" { + struct ggml_backend_buffer_type_i iface; + ggml_backend_dev_t device; + void * context; ++ bool no_alloc; + }; + + // +@@ -63,6 +64,7 @@ extern "C" { + void * context; + size_t size; + enum ggml_backend_buffer_usage usage; ++ bool no_alloc; + }; + + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index be335e8c..84928bc3 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); + } + ++void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) { ++ buft->no_alloc = !alloc; ++} ++ + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + if (size == 0) { + // return a dummy buffer for zero-sized allocations + return ggml_backend_buffer_init(buft, {}, NULL, 0); + } + ++ if (buft->no_alloc) { ++ ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size); ++ buf->no_alloc = true; ++ return buf; ++ } ++ + return buft->iface.alloc_buffer(buft, size); + } + +@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init( + /* .buft = */ buft, + /* .context = */ context, + /* .size = */ size, +- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY ++ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY, ++ /* .no_alloc = */ false + }; + + 
return buffer; +@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return NULL; + } + ++ // If we aren't allocating memory, return a placeholder non-NULL pointer ++ // that meets alignment requirements ++ if (buffer->no_alloc) { ++ return (void *)ggml_backend_buffer_get_alignment(buffer); ++ } ++ + void * base = buffer->iface.get_base(buffer); + + GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); diff --git a/ml/backend.go b/ml/backend.go index fcb7db5e..6e76d32d 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -15,6 +15,9 @@ import ( ) type Backend interface { + // Close frees all memory associated with this backend + Close() + Load(ctx context.Context, progress func(float32)) error // BackendMemory returns the memory allocations that were made for this model diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 15c210dc..aa241e9b 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -19,6 +19,7 @@ import ( "slices" "strconv" "strings" + "sync" "sync/atomic" "unicode" "unsafe" @@ -33,15 +34,33 @@ import ( "golang.org/x/sync/errgroup" ) -func devices() []*C.struct_ggml_backend_device { - ggml.OnceLoad() - ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count()) - for i := range ds { - ds[i] = C.ggml_backend_dev_get(C.size_t(i)) - } +var ( + cpus, accels, gpus []C.ggml_backend_dev_t + backends map[C.ggml_backend_dev_t]C.ggml_backend_t +) - return ds -} +var initDevices = sync.OnceFunc(func() { + ggml.OnceLoad() + + backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t) + for i := range C.ggml_backend_dev_count() { + d := C.ggml_backend_dev_get(i) + + switch C.ggml_backend_dev_type(d) { + case C.GGML_BACKEND_DEVICE_TYPE_CPU: + if len(cpus) == 0 { + // only the first cpu device should be used + cpus = append(cpus, d) + } + case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: + accels = append(accels, d) + case C.GGML_BACKEND_DEVICE_TYPE_GPU: + gpus = append(gpus, d) + } + + backends[d] = C.ggml_backend_dev_init(d, nil) + } +}) type Backend struct { // modelPath is the location of the model data @@ -53,28 +72,31 @@ type Backend struct { // to the name that is used by the model definition tensorLoadTargets map[string][]string - sched *C.struct_ggml_backend_sched - schedBackends []*C.struct_ggml_backend - schedBufts []*C.struct_ggml_backend_buffer_type + sched C.ggml_backend_sched_t + schedBackends []C.ggml_backend_t + schedBufts []C.ggml_backend_buffer_type_t tensors map[string]*C.struct_ggml_tensor // input is the backend used for inputs - input *C.struct_ggml_backend_buffer_type + input C.ggml_backend_buffer_type_t // layers is the backend used for repeating layers - layers map[int]*C.struct_ggml_backend_buffer_type + layers map[int]C.ggml_backend_buffer_type_t // requiredMemory is the cumulative memory allocations needed by the backend requiredMemory *ml.BackendMemory // btDeviceMemory maps from a buffer type to the memory allocations associated with that device - btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory + btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory flashAttention bool // maxGraphNodes is the maximum allowed number of graph nodes in this scheduler maxGraphNodes int + + // weightBuffers are the GGML contexts and buffers for allocating weights + weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t } func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { @@ -99,27 +121,14 @@ func New(modelPath string, params 
ml.BackendParams) (ml.Backend, error) { "num_key_values", len(meta.KV()), ) + initDevices() + var requiredMemory ml.BackendMemory - btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory) + btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory) type deviceBufferType struct { - d *C.struct_ggml_backend_device - bts []*C.struct_ggml_backend_buffer_type - } - - var cpus, accels, gpus []*C.struct_ggml_backend_device - for _, d := range devices() { - switch C.ggml_backend_dev_type(d) { - case C.GGML_BACKEND_DEVICE_TYPE_CPU: - if len(cpus) == 0 { - // only the first cpu device should be used - cpus = append(cpus, d) - } - case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: - accels = append(accels, d) - case C.GGML_BACKEND_DEVICE_TYPE_GPU: - gpus = append(gpus, d) - } + d C.ggml_backend_dev_t + bts []C.ggml_backend_buffer_type_t } blocks := int(meta.KV().BlockCount()) @@ -149,7 +158,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { bt := C.ggml_backend_dev_buffer_type(d) gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ d: d, - bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...), + bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...), }) btDeviceMemory[bt] = &requiredMemory.GPUs[i] requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) @@ -235,16 +244,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { targets := make(map[string][]string) // contexts are shared by tensors of the same buffer type - ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context) - createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor { + ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context) + createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor { for _, bt := range bts { if _, ok := ctxs[bt]; !ok { - // slog.Info("XXX before ggml_init") ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors), no_alloc: true, }) - // slog.Info("XXX after ggml_init") } targets[t.source.Name] = append(targets[t.source.Name], t.target) @@ -332,7 +339,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } // allocate buffers for each context - bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs)) + bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs)) for bt, c := range ctxs { if C.ggml_get_first_tensor(c) == nil { continue @@ -350,6 +357,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } if b == nil { + for _, b := range bbs { + C.ggml_backend_buffer_free(b) + } + + for _, ctx := range ctxs { + C.ggml_free(ctx) + } + panic(ml.ErrNoMem{BackendMemory: requiredMemory}) } @@ -390,13 +405,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } // map devices to backend buffer types so new tensors can be assigned to the correct device - deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type) + deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t) // create backends and buffer types used for the compute graph scheduler - var schedBackends []*C.struct_ggml_backend - var schedBufts []*C.struct_ggml_backend_buffer_type + var schedBackends []C.ggml_backend_t + var schedBufts []C.ggml_backend_buffer_type_t 
for _, d := range append(gpus, append(accels, cpus...)...) { - b := C.ggml_backend_dev_init(d, nil) + b := backends[d] bt := C.ggml_backend_get_default_buffer_type(b) deviceBufferTypes[d] = bt @@ -428,8 +443,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { schedBackends: schedBackends, schedBufts: schedBufts, input: deviceBufferTypes[input.d], - layers: func() map[int]*C.struct_ggml_backend_buffer_type { - m := make(map[int]*C.struct_ggml_backend_buffer_type) + layers: func() map[int]C.ggml_backend_buffer_type_t { + m := make(map[int]C.ggml_backend_buffer_type_t) for i, layer := range layers { m[i] = deviceBufferTypes[layer.d] } @@ -438,6 +453,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { requiredMemory: &requiredMemory, btDeviceMemory: btDeviceMemory, maxGraphNodes: maxGraphNodes, + weightBuffers: bbs, }, nil } @@ -445,6 +461,19 @@ func init() { ml.RegisterBackend("ggml", New) } +func (b *Backend) Close() { + if b == nil { + return + } + + for ctx, b := range b.weightBuffers { + C.ggml_backend_buffer_free(b) + C.ggml_free(ctx) + } + + C.ggml_backend_sched_free(b.sched) +} + func (b *Backend) Load(ctx context.Context, progress func(float32)) error { var doneBytes atomic.Uint64 totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset @@ -541,10 +570,8 @@ func (b *Backend) NewContextSize(n int) ml.Context { panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes)) } - var allocatedBuffers []*C.struct_ggml_backend_buffer + var allocatedBuffers []C.ggml_backend_buffer_t - // slog.Info("XXX before ggml_init") - // defer slog.Info("XXX after ggml_init") return &Context{ b: b, maxGraphNodes: n, @@ -572,11 +599,11 @@ type Context struct { graph *C.struct_ggml_cgraph // buft is the buffer type used for new tensors - buft *C.struct_ggml_backend_buffer_type + buft C.ggml_backend_buffer_type_t // allocatedBuffers are buffers for tensors that we have allocated in this context // so that we can free them when we close the context - allocatedBuffers *[]*C.struct_ggml_backend_buffer + allocatedBuffers *[]C.ggml_backend_buffer_t // maxGraphNodes is the maximum allowed number of graph nodes in this context maxGraphNodes int @@ -1407,55 +1434,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor { return t } - -// TODO - DRY this out with New if possible -func newTestBackend(size int) *Backend { - var cpus []*C.struct_ggml_backend_device - for _, d := range devices() { - switch C.ggml_backend_dev_type(d) { - case C.GGML_BACKEND_DEVICE_TYPE_CPU: - if len(cpus) == 0 { - // only the first cpu device should be used - cpus = append(cpus, d) - break - } - } - } - var schedBackends []*C.struct_ggml_backend - var schedBufts []*C.struct_ggml_backend_buffer_type - b := C.ggml_backend_dev_init(cpus[0], nil) - bt := C.ggml_backend_get_default_buffer_type(b) - C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU()))) - // C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING - schedBackends = append(schedBackends, b) - schedBufts = append(schedBufts, bt) - return &Backend{ - meta: nil, - sched: C.ggml_backend_sched_new( - (*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])), - (*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])), - C.int(len(schedBackends)), - C.size_t(max(8192, size)), - false, - false, - ), - input: bt, - maxGraphNodes: max(8192, size), - schedBackends: schedBackends, - schedBufts: schedBufts, - } -} - -func newTestContext(b 
*Backend, n int) *Context { - n = max(8192, n) - // slog.Info("XXX before ggml_init") - // defer slog.Info("XXX after ggml_init") - return &Context{ - b: b, - maxGraphNodes: n, - ctx: C.ggml_init(C.struct_ggml_init_params{ - mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false), - no_alloc: true, - }), - } -} diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 48839339..3903c3cb 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -35,6 +35,7 @@ extern "C" { // GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index c36c12d6..81749a5a 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -32,6 +32,7 @@ extern "C" { struct ggml_backend_buffer_type_i iface; ggml_backend_dev_t device; void * context; + bool no_alloc; }; // @@ -63,6 +64,7 @@ extern "C" { void * context; size_t size; enum ggml_backend_buffer_usage usage; + bool no_alloc; }; GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index be335e8c..84928bc3 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } +void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) { + buft->no_alloc = !alloc; +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { if (size == 0) { // return a dummy buffer for zero-sized allocations return ggml_backend_buffer_init(buft, {}, NULL, 0); } + if (buft->no_alloc) { + ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size); + buf->no_alloc = true; + return buf; + } + return buft->iface.alloc_buffer(buft, size); } @@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, - /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY, + /* .no_alloc = */ false }; return buffer; @@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return NULL; } + // If we aren't allocating memory, return a placeholder non-NULL pointer + // that meets alignment requirements + if (buffer->no_alloc) { + return (void *)ggml_backend_buffer_get_alignment(buffer); + } + void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 080e7467..496973ad 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -175,6 +175,51 @@ static int ggml_cuda_parse_id(char devName[]) { } #endif // defined(GGML_USE_HIP) 
&& defined(__HIP_PLATFORM_AMD__) +static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { + char id[64]; + + #if !defined(GGML_USE_HIP) + snprintf(id, sizeof(id), + "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + (unsigned char)prop.uuid.bytes[0], + (unsigned char)prop.uuid.bytes[1], + (unsigned char)prop.uuid.bytes[2], + (unsigned char)prop.uuid.bytes[3], + (unsigned char)prop.uuid.bytes[4], + (unsigned char)prop.uuid.bytes[5], + (unsigned char)prop.uuid.bytes[6], + (unsigned char)prop.uuid.bytes[7], + (unsigned char)prop.uuid.bytes[8], + (unsigned char)prop.uuid.bytes[9], + (unsigned char)prop.uuid.bytes[10], + (unsigned char)prop.uuid.bytes[11], + (unsigned char)prop.uuid.bytes[12], + (unsigned char)prop.uuid.bytes[13], + (unsigned char)prop.uuid.bytes[14], + (unsigned char)prop.uuid.bytes[15] + ); + #else + #ifdef _WIN32 + snprintf(id, sizeof(id), "%d", device_num); + #else + try { + std::string uuid = std::string(prop.uuid.bytes, 16); + + size_t pos = 0; + unsigned long long v = stoull(uuid, &pos, 16); + if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) + throw std::invalid_argument("invalid uuid"); + + snprintf(id, sizeof(id), "GPU-%016llx", v); + } catch (const std::exception &e) { + snprintf(id, sizeof(id), "%d", device_num); + } + #endif + #endif + + return id; +} + static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -263,22 +308,24 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].cc += prop.minor * 0x10; } } - GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n", + GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n", id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, - device_vmm ? "yes" : "no", prop.warpSize); + device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str()); #elif defined(GGML_USE_MUSA) // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs. info.devices[id].warp_size = 32; info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100; info.devices[id].cc += prop.minor * 0x10; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + ggml_cuda_parse_uuid(prop, id).c_str()); #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", - id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + id, prop.name, prop.major, prop.minor, device_vmm ? 
"yes" : "no", + ggml_cuda_parse_uuid(prop, id).c_str()); #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } @@ -3475,38 +3522,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; - - #if !defined(GGML_USE_HIP) - char id[64]; - snprintf(id, sizeof(id), - "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - (unsigned char)prop.uuid.bytes[0], - (unsigned char)prop.uuid.bytes[1], - (unsigned char)prop.uuid.bytes[2], - (unsigned char)prop.uuid.bytes[3], - (unsigned char)prop.uuid.bytes[4], - (unsigned char)prop.uuid.bytes[5], - (unsigned char)prop.uuid.bytes[6], - (unsigned char)prop.uuid.bytes[7], - (unsigned char)prop.uuid.bytes[8], - (unsigned char)prop.uuid.bytes[9], - (unsigned char)prop.uuid.bytes[10], - (unsigned char)prop.uuid.bytes[11], - (unsigned char)prop.uuid.bytes[12], - (unsigned char)prop.uuid.bytes[13], - (unsigned char)prop.uuid.bytes[14], - (unsigned char)prop.uuid.bytes[15] - ); - dev_ctx->id = id; - #else - #ifdef _WIN32 - char id[16]; - snprintf(id, sizeof(id), "%d", i); - dev_ctx->id = id; - #else - dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16); - #endif - #endif + dev_ctx->id = ggml_cuda_parse_uuid(prop, i); ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, diff --git a/openai/openai.go b/openai/openai.go index d065de8f..13b9c425 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -34,10 +34,12 @@ type ErrorResponse struct { } type Message struct { - Role string `json:"role"` - Content any `json:"content"` - Reasoning string `json:"reasoning,omitempty"` - ToolCalls []ToolCall `json:"tool_calls,omitempty"` + Role string `json:"role"` + Content any `json:"content"` + Reasoning string `json:"reasoning,omitempty"` + ToolCalls []ToolCall `json:"tool_calls,omitempty"` + Name string `json:"name,omitempty"` + ToolCallID string `json:"tool_call_id,omitempty"` } type Choice struct { @@ -101,6 +103,7 @@ type ChatCompletionRequest struct { ResponseFormat *ResponseFormat `json:"response_format"` Tools []api.Tool `json:"tools"` Reasoning *Reasoning `json:"reasoning,omitempty"` + ReasoningEffort *string `json:"reasoning_effort,omitempty"` } type ChatCompletion struct { @@ -401,9 +404,20 @@ func toModel(r api.ShowResponse, m string) Model { func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var messages []api.Message for _, msg := range r.Messages { + toolName := "" + if strings.ToLower(msg.Role) == "tool" { + toolName = msg.Name + if toolName == "" && msg.ToolCallID != "" { + toolName = nameFromToolCallID(r.Messages, msg.ToolCallID) + } + } switch content := msg.Content.(type) { case string: - messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning}) + toolCalls, err := fromCompletionToolCall(msg.ToolCalls) + if err != nil { + return nil, err + } + messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName}) case []any: for _, c := range content { data, ok := c.(map[string]any) @@ -454,7 +468,21 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { return nil, errors.New("invalid message format") } } + // since we might have added multiple messages above, if we have tools + // calls we'll add them to the last message + if len(messages) > 0 && len(msg.ToolCalls) > 0 { + toolCalls, err := 
fromCompletionToolCall(msg.ToolCalls) + if err != nil { + return nil, err + } + messages[len(messages)-1].ToolCalls = toolCalls + if toolName != "" { + messages[len(messages)-1].ToolName = toolName + } + messages[len(messages)-1].Thinking = msg.Reasoning + } default: + // content is only optional if tool calls are present if msg.ToolCalls == nil { return nil, fmt.Errorf("invalid message content type: %T", content) } @@ -467,7 +495,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { return nil, errors.New("invalid tool call arguments") } } - messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls}) + messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls}) } } @@ -514,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { options["top_p"] = 1.0 } - if r.Reasoning != nil { - options["reasoning"] = *r.Reasoning.Effort - } - var format json.RawMessage if r.ResponseFormat != nil { switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) { @@ -533,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var think *api.ThinkValue if r.Reasoning != nil { + options["reasoning"] = *r.Reasoning.Effort think = &api.ThinkValue{ Value: *r.Reasoning.Effort, } + } else if r.ReasoningEffort != nil { + options["reasoning"] = *r.ReasoningEffort + think = &api.ThinkValue{ + Value: *r.ReasoningEffort, + } } return &api.ChatRequest{ @@ -549,6 +579,33 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { }, nil } +func nameFromToolCallID(messages []Message, toolCallID string) string { + // iterate backwards to be more resilient to duplicate tool call IDs (this + // follows "last one wins") + for i := len(messages) - 1; i >= 0; i-- { + msg := messages[i] + for _, tc := range msg.ToolCalls { + if tc.ID == toolCallID { + return tc.Function.Name + } + } + } + return "" +} + +func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) { + apiToolCalls := make([]api.ToolCall, len(toolCalls)) + for i, tc := range toolCalls { + apiToolCalls[i].Function.Name = tc.Function.Name + err := json.Unmarshal([]byte(tc.Function.Arguments), &apiToolCalls[i].Function.Arguments) + if err != nil { + return nil, errors.New("invalid tool call arguments") + } + } + + return apiToolCalls, nil +} + func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { options := make(map[string]any) diff --git a/openai/openai_test.go b/openai/openai_test.go index 471b4737..0d7f016b 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -235,6 +235,210 @@ func TestChatMiddleware(t *testing.T) { Stream: &False, }, }, + { + name: "chat handler with tools and content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + Content: "Let's see what the weather is like in Paris", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, 
France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "chat handler with tools and empty content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "chat handler with tools and thinking content", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + Thinking: "Let's see what the weather is like in Paris", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "tool response with call ID", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}, + {"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + { + Role: "tool", + Content: "The weather in Paris is 20 degrees Celsius", + ToolName: "get_current_weather", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "tool response with name", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}, + {"role": "tool", "name": "get_current_weather", "content": "The 
weather in Paris is 20 degrees Celsius"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]any{ + "location": "Paris, France", + "format": "celsius", + }, + }, + }, + }, + }, + { + Role: "tool", + Content: "The weather in Paris is 20 degrees Celsius", + ToolName: "get_current_weather", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, { name: "chat handler with streaming tools", body: `{ diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 43880a41..8c8a29d8 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -70,6 +70,10 @@ func kvCacheTypeFromStr(s string) ml.DType { } func (c *InputCache) Close() { + if c == nil { + return + } + c.cache.Close() } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a7a889f1..cebe30de 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -877,6 +877,15 @@ func (s *Server) load( ) { err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache) if err != nil { + var noMem ml.ErrNoMem + if errors.As(err, &noMem) { + // We can't yet handle this but in the future we will + s.cache.Close() + if s.model != nil { + s.model.Backend().Close() + } + } + panic(err) } diff --git a/server/prompt.go b/server/prompt.go index 5d6c3e27..f1d8020e 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. thinkVal := false thinkLevel := "" if think != nil { - thinkVal = think.AsBool() - thinkLevel = think.AsString() + thinkVal = think.Bool() + thinkLevel = think.String() } var b bytes.Buffer if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil { @@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
thinkVal := false thinkLevel := "" if think != nil { - thinkVal = think.AsBool() - thinkLevel = think.AsString() + thinkVal = think.Bool() + thinkLevel = think.String() } if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil { return "", nil, err diff --git a/server/routes.go b/server/routes.go index 991e9200..3c044cd0 100644 --- a/server/routes.go +++ b/server/routes.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" @@ -50,11 +51,16 @@ func experimentEnabled(name string) bool { var useClient2 = experimentEnabled("client2") +// Low VRAM mode is based on the sum of total VRAM (not free) and triggers +// reduced context length on some models +var lowVRAMThreshold uint64 = 20 * format.GibiByte + var mode string = gin.DebugMode type Server struct { - addr net.Addr - sched *Scheduler + addr net.Addr + sched *Scheduler + lowVRAM bool } func init() { @@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } - // This model requires a minimum context to function effectively - if slices.Contains(model.Config.ModelFamilies, "gptoss") { + // This model is much more capable with a larger context, so set that + // unless it would penalize performance too much + if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") { opts.NumCtx = max(opts.NumCtx, 8192) } @@ -198,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { // Validate Think value: string values currently only allowed for gptoss models if req.Think != nil && req.Think.IsString() && !useHarmony { - c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())}) + c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())}) return } @@ -206,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { if req.Suffix != "" { caps = append(caps, model.CapabilityInsert) } - if req.Think != nil && req.Think.AsBool() { + if req.Think != nil && req.Think.Bool() { caps = append(caps, model.CapabilityThinking) // TODO(drifkin): consider adding a warning if it's false and the model // doesn't support thinking. 
It's not strictly required, but it can be a @@ -281,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) } - values.Think = req.Think != nil && req.Think.AsBool() + values.Think = req.Think != nil && req.Think.Bool() values.ThinkLevel = "" if req.Think != nil { - values.ThinkLevel = req.Think.AsString() + values.ThinkLevel = req.Think.String() } values.IsThinkSet = req.Think != nil @@ -310,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { var thinkingState *thinking.Parser if !useHarmony { openingTag, closingTag := thinking.InferTags(m.Template.Template) - if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" { + if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" { thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, @@ -364,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) { *toolName = strings.TrimPrefix(*toolName, "functions.") var args api.ToolCallFunctionArguments if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - ch <- gin.H{"error parsing tool call": err.Error()} + errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) + ch <- gin.H{"error": errStr} return } @@ -1382,6 +1390,15 @@ func Serve(ln net.Listener) error { gpus := discover.GetGPUInfo() gpus.LogDetails() + var totalVRAM uint64 + for _, gpu := range gpus { + totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead() + } + if totalVRAM < lowVRAMThreshold { + s.lowVRAM = true + slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold)) + } + err = srvr.Serve(ln) // If server is closed from the signal handler, wait for the ctx to be done // otherwise error out quickly @@ -1530,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) { if len(req.Tools) > 0 { caps = append(caps, model.CapabilityTools) } - if req.Think != nil && req.Think.AsBool() { + if req.Think != nil && req.Think.Bool() { caps = append(caps, model.CapabilityThinking) } @@ -1584,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) { // Validate Think value: string values currently only allowed for gptoss models if req.Think != nil && req.Think.IsString() && !useHarmony { - c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())}) + c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())}) return } @@ -1603,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) { var thinkingState *thinking.Parser openingTag, closingTag := thinking.InferTags(m.Template.Template) - if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" { + if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" { thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, @@ -1655,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) { *toolName = strings.TrimPrefix(*toolName, "functions.") var args api.ToolCallFunctionArguments if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - ch <- gin.H{"error parsing tool call": err.Error()} + errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) + ch <- gin.H{"error": errStr} return } res.Message.ToolCalls = []api.ToolCall{{Function: 
api.ToolCallFunction{Name: *toolName, Arguments: args}}}
diff --git a/server/sched.go b/server/sched.go
index 2842bb3a..40e6e5f7 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
- var estimatedVRAM uint64
-
 var numParallelToTry []int
 if *numParallel <= 0 {
 // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@
 }
 for _, gl := range gpus.ByLibrary() {
- var ok bool
 sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 // TODO - potentially sort by performance capability, existing models loaded, etc.
 // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
- // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+ // Note: at present, this favors the GPUs with the most free VRAM (sorted descending) and ignores faster GPU speed in mixed setups
 sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
- // First attempt to fit the model into a single GPU
- for _, p := range numParallelToTry {
- req.opts.NumCtx = req.origNumCtx * p
- if !envconfig.SchedSpread() {
- for _, g := range sgl {
- if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
- slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+ if !envconfig.SchedSpread() {
+ for _, p := range numParallelToTry {
+ req.opts.NumCtx = req.origNumCtx * p
+ // Try to pack into as few GPUs as possible, starting from 1 GPU
+ for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+ gpuSubset := sgl[:numGPUs]
+ ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+ if ok {
+ slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+ "model", req.model.ModelPath,
+ "library", sgl[0].Library,
+ "parallel", p,
+ "required", format.HumanBytes2(estimatedVRAM),
+ "gpus", numGPUs)
 *numParallel = p
- return []discover.GpuInfo{g}
+ return gpuSubset
 }
 }
 }
- }
+ } else {
+ // TODO future refinements
+ // - if multiple Libraries, see if any single GPU in any Library will fit
+ // - try subsets of GPUs instead of just falling back to 1 or all in a family
- // TODO future refinements
- // - if multiple Libraries, see if any single GPU in any Library will fit
- // - try subsets of GPUs instead of just falling back to 1 or all in a family
-
- // Now try all the GPUs
- for _, p := range numParallelToTry {
- req.opts.NumCtx = req.origNumCtx * p
- if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
- slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
- *numParallel = p
- return sgl
+ // Now try all the GPUs (OLLAMA_SCHED_SPREAD is set)
+ for _, p := range numParallelToTry {
+ req.opts.NumCtx = req.origNumCtx * p
+ if ok, estimatedVRAM
:= llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { + slog.Info("new model will fit in available VRAM, loading", + "model", req.model.ModelPath, + "library", sgl[0].Library, + "parallel", p, + "required", format.HumanBytes2(estimatedVRAM), + "gpus", len(sgl)) + *numParallel = p + return sgl + } } } }