Merge branch 'ollama:main' into main

likelovewant
2025-08-13 12:36:50 +08:00
committed by GitHub
21 changed files with 729 additions and 249 deletions


@@ -66,6 +66,7 @@ Examples:
llm/backend/mlx: support the llama architecture
CONTRIBUTING: provide clarity on good commit messages, and bad
docs: simplify manual installation with shorter curl commands
Bad Examples:


@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
return ok
}
-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
if t == nil || t.Value == nil {
return false
}
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
}
}
-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
if t == nil || t.Value == nil {
return ""
}


@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
-supportsFA := gpu.Library == "metal" ||
+supportsFA := gpu.Library == "cpu" ||
gpu.Library == "metal" ||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
gpu.Library == "rocm"


@@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co
```json
{
-"status": "downloading digestname",
+"status": "pulling digestname",
"digest": "digestname",
"total": 2142590208,
"completed": 241970


@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
```shell
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
}'
```
Setting the context length higher may cause the model to not fit onto the GPU, which makes the model run more slowly.
## How can I tell if my model was loaded onto the GPU?
Use the `ollama ps` command to see what models are currently loaded into memory.
@@ -57,8 +59,8 @@ ollama ps
> **Output**:
>
> ```
-> NAME          ID              SIZE     PROCESSOR    UNTIL
-> llama3:70b    bcfb190ca3a7    42 GB    100% GPU     4 minutes from now
+> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
+> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
> ```
The `Processor` column will show which memory the model was loaded in to:
@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
```
-## Does Ollama send my prompts and answers back to ollama.com?
+## Does Ollama send my prompts and responses back to ollama.com?
-No. Ollama runs locally, and conversation data does not leave your machine.
+If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
## How can I expose Ollama on my network?
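To complement the `OLLAMA_CONTEXT_LENGTH` example above, here is a minimal sketch using the official Go client to raise the context window for a single request; the model name is illustrative:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Per-request override of the context window; the server-wide default
	// is still governed by OLLAMA_CONTEXT_LENGTH or the App settings.
	req := &api.GenerateRequest{
		Model:  "llama3.2", // illustrative model name
		Prompt: "Summarize this document: ...",
		Options: map[string]any{
			"num_ctx": 8192,
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```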


@@ -28,7 +28,7 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
const ( var (
smol = "llama3.2:1b" smol = "llama3.2:1b"
) )
@@ -37,6 +37,7 @@ var (
// Note: add newer models at the top of the list to test them first // Note: add newer models at the top of the list to test them first
ollamaEngineChatModels = []string{ ollamaEngineChatModels = []string{
"gpt-oss:20b",
"gemma3n:e2b", "gemma3n:e2b",
"mistral-small3.2:latest", "mistral-small3.2:latest",
"deepseek-r1:1.5b", "deepseek-r1:1.5b",
@@ -126,6 +127,7 @@ var (
"gemma3n", "gemma3n",
"glm4", "glm4",
"goliath", "goliath",
"gpt-oss:20b",
"granite-code", "granite-code",
"granite3-dense", "granite3-dense",
"granite3-guardian", "granite3-guardian",
@@ -255,8 +257,13 @@ var (
} }
) )
func Init() { func init() {
lifecycle.InitLogging() lifecycle.InitLogging()
custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
if custom != "" {
slog.Info("setting smol test model to " + custom)
smol = custom
}
} }
func FindPort() string { func FindPort() string {
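A hedged usage note: assuming the suite is still driven by the `integration` build tag, the new `OLLAMA_TEST_SMOL_MODEL` override could be exercised like this (the model tag is only an example):

```shell
OLLAMA_TEST_SMOL_MODEL=qwen2.5:0.5b go test -tags=integration ./integration/...
```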


@@ -7,12 +7,12 @@ This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
-ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++
+ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.m | 1 +
-3 files changed, 41 insertions(+)
+3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..48839339 100644
+index 74e467163..48839339d 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
@@ -24,43 +24,17 @@ index 74e46716..48839339 100644
size_t memory_total;
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..d6960174 100644
+index cb0d8528d..1492368de 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
+@@ -173,6 +173,51 @@ static int ggml_cuda_parse_id(char devName[]) {
int device;
std::string name;
std::string description;
+ std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
} }
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
++static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->id.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char id[64];
+
+ #if !defined(GGML_USE_HIP)
+ snprintf(id, sizeof(id),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
@@ -80,22 +54,100 @@ index cb0d8528..d6960174 100644
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->id = id;
+ #else
+ #ifdef _WIN32
-+ char id[16];
++ snprintf(id, sizeof(id), "%d", device_num);
+ snprintf(id, sizeof(id), "%d", i);
+ dev_ctx->id = id;
+ #else
-+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
++ try {
+ std::string uuid = std::string(prop.uuid.bytes, 16);
+
+ size_t pos = 0;
+ unsigned long long v = stoull(uuid, &pos, 16);
+ if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-'))
+ throw std::invalid_argument("invalid uuid");
+
+ snprintf(id, sizeof(id), "GPU-%016llx", v);
+ } catch (const std::exception &e) {
+ snprintf(id, sizeof(id), "%d", device_num);
+ }
+ #endif
+ #endif
+
+ return id;
+}
+
static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -261,22 +306,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
- device_vmm ? "yes" : "no", prop.warpSize);
+ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
@@ -2884,6 +2931,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2944,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->id.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2963,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3457,6 +3511,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..a9eeebc6 100644
+index 1b56f858c..a9eeebc6a 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
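To illustrate the identifier the reworked patch emits, and that tools such as `nvidia-smi -L` or NVML report, here is a small Go sketch that formats 16 raw UUID bytes the same way the CUDA path above does; it is an illustration, not code from the patch:

```go
package main

import "fmt"

// formatGPUID mirrors the snprintf in the patch: 16 UUID bytes become
// "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", the form printed by nvidia-smi.
func formatGPUID(uuid [16]byte) string {
	return fmt.Sprintf("GPU-%x-%x-%x-%x-%x",
		uuid[0:4], uuid[4:6], uuid[6:8], uuid[8:10], uuid[10:16])
}

func main() {
	example := [16]byte{
		0xd5, 0x6c, 0x55, 0xd1, 0x8b, 0x3e, 0x4d, 0xf9,
		0xb8, 0xa7, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab,
	}
	fmt.Println(formatGPUID(example)) // GPU-d56c55d1-8b3e-4df9-b8a7-0123456789ab
}
```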


@@ -0,0 +1,99 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 23 Jul 2025 11:58:49 -0700
Subject: [PATCH] ggml: No-alloc mode
Callers can set a backend buffer type to be no-alloc, meaning that
it does not allocate memory for tensors or operations. This can
be used for calculating memory requirements. Tensors and graphs
must be recreated with no-alloc set to false before loading data.
Defaults to false for newly created backend buffer types.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 2 ++
ggml/src/ggml-backend.cpp | 19 ++++++++++++++++++-
3 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 48839339..3903c3cb 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -35,6 +35,7 @@ extern "C" {
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d6..81749a5a 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -32,6 +32,7 @@ extern "C" {
struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device;
void * context;
+ bool no_alloc;
};
//
@@ -63,6 +64,7 @@ extern "C" {
void * context;
size_t size;
enum ggml_backend_buffer_usage usage;
+ bool no_alloc;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index be335e8c..84928bc3 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name(buft);
}
+void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
+ buft->no_alloc = !alloc;
+}
+
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}
+ if (buft->no_alloc) {
+ ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+ buf->no_alloc = true;
+ return buf;
+ }
+
return buft->iface.alloc_buffer(buft, size);
}
@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+ /* .no_alloc = */ false
};
return buffer;
@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}
+ // If we aren't allocating memory, return a placeholder non-NULL pointer
+ // that meets alignment requirements
+ if (buffer->no_alloc) {
+ return (void *)ggml_backend_buffer_get_alignment(buffer);
+ }
+
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
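The measure-then-allocate flow described in the commit message can be sketched outside of C. The following is a hedged Go paraphrase of the mechanism above (the real API is `ggml_backend_buft_set_alloc` together with the `no_alloc` flags added in this patch); it only illustrates the idea:

```go
package main

import "fmt"

// Buffer records its size; in no-alloc mode no backing memory is reserved,
// mirroring how ggml_backend_buft_alloc_buffer now returns a dummy buffer.
type Buffer struct {
	size    int
	data    []byte // nil when created in no-alloc mode
	noAlloc bool
}

// BufferType is a stand-in for ggml_backend_buffer_type_t with the new flag.
type BufferType struct {
	noAlloc bool
}

func (bt *BufferType) SetAlloc(alloc bool) { bt.noAlloc = !alloc }

func (bt *BufferType) AllocBuffer(size int) *Buffer {
	if bt.noAlloc {
		// Dry run: record the size only, so callers can total up requirements.
		return &Buffer{size: size, noAlloc: true}
	}
	return &Buffer{size: size, data: make([]byte, size)}
}

func main() {
	bt := &BufferType{}
	bt.SetAlloc(false) // measurement pass

	var required int
	for _, sz := range []int{1 << 20, 4 << 20, 16 << 20} {
		required += bt.AllocBuffer(sz).size
	}
	fmt.Printf("would need %d bytes\n", required)

	// Tensors and graphs must then be recreated with alloc enabled
	// before any data is loaded, as the commit message notes.
	bt.SetAlloc(true)
}
```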


@@ -15,6 +15,9 @@ import (
)
type Backend interface {
// Close frees all memory associated with this backend
Close()
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model


@@ -19,6 +19,7 @@ import (
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"sync"
"sync/atomic" "sync/atomic"
"unicode" "unicode"
"unsafe" "unsafe"
@@ -33,15 +34,33 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
) )
func devices() []*C.struct_ggml_backend_device { var (
cpus, accels, gpus []C.ggml_backend_dev_t
backends map[C.ggml_backend_dev_t]C.ggml_backend_t
)
var initDevices = sync.OnceFunc(func() {
ggml.OnceLoad() ggml.OnceLoad()
ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
for i := range ds { backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
ds[i] = C.ggml_backend_dev_get(C.size_t(i)) for i := range C.ggml_backend_dev_count() {
d := C.ggml_backend_dev_get(i)
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
if len(cpus) == 0 {
// only the first cpu device should be used
cpus = append(cpus, d)
}
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
accels = append(accels, d)
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
gpus = append(gpus, d)
} }
return ds backends[d] = C.ggml_backend_dev_init(d, nil)
} }
})
type Backend struct { type Backend struct {
// modelPath is the location of the model data // modelPath is the location of the model data
@@ -53,28 +72,31 @@ type Backend struct {
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
-sched *C.struct_ggml_backend_sched
+sched C.ggml_backend_sched_t
-schedBackends []*C.struct_ggml_backend
+schedBackends []C.ggml_backend_t
-schedBufts []*C.struct_ggml_backend_buffer_type
+schedBufts []C.ggml_backend_buffer_type_t
tensors map[string]*C.struct_ggml_tensor
// input is the backend used for inputs
-input *C.struct_ggml_backend_buffer_type
+input C.ggml_backend_buffer_type_t
// layers is the backend used for repeating layers
-layers map[int]*C.struct_ggml_backend_buffer_type
+layers map[int]C.ggml_backend_buffer_type_t
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
-btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
+btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory
flashAttention bool
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
maxGraphNodes int
// weightBuffers are the GGML contexts and buffers for allocating weights
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
@@ -99,27 +121,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
"num_key_values", len(meta.KV()), "num_key_values", len(meta.KV()),
) )
initDevices()
var requiredMemory ml.BackendMemory var requiredMemory ml.BackendMemory
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory) btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)
type deviceBufferType struct { type deviceBufferType struct {
d *C.struct_ggml_backend_device d C.ggml_backend_dev_t
bts []*C.struct_ggml_backend_buffer_type bts []C.ggml_backend_buffer_type_t
}
var cpus, accels, gpus []*C.struct_ggml_backend_device
for _, d := range devices() {
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
if len(cpus) == 0 {
// only the first cpu device should be used
cpus = append(cpus, d)
}
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
accels = append(accels, d)
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
gpus = append(gpus, d)
}
}
blocks := int(meta.KV().BlockCount())
@@ -149,7 +158,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bt := C.ggml_backend_dev_buffer_type(d)
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
d: d,
-bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
+bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
})
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
@@ -235,16 +244,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
targets := make(map[string][]string)
// contexts are shared by tensors of the same buffer type
-ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
+ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
-createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
+createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
for _, bt := range bts {
if _, ok := ctxs[bt]; !ok {
// slog.Info("XXX before ggml_init")
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
no_alloc: true,
})
// slog.Info("XXX after ggml_init")
}
targets[t.source.Name] = append(targets[t.source.Name], t.target)
@@ -332,7 +339,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
// allocate buffers for each context
-bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
+bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
for bt, c := range ctxs {
if C.ggml_get_first_tensor(c) == nil {
continue
@@ -350,6 +357,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
if b == nil {
for _, b := range bbs {
C.ggml_backend_buffer_free(b)
}
for _, ctx := range ctxs {
C.ggml_free(ctx)
}
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
}
@@ -390,13 +405,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
// map devices to backend buffer types so new tensors can be assigned to the correct device
-deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
+deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)
// create backends and buffer types used for the compute graph scheduler
-var schedBackends []*C.struct_ggml_backend
+var schedBackends []C.ggml_backend_t
-var schedBufts []*C.struct_ggml_backend_buffer_type
+var schedBufts []C.ggml_backend_buffer_type_t
for _, d := range append(gpus, append(accels, cpus...)...) {
-b := C.ggml_backend_dev_init(d, nil)
+b := backends[d]
bt := C.ggml_backend_get_default_buffer_type(b)
deviceBufferTypes[d] = bt
@@ -428,8 +443,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
-layers: func() map[int]*C.struct_ggml_backend_buffer_type {
+layers: func() map[int]C.ggml_backend_buffer_type_t {
-m := make(map[int]*C.struct_ggml_backend_buffer_type)
+m := make(map[int]C.ggml_backend_buffer_type_t)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
}
@@ -438,6 +453,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
weightBuffers: bbs,
}, nil
}
@@ -445,6 +461,19 @@ func init() {
ml.RegisterBackend("ggml", New)
}
func (b *Backend) Close() {
if b == nil {
return
}
for ctx, b := range b.weightBuffers {
C.ggml_backend_buffer_free(b)
C.ggml_free(ctx)
}
C.ggml_backend_sched_free(b.sched)
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
@@ -541,10 +570,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
}
-var allocatedBuffers []*C.struct_ggml_backend_buffer
+var allocatedBuffers []C.ggml_backend_buffer_t
// slog.Info("XXX before ggml_init")
// defer slog.Info("XXX after ggml_init")
return &Context{
b: b,
maxGraphNodes: n,
@@ -572,11 +599,11 @@ type Context struct {
graph *C.struct_ggml_cgraph
// buft is the buffer type used for new tensors
-buft *C.struct_ggml_backend_buffer_type
+buft C.ggml_backend_buffer_type_t
// allocatedBuffers are buffers for tensors that we have allocated in this context
// so that we can free them when we close the context
-allocatedBuffers *[]*C.struct_ggml_backend_buffer
+allocatedBuffers *[]C.ggml_backend_buffer_t
// maxGraphNodes is the maximum allowed number of graph nodes in this context
maxGraphNodes int
@@ -1407,55 +1434,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
return t
}
// TODO - DRY this out with New if possible
func newTestBackend(size int) *Backend {
var cpus []*C.struct_ggml_backend_device
for _, d := range devices() {
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
if len(cpus) == 0 {
// only the first cpu device should be used
cpus = append(cpus, d)
break
}
}
}
var schedBackends []*C.struct_ggml_backend
var schedBufts []*C.struct_ggml_backend_buffer_type
b := C.ggml_backend_dev_init(cpus[0], nil)
bt := C.ggml_backend_get_default_buffer_type(b)
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING
schedBackends = append(schedBackends, b)
schedBufts = append(schedBufts, bt)
return &Backend{
meta: nil,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(max(8192, size)),
false,
false,
),
input: bt,
maxGraphNodes: max(8192, size),
schedBackends: schedBackends,
schedBufts: schedBufts,
}
}
func newTestContext(b *Backend, n int) *Context {
n = max(8192, n)
// slog.Info("XXX before ggml_init")
// defer slog.Info("XXX after ggml_init")
return &Context{
b: b,
maxGraphNodes: n,
ctx: C.ggml_init(C.struct_ggml_init_params{
mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
no_alloc: true,
}),
}
}


@@ -35,6 +35,7 @@ extern "C" {
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);


@@ -32,6 +32,7 @@ extern "C" {
struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device;
void * context;
bool no_alloc;
};
//
@@ -63,6 +64,7 @@ extern "C" {
void * context;
size_t size;
enum ggml_backend_buffer_usage usage;
bool no_alloc;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(


@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name(buft);
}
void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
buft->no_alloc = !alloc;
}
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}
if (buft->no_alloc) {
ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
buf->no_alloc = true;
return buf;
}
return buft->iface.alloc_buffer(buft, size);
}
@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
-/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
+/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
/* .no_alloc = */ false
};
return buffer;
@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}
// If we aren't allocating memory, return a placeholder non-NULL pointer
// that meets alignment requirements
if (buffer->no_alloc) {
return (void *)ggml_backend_buffer_get_alignment(buffer);
}
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");


@@ -175,6 +175,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) {
char id[64];
#if !defined(GGML_USE_HIP)
snprintf(id, sizeof(id),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
(unsigned char)prop.uuid.bytes[0],
(unsigned char)prop.uuid.bytes[1],
(unsigned char)prop.uuid.bytes[2],
(unsigned char)prop.uuid.bytes[3],
(unsigned char)prop.uuid.bytes[4],
(unsigned char)prop.uuid.bytes[5],
(unsigned char)prop.uuid.bytes[6],
(unsigned char)prop.uuid.bytes[7],
(unsigned char)prop.uuid.bytes[8],
(unsigned char)prop.uuid.bytes[9],
(unsigned char)prop.uuid.bytes[10],
(unsigned char)prop.uuid.bytes[11],
(unsigned char)prop.uuid.bytes[12],
(unsigned char)prop.uuid.bytes[13],
(unsigned char)prop.uuid.bytes[14],
(unsigned char)prop.uuid.bytes[15]
);
#else
#ifdef _WIN32
snprintf(id, sizeof(id), "%d", device_num);
#else
try {
std::string uuid = std::string(prop.uuid.bytes, 16);
size_t pos = 0;
unsigned long long v = stoull(uuid, &pos, 16);
if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-'))
throw std::invalid_argument("invalid uuid");
snprintf(id, sizeof(id), "GPU-%016llx", v);
} catch (const std::exception &e) {
snprintf(id, sizeof(id), "%d", device_num);
}
#endif
#endif
return id;
}
static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -263,22 +308,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
-GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
-device_vmm ? "yes" : "no", prop.warpSize);
+device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
-GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
+GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
-id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
-GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
+GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
-id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
@@ -3475,38 +3522,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
#if !defined(GGML_USE_HIP)
char id[64];
snprintf(id, sizeof(id),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
(unsigned char)prop.uuid.bytes[0],
(unsigned char)prop.uuid.bytes[1],
(unsigned char)prop.uuid.bytes[2],
(unsigned char)prop.uuid.bytes[3],
(unsigned char)prop.uuid.bytes[4],
(unsigned char)prop.uuid.bytes[5],
(unsigned char)prop.uuid.bytes[6],
(unsigned char)prop.uuid.bytes[7],
(unsigned char)prop.uuid.bytes[8],
(unsigned char)prop.uuid.bytes[9],
(unsigned char)prop.uuid.bytes[10],
(unsigned char)prop.uuid.bytes[11],
(unsigned char)prop.uuid.bytes[12],
(unsigned char)prop.uuid.bytes[13],
(unsigned char)prop.uuid.bytes[14],
(unsigned char)prop.uuid.bytes[15]
);
dev_ctx->id = id;
#else
#ifdef _WIN32
char id[16];
snprintf(id, sizeof(id), "%d", i);
dev_ctx->id = id;
#else
dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
#endif
#endif
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,


@@ -38,6 +38,8 @@ type Message struct {
Content any `json:"content"`
Reasoning string `json:"reasoning,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Name string `json:"name,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}
type Choice struct {
@@ -101,6 +103,7 @@ type ChatCompletionRequest struct {
ResponseFormat *ResponseFormat `json:"response_format"`
Tools []api.Tool `json:"tools"`
Reasoning *Reasoning `json:"reasoning,omitempty"`
ReasoningEffort *string `json:"reasoning_effort,omitempty"`
}
type ChatCompletion struct {
@@ -401,9 +404,20 @@ func toModel(r api.ShowResponse, m string) Model {
func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var messages []api.Message
for _, msg := range r.Messages {
toolName := ""
if strings.ToLower(msg.Role) == "tool" {
toolName = msg.Name
if toolName == "" && msg.ToolCallID != "" {
toolName = nameFromToolCallID(r.Messages, msg.ToolCallID)
}
}
switch content := msg.Content.(type) {
case string:
-messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning})
+toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
if err != nil {
return nil, err
}
messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName})
case []any:
for _, c := range content {
data, ok := c.(map[string]any)
@@ -454,7 +468,21 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
return nil, errors.New("invalid message format")
}
}
// since we might have added multiple messages above, if we have tools
// calls we'll add them to the last message
if len(messages) > 0 && len(msg.ToolCalls) > 0 {
toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
if err != nil {
return nil, err
}
messages[len(messages)-1].ToolCalls = toolCalls
if toolName != "" {
messages[len(messages)-1].ToolName = toolName
}
messages[len(messages)-1].Thinking = msg.Reasoning
}
default:
// content is only optional if tool calls are present
if msg.ToolCalls == nil {
return nil, fmt.Errorf("invalid message content type: %T", content)
}
@@ -467,7 +495,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
return nil, errors.New("invalid tool call arguments")
}
}
-messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
+messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls})
}
}
@@ -514,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
options["top_p"] = 1.0
}
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
}
var format json.RawMessage
if r.ResponseFormat != nil {
switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -533,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}
}
return &api.ChatRequest{
@@ -549,6 +579,33 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
}, nil
}
func nameFromToolCallID(messages []Message, toolCallID string) string {
// iterate backwards to be more resilient to duplicate tool call IDs (this
// follows "last one wins")
for i := len(messages) - 1; i >= 0; i-- {
msg := messages[i]
for _, tc := range msg.ToolCalls {
if tc.ID == toolCallID {
return tc.Function.Name
}
}
}
return ""
}
func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
apiToolCalls := make([]api.ToolCall, len(toolCalls))
for i, tc := range toolCalls {
apiToolCalls[i].Function.Name = tc.Function.Name
err := json.Unmarshal([]byte(tc.Function.Arguments), &apiToolCalls[i].Function.Arguments)
if err != nil {
return nil, errors.New("invalid tool call arguments")
}
}
return apiToolCalls, nil
}
func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
options := make(map[string]any)
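As an illustration of how the new fields are consumed (the body below is an assumed example, not taken from this change): `reasoning_effort` feeds the same `reasoning`/`think` plumbing as the nested `reasoning.effort` form, and a `tool` message's name is taken from its `name` field when present, otherwise resolved from `tool_call_id` via `nameFromToolCallID`.

```json
{
  "model": "gpt-oss:20b",
  "reasoning_effort": "high",
  "messages": [
    {"role": "user", "content": "What's the weather like in Paris today?"},
    {"role": "assistant", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\"}"}}]},
    {"role": "tool", "tool_call_id": "call_1", "content": "20 degrees Celsius"}
  ]
}
```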


@@ -235,6 +235,210 @@ func TestChatMiddleware(t *testing.T) {
Stream: &False,
},
},
{
name: "chat handler with tools and content",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris Today?"},
{"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris Today?",
},
{
Role: "assistant",
Content: "Let's see what the weather is like in Paris",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: map[string]any{
"location": "Paris, France",
"format": "celsius",
},
},
},
},
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "chat handler with tools and empty content",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris Today?"},
{"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris Today?",
},
{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: map[string]any{
"location": "Paris, France",
"format": "celsius",
},
},
},
},
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "chat handler with tools and thinking content",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris Today?"},
{"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris Today?",
},
{
Role: "assistant",
Thinking: "Let's see what the weather is like in Paris",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: map[string]any{
"location": "Paris, France",
"format": "celsius",
},
},
},
},
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "tool response with call ID",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris Today?"},
{"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
{"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"}
]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris Today?",
},
{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: map[string]any{
"location": "Paris, France",
"format": "celsius",
},
},
},
},
},
{
Role: "tool",
Content: "The weather in Paris is 20 degrees Celsius",
ToolName: "get_current_weather",
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "tool response with name",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris Today?"},
{"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
{"role": "tool", "name": "get_current_weather", "content": "The weather in Paris is 20 degrees Celsius"}
]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris Today?",
},
{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: map[string]any{
"location": "Paris, France",
"format": "celsius",
},
},
},
},
},
{
Role: "tool",
Content: "The weather in Paris is 20 degrees Celsius",
ToolName: "get_current_weather",
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "chat handler with streaming tools",
body: `{


@@ -70,6 +70,10 @@ func kvCacheTypeFromStr(s string) ml.DType {
}
func (c *InputCache) Close() {
if c == nil {
return
}
c.cache.Close()
}


@@ -877,6 +877,15 @@ func (s *Server) load(
) {
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
if err != nil {
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
// We can't yet handle this but in the future we will
s.cache.Close()
if s.model != nil {
s.model.Backend().Close()
}
}
panic(err)
}


@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
thinkVal := false
thinkLevel := ""
if think != nil {
-thinkVal = think.AsBool()
+thinkVal = think.Bool()
-thinkLevel = think.AsString()
+thinkLevel = think.String()
}
var b bytes.Buffer
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
thinkVal := false
thinkLevel := ""
if think != nil {
-thinkVal = think.AsBool()
+thinkVal = think.Bool()
-thinkLevel = think.AsString()
+thinkLevel = think.String()
}
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
return "", nil, err


@@ -30,6 +30,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover" "github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil" "github.com/ollama/ollama/logutil"
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {
var useClient2 = experimentEnabled("client2") var useClient2 = experimentEnabled("client2")
// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
// reduced context length on some models
var lowVRAMThreshold uint64 = 20 * format.GibiByte
var mode string = gin.DebugMode var mode string = gin.DebugMode
type Server struct { type Server struct {
addr net.Addr addr net.Addr
sched *Scheduler sched *Scheduler
lowVRAM bool
} }
func init() { func init() {
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
return nil, nil, nil, err
}
-// This model requires a minimum context to function effectively
+// This model is much more capable with a larger context, so set that
-if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+// unless it would penalize performance too much
if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
opts.NumCtx = max(opts.NumCtx, 8192)
}
@@ -198,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
-c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
@@ -206,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
if req.Suffix != "" {
caps = append(caps, model.CapabilityInsert)
}
-if req.Think != nil && req.Think.AsBool() {
+if req.Think != nil && req.Think.Bool() {
caps = append(caps, model.CapabilityThinking)
// TODO(drifkin): consider adding a warning if it's false and the model
// doesn't support thinking. It's not strictly required, but it can be a
@@ -281,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
}
-values.Think = req.Think != nil && req.Think.AsBool()
+values.Think = req.Think != nil && req.Think.Bool()
values.ThinkLevel = ""
if req.Think != nil {
-values.ThinkLevel = req.Think.AsString()
+values.ThinkLevel = req.Think.String()
}
values.IsThinkSet = req.Think != nil
@@ -310,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
-if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{
OpeningTag: openingTag,
ClosingTag: closingTag,
@@ -364,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
*toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-ch <- gin.H{"error parsing tool call": err.Error()}
+errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
ch <- gin.H{"error": errStr}
return
}
@@ -1382,6 +1390,15 @@ func Serve(ln net.Listener) error {
gpus := discover.GetGPUInfo() gpus := discover.GetGPUInfo()
gpus.LogDetails() gpus.LogDetails()
var totalVRAM uint64
for _, gpu := range gpus {
totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
}
if totalVRAM < lowVRAMThreshold {
s.lowVRAM = true
slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
}
err = srvr.Serve(ln) err = srvr.Serve(ln)
// If server is closed from the signal handler, wait for the ctx to be done // If server is closed from the signal handler, wait for the ctx to be done
// otherwise error out quickly // otherwise error out quickly
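
At startup the server now sums usable VRAM across every discovered GPU and, below a threshold, flips into low-VRAM mode (which in turn suppresses the gpt-oss context bump earlier in this diff). A runnable sketch with placeholder numbers; the real values come from `lowVRAMThreshold` and `envconfig.GpuOverhead()`:

```go
package main

import "fmt"

// isLowVRAM mirrors the shape of the startup check: usable VRAM is the sum of
// each GPU's total memory minus a per-GPU overhead. The underflow guard is an
// addition in this sketch, and the 20 GiB threshold used below is a
// placeholder, not the real value.
func isLowVRAM(gpuTotals []uint64, overhead, threshold uint64) bool {
	var total uint64
	for _, t := range gpuTotals {
		if t > overhead {
			total += t - overhead
		}
	}
	return total < threshold
}

func main() {
	const gib = uint64(1) << 30
	fmt.Println(isLowVRAM([]uint64{8 * gib}, gib/2, 20*gib))  // true: a single 8 GiB card
	fmt.Println(isLowVRAM([]uint64{24 * gib}, gib/2, 20*gib)) // false
}
```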
@@ -1530,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
if len(req.Tools) > 0 { if len(req.Tools) > 0 {
caps = append(caps, model.CapabilityTools) caps = append(caps, model.CapabilityTools)
} }
if req.Think != nil && req.Think.AsBool() { if req.Think != nil && req.Think.Bool() {
caps = append(caps, model.CapabilityThinking) caps = append(caps, model.CapabilityThinking)
} }
@@ -1584,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
// Validate Think value: string values currently only allowed for gptoss models // Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony { if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())}) c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return return
} }
@@ -1603,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
var thinkingState *thinking.Parser var thinkingState *thinking.Parser
openingTag, closingTag := thinking.InferTags(m.Template.Template) openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" { if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{ thinkingState = &thinking.Parser{
OpeningTag: openingTag, OpeningTag: openingTag,
ClosingTag: closingTag, ClosingTag: closingTag,
@@ -1655,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
*toolName = strings.TrimPrefix(*toolName, "functions.") *toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil { if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
ch <- gin.H{"error parsing tool call": err.Error()} errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
ch <- gin.H{"error": errStr}
return return
} }
res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}} res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}

View File

@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly // opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var estimatedVRAM uint64
var numParallelToTry []int var numParallelToTry []int
if *numParallel <= 0 { if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1 // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,42 +767,54 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
} }
for _, gl := range gpus.ByLibrary() { for _, gl := range gpus.ByLibrary() {
var ok bool
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...) sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc. // TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups // Note: at present, this will favor GPUs with the most currently available VRAM, ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
// First attempt to fit the model into a single GPU if !envconfig.SchedSpread() {
for _, p := range numParallelToTry { for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread() { // Try to pack into as few GPUs as possible, starting from 1 GPU
for _, g := range sgl { for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { gpuSubset := sgl[:numGPUs]
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
*numParallel = p
return []discover.GpuInfo{g}
}
}
}
}
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
*numParallel = p
return gpuSubset
}
}
}
} else {
// TODO future refinements // TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit // - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family // - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs // Now try all the GPUs (OLLAMA_SCHED_SPREAD is set)
for _, p := range numParallelToTry { for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) slog.Info("new model will fit in available VRAM, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
*numParallel = p *numParallel = p
return sgl return sgl
} }
} }
} }
}
return nil return nil
} }
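
The reworked non-spread path no longer jumps from "one GPU" straight to "all GPUs in the family": with the list sorted by free VRAM (descending), it grows a prefix subset one GPU at a time and loads onto the smallest subset that fits. A hedged sketch of that packing loop, with `fits` standing in for `llm.PredictServerFit`:

```go
package main

import "fmt"

// gpu is a simplified stand-in for discover.GpuInfo.
type gpu struct {
	id   string
	free uint64
}

// fits stands in for llm.PredictServerFit: here a subset "fits" if its
// combined free memory covers the requirement.
func fits(subset []gpu, required uint64) bool {
	var total uint64
	for _, g := range subset {
		total += g.free
	}
	return total >= required
}

// pickMinimalSubset mirrors the packing loop above: GPUs are assumed to be
// pre-sorted by free memory (descending), and the smallest prefix that fits
// is returned, so the model spreads across as few GPUs as possible.
func pickMinimalSubset(sorted []gpu, required uint64) []gpu {
	for n := 1; n <= len(sorted); n++ {
		if subset := sorted[:n]; fits(subset, required) {
			return subset
		}
	}
	return nil
}

func main() {
	gpus := []gpu{{"gpu0", 24 << 30}, {"gpu1", 12 << 30}}
	fmt.Println(len(pickMinimalSubset(gpus, 30<<30))) // 2: needs both cards
}
```

Under this scheme a mixed setup still prefers loading entirely onto the single GPU with the most free memory, matching the old single-GPU fast path, while also covering models that need two or more cards without spilling onto every GPU in the family.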