diff --git a/llama/patches/0024-ggml-Enable-resetting-backend-devices.patch b/llama/patches/0024-ggml-Enable-resetting-backend-devices.patch new file mode 100644 index 00000000..84aefd1d --- /dev/null +++ b/llama/patches/0024-ggml-Enable-resetting-backend-devices.patch @@ -0,0 +1,130 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Wed, 27 Aug 2025 14:39:48 -0700 +Subject: [PATCH] ggml: Enable resetting backend devices + +Touching a CUDA device causes the allocation of a primary context +with CUDA data structures (~300 MB of VRAM). If a device is +unused then it can be reset to free these data structures. +--- + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-backend-impl.h | 4 ++++ + ggml/src/ggml-backend.cpp | 8 ++++++++ + ggml/src/ggml-cuda/ggml-cuda.cu | 17 +++++++++++++++-- + ggml/src/ggml-cuda/vendors/hip.h | 1 + + 5 files changed, 29 insertions(+), 2 deletions(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index b602a7c78..fda5ceb24 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -167,6 +167,7 @@ extern "C" { + GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); + GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); + GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); ++ GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index 81749a5a3..6f10c353b 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -178,6 +178,10 @@ extern "C" { + ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); + void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); + void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); ++ ++ // (optional) reset device, clearing existing allocations and context ++ // the caller must ensure that there are no outstanding buffers, as these will become invalid ++ void (*reset)(ggml_backend_dev_t dev); + }; + + struct ggml_backend_device { +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index 05a842ed5..6556943b0 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par + return device->iface.init_backend(device, params); + } + ++void ggml_backend_dev_reset(ggml_backend_dev_t device) { ++ if (device->iface.reset == NULL) { ++ return; ++ } ++ ++ device->iface.reset(device); ++} ++ + ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { + return device->iface.get_buffer_type(device); + } +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index c7f9dc3a5..e43fde523 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -103,6 +103,11 @@ int ggml_cuda_get_device() { + return id; + } + ++void ggml_cuda_reset_device(int device) { ++ ggml_cuda_set_device(device); ++ CUDA_CHECK(cudaDeviceReset()); ++} ++ + static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { + ggml_cuda_set_device(device); + cudaError_t err; +@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back + props->description = ggml_backend_cuda_device_get_description(dev); + props->id = ggml_backend_cuda_device_get_id(dev); + props->type = ggml_backend_cuda_device_get_type(dev); +- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ++ ++ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device). ++ // If you need the memory data, call ggml_backend_dev_memory() explicitly. ++ props->memory_total = props->memory_free = 0; + + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; + #ifdef GGML_CUDA_NO_PEER_COPY +@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g + CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); + } + ++static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) { ++ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ++ ggml_cuda_reset_device(ctx->device); ++} ++ + static const ggml_backend_device_i ggml_backend_cuda_device_interface = { + /* .get_name = */ ggml_backend_cuda_device_get_name, + /* .get_description = */ ggml_backend_cuda_device_get_description, +@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { + /* .event_new = */ ggml_backend_cuda_device_event_new, + /* .event_free = */ ggml_backend_cuda_device_event_free, + /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, ++ /* .reset = */ ggml_backend_cuda_device_reset, + }; + + // backend reg +@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { + dev_ctx->device = i; + dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); + +- ggml_cuda_set_device(i); + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); + dev_ctx->description = prop.name; +diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h +index c31f31923..cf22e60d2 100644 +--- a/ggml/src/ggml-cuda/vendors/hip.h ++++ b/ggml/src/ggml-cuda/vendors/hip.h +@@ -40,6 +40,7 @@ + #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess + #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess + #define cudaDeviceProp hipDeviceProp_t ++#define cudaDeviceReset hipDeviceReset + #define cudaDeviceSynchronize hipDeviceSynchronize + #define cudaError_t hipError_t + #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index e8403e06..6253c34e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -629,6 +629,18 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { }) } + // Cleanup any backend state from devices that we didn't end up using +nextDevice: + for _, d := range append(gpus, append(accels, cpus...)...) { + for _, backend := range b.schedBackends { + if d == C.ggml_backend_get_device(backend) { + continue nextDevice + } + } + + C.ggml_backend_dev_reset(d) + } + if err := g.Wait(); err != nil { return err } diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index b602a7c7..fda5ceb2 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -167,6 +167,7 @@ extern "C" { GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); + GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index 81749a5a..6f10c353 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -178,6 +178,10 @@ extern "C" { ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); + + // (optional) reset device, clearing existing allocations and context + // the caller must ensure that there are no outstanding buffers, as these will become invalid + void (*reset)(ggml_backend_dev_t dev); }; struct ggml_backend_device { diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 05a842ed..6556943b 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par return device->iface.init_backend(device, params); } +void ggml_backend_dev_reset(ggml_backend_dev_t device) { + if (device->iface.reset == NULL) { + return; + } + + device->iface.reset(device); +} + ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { return device->iface.get_buffer_type(device); } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index c7f9dc3a..e43fde52 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -103,6 +103,11 @@ int ggml_cuda_get_device() { return id; } +void ggml_cuda_reset_device(int device) { + ggml_cuda_set_device(device); + CUDA_CHECK(cudaDeviceReset()); +} + static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); cudaError_t err; @@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back props->description = ggml_backend_cuda_device_get_description(dev); props->id = ggml_backend_cuda_device_get_id(dev); props->type = ggml_backend_cuda_device_get_type(dev); - ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); + + // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device). + // If you need the memory data, call ggml_backend_dev_memory() explicitly. + props->memory_total = props->memory_free = 0; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; #ifdef GGML_CUDA_NO_PEER_COPY @@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); } +static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + ggml_cuda_reset_device(ctx->device); +} + static const ggml_backend_device_i ggml_backend_cuda_device_interface = { /* .get_name = */ ggml_backend_cuda_device_get_name, /* .get_description = */ ggml_backend_cuda_device_get_description, @@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { /* .event_new = */ ggml_backend_cuda_device_event_new, /* .event_free = */ ggml_backend_cuda_device_event_free, /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, + /* .reset = */ ggml_backend_cuda_device_reset, }; // backend reg @@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->device = i; dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); - ggml_cuda_set_device(i); cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h index c31f3192..cf22e60d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h +++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h @@ -40,6 +40,7 @@ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled