Remove unnecessary macOS 13 and lower patches (#12656)

* Remove unnecessary macOS 13 patch * Remove unnecessary macOS version guard patch * Rename patches * Remove macOS 13 patch again * Rename files
2025-12-23 15:08:27 +00:00 · 2025-11-07 00:52:56 +01:00
parent 565b802a6b
commit d4e0da0890
15 changed files with 1 additions and 64 deletions
--- a/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch
+++ b/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch
@@ -0,0 +1,141 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Wed, 27 Aug 2025 14:39:48 -0700
+Subject: [PATCH] ggml: Enable resetting backend devices
+
+Touching a CUDA device causes the allocation of a primary context
+with CUDA data structures (~300 MB of VRAM). If a device is
+unused then it can be reset to free these data structures.
+---
+ ggml/include/ggml-backend.h      |  1 +
+ ggml/src/ggml-backend-impl.h     |  4 ++++
+ ggml/src/ggml-backend.cpp        |  8 ++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 16 +++++++++++++++-
+ ggml/src/ggml-cuda/vendors/hip.h |  1 +
+ src/llama.cpp                    |  4 +++-
+ 6 files changed, 32 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index b3b5b356a..69223c488 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -178,6 +178,7 @@ extern "C" {
+     GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+     GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+     GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API void                          ggml_backend_dev_reset(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
+index 7bdf9d81f..21b35ac5c 100644
+--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
+@@ -195,6 +195,10 @@ extern "C" {
+         ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
+         void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
+         void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+        // (optional) reset device, clearing existing allocations and context
+        // the caller must ensure that there are no outstanding buffers, as these will become invalid
+        void (*reset)(ggml_backend_dev_t dev);
+     };
+ 
+     struct ggml_backend_device {
+diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
+index c81a2e48a..9b0a9b91f 100644
+--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
+     return device->iface.init_backend(device, params);
+ }
+ 
+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
+    if (device->iface.reset == NULL) {
+        return;
+    }
+
+    device->iface.reset(device);
+}
+
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+     GGML_ASSERT(device);
+     return device->iface.get_buffer_type(device);
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index f79e5d65c..c9333689f 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
+     return id;
+ }
+ 
+void ggml_cuda_reset_device(int device) {
+    ggml_cuda_set_device(device);
+    CUDA_CHECK(cudaDeviceReset());
+}
+
+ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+     ggml_cuda_set_device(device);
+     cudaError_t err;
+@@ -3499,7 +3504,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+     props->id          = ggml_backend_cuda_device_get_id(dev);
+     props->type        = ggml_backend_cuda_device_get_type(dev);
+     props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
+-    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+    // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+    props->memory_total = props->memory_free = 0;
+ 
+     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
+ #ifdef GGML_CUDA_NO_PEER_COPY
+@@ -3936,6 +3944,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+     CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+ }
+ 
+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_reset_device(ctx->device);
+}
+
+ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+     /* .get_name                = */ ggml_backend_cuda_device_get_name,
+     /* .get_description         = */ ggml_backend_cuda_device_get_description,
+@@ -3952,6 +3965,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+     /* .event_new               = */ ggml_backend_cuda_device_event_new,
+     /* .event_free              = */ ggml_backend_cuda_device_event_free,
+     /* .event_synchronize       = */ ggml_backend_cuda_device_event_synchronize,
+    /* .reset                   = */ ggml_backend_cuda_device_reset,
+ };
+ 
+ // backend reg
+diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
+index 890c10364..1f06be80e 100644
+--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
+@@ -45,6 +45,7 @@
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+ #define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+diff --git a/src/llama.cpp b/src/llama.cpp
+index ab2e9868a..74c49e651 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -270,10 +270,12 @@ static struct llama_model * llama_model_load_from_file_impl(
+     for (auto * dev : model->devices) {
+         ggml_backend_dev_props props;
+         ggml_backend_dev_get_props(dev, &props);
+        size_t memory_free, memory_total;
+        ggml_backend_dev_memory(dev, &memory_free, &memory_total);
+         LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                 ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                 props.device_id ? props.device_id : "unknown id",
+-                props.memory_free/1024/1024);
+                memory_free/1024/1024);
+     }
+ 
+     const int status = llama_model_load(path_model, splits, *model, params);