ggml: handle all streams (#13350)

Follow-up to #12992

Free all streams, and keep the alloc logic aligned across streams.
This commit is contained in:
Daniel Hiltgen
2025-12-05 16:10:33 -08:00
committed by GitHub
parent 31b8c6a214
commit c146a138e3
6 changed files with 55 additions and 35 deletions

View File

@@ -1221,7 +1221,11 @@ struct ggml_backend_cuda_context {
// Returns the pool for the given device on the current stream, creating it
// lazily on first use. To keep allocation behavior aligned across streams,
// a newly created pool inherits stream 0's alloc_memory() decision when
// stream 0's pool already exists; otherwise it defaults to allocating.
ggml_cuda_pool & pool(int device) {
    if (pools[device][curr_stream_no] == nullptr) {
        bool alloc = true;
        if (pools[device][0] != nullptr) {
            alloc = pools[device][0]->alloc_memory();
        }
        pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
    }
    return *pools[device][curr_stream_no];
}
@@ -1238,12 +1242,12 @@ struct ggml_backend_cuda_context {
}
}
// Returns the number of bytes allocated by the pool for the given stream on
// the current device, or 0 if that stream's pool has not been created yet.
size_t pool_get_alloc_size(int stream_no) {
    if (pools[device][stream_no] == nullptr) {
        return 0;
    }
    return pools[device][stream_no]->alloc_size();
}
};

View File

@@ -3913,12 +3913,18 @@ static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend,
// Reports the total pool allocation for this backend, summed across ALL
// streams (not just the current one) so every stream's pool is accounted for.
static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
    size_t allocs = 0;
    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
        allocs += ctx->pool_get_alloc_size(i);
    }
    return allocs;
}
// Clears the pool pointers for EVERY stream on the context's device, not
// just the current stream, so no stream keeps a stale pool after reset.
// NOTE(review): this only NULLs the pointers — presumably the pool objects
// are freed elsewhere; confirm against the pool ownership model.
static void ggml_backend_cuda_reset(ggml_backend_t backend) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
        ctx->pools[ctx->device][i] = NULL;
    }
}
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {