ggml: handle all streams (#13350)

Follow-up to #12992

Free all streams, and keep the alloc logic aligned across streams.
This commit is contained in:
Daniel Hiltgen
2025-12-05 16:10:33 -08:00
committed by GitHub
parent 31b8c6a214
commit c146a138e3
6 changed files with 55 additions and 35 deletions

View File

@@ -1221,7 +1221,11 @@ struct ggml_backend_cuda_context {
// Returns the pool for the given device on the current stream, creating it
// lazily on first use. To keep allocation behavior aligned across streams,
// a newly created pool inherits stream 0's alloc_memory() decision when
// stream 0's pool already exists; otherwise it defaults to allocating.
ggml_cuda_pool & pool(int device) {
    if (pools[device][curr_stream_no] == nullptr) {
        bool alloc = true;
        if (pools[device][0] != nullptr) {
            alloc = pools[device][0]->alloc_memory();
        }
        pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
    }
    return *pools[device][curr_stream_no];
}
@@ -1238,12 +1242,12 @@ struct ggml_backend_cuda_context {
}
}
// Returns the number of bytes allocated by the pool for the given stream on
// the current device, or 0 if that stream's pool has not been created yet.
size_t pool_get_alloc_size(int stream_no) {
    if (pools[device][stream_no] == nullptr) {
        return 0;
    }
    return pools[device][stream_no]->alloc_size();
}
};

View File

@@ -3913,12 +3913,18 @@ static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend,
// Reports the total pool allocation for this backend, summed across ALL
// streams (not just the current one) so every stream's pool is accounted for.
static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
    size_t allocs = 0;
    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
        allocs += ctx->pool_get_alloc_size(i);
    }
    return allocs;
}
// Clears the pool pointers for EVERY stream on the context's device, not
// just the current stream, so no stream keeps a stale pool after reset.
// NOTE(review): this only NULLs the pointers — presumably the pool objects
// are freed elsewhere; confirm against the pool ownership model.
static void ggml_backend_cuda_reset(ggml_backend_t backend) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
        ctx->pools[ctx->device][i] = NULL;
    }
}
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {