ggml: handle all streams (#13350)

Follow-up to #12992

Free all streams, and keep the alloc logic aligned across streams.
Author: Daniel Hiltgen
Date: 2025-12-05 16:10:33 -08:00
Committed by: GitHub
Parent: 31b8c6a214
Commit: c146a138e3
6 changed files with 55 additions and 35 deletions
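
Before the diff, a rough sketch of the pattern the fix follows: tear down every stream on every device rather than only the first one, and drive the allocation side with the same per-device/per-stream loop bounds so the alloc and free paths stay aligned. This is an illustration under assumed names, not the actual ggml code; kMaxDevices, kMaxStreams, g_streams, and free_all_streams are all made up for the example.

#include <cuda_runtime.h>

// Hypothetical bounds standing in for ggml's per-device stream table.
constexpr int kMaxDevices = 16;
constexpr int kMaxStreams = 8;

// Assumed per-device stream table; zero-initialized entries are null.
static cudaStream_t g_streams[kMaxDevices][kMaxStreams] = {};

static void free_all_streams(int device_count) {
    for (int dev = 0; dev < device_count; ++dev) {
        cudaSetDevice(dev);  // streams belong to the device they were created on
        for (int s = 0; s < kMaxStreams; ++s) {
            if (g_streams[dev][s] != nullptr) {
                cudaStreamDestroy(g_streams[dev][s]);  // every stream, not just stream 0
                g_streams[dev][s] = nullptr;
            }
        }
    }
}

Whatever code allocates per-stream resources (memory pools, events) would then iterate over the same bounds, so nothing gets created on a stream that this teardown loop does not visit.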

@@ -58,7 +58,7 @@ index 6d493a4ff..ac8f38464 100644
 set_target_properties(ggml-base PROPERTIES
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0787e443c..736d47c07 100644
+index 1110ca372..c1bfadb3e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -90,7 +90,7 @@ index 0787e443c..736d47c07 100644
 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
 ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
+@@ -4255,6 +4270,11 @@ struct ggml_backend_cuda_device_context {
 std::string description;
 std::string pci_bus_id;
 std::string id;
@@ -102,7 +102,7 @@ index 0787e443c..736d47c07 100644
 };
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -4351,6 +4371,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 ggml_cuda_set_device(ctx->device);
@@ -131,7 +131,7 @@ index 0787e443c..736d47c07 100644
 CUDA_CHECK(cudaMemGetInfo(free, total));
 // ref: https://github.com/ggml-org/llama.cpp/pull/17368
-@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -4383,6 +4425,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
@@ -139,7 +139,7 @@ index 0787e443c..736d47c07 100644
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4396,6 +4439,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 // If you need the memory data, call ggml_backend_dev_memory() explicitly.
 props->memory_total = props->memory_free = 0;
@@ -159,7 +159,7 @@ index 0787e443c..736d47c07 100644
 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
 bool events = false;
-@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4980,6 +5036,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 std::lock_guard<std::mutex> lock(mutex);
 if (!initialized) {
 ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -167,7 +167,7 @@ index 0787e443c..736d47c07 100644
 for (int i = 0; i < ggml_cuda_info().device_count; i++) {
 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4995,6 +5052,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
 dev_ctx->pci_bus_id = pci_bus_id;