mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-21 22:33:56 +00:00
ggml: handle all streams (#13350)
Follow-up to #12992: free all streams, and keep the alloc logic aligned across streams.
This commit is contained in:
@@ -10,10 +10,10 @@ must be recreated with no-alloc set to false before loading data.
|
|||||||
---
|
---
|
||||||
ggml/include/ggml-backend.h | 1 +
|
ggml/include/ggml-backend.h | 1 +
|
||||||
ggml/src/ggml-backend-impl.h | 16 +++
|
ggml/src/ggml-backend-impl.h | 16 +++
|
||||||
ggml/src/ggml-backend.cpp | 72 ++++++++++-
|
ggml/src/ggml-backend.cpp | 72 +++++++++-
|
||||||
ggml/src/ggml-cuda/common.cuh | 58 ++++++++-
|
ggml/src/ggml-cuda/common.cuh | 62 ++++++++-
|
||||||
ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
|
ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
|
||||||
5 files changed, 321 insertions(+), 44 deletions(-)
|
5 files changed, 331 insertions(+), 44 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||||
index 2763f2bd6..b3b5b356a 100644
|
index 2763f2bd6..b3b5b356a 100644
|
||||||
@@ -219,7 +219,7 @@ index f511e8d76..74b7f070c 100644
|
|||||||
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
||||||
index 611341deb..c3f8ca914 100644
|
index 611341deb..ee463af9c 100644
|
||||||
--- a/ggml/src/ggml-cuda/common.cuh
|
--- a/ggml/src/ggml-cuda/common.cuh
|
||||||
+++ b/ggml/src/ggml-cuda/common.cuh
|
+++ b/ggml/src/ggml-cuda/common.cuh
|
||||||
@@ -37,6 +37,41 @@
|
@@ -37,6 +37,41 @@
|
||||||
@@ -274,7 +274,7 @@ index 611341deb..c3f8ca914 100644
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
|
@@ -1179,11 +1217,15 @@ struct ggml_backend_cuda_context {
|
||||||
// pool
|
// pool
|
||||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
|
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
|
||||||
|
|
||||||
@@ -284,11 +284,15 @@ index 611341deb..c3f8ca914 100644
|
|||||||
ggml_cuda_pool & pool(int device) {
|
ggml_cuda_pool & pool(int device) {
|
||||||
if (pools[device][curr_stream_no] == nullptr) {
|
if (pools[device][curr_stream_no] == nullptr) {
|
||||||
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
|
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
|
||||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
|
+ bool alloc = true;
|
||||||
|
+ if (pools[device][0] != nullptr) {
|
||||||
|
+ alloc = pools[device][0]->alloc_memory();
|
||||||
|
+ }
|
||||||
|
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
|
||||||
}
|
}
|
||||||
return *pools[device][curr_stream_no];
|
return *pools[device][curr_stream_no];
|
||||||
}
|
}
|
||||||
@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
|
@@ -1191,6 +1233,22 @@ struct ggml_backend_cuda_context {
|
||||||
ggml_cuda_pool & pool() {
|
ggml_cuda_pool & pool() {
|
||||||
return pool(device);
|
return pool(device);
|
||||||
}
|
}
|
||||||
@@ -301,18 +305,18 @@ index 611341deb..c3f8ca914 100644
|
|||||||
+ }
|
+ }
|
||||||
+ }
|
+ }
|
||||||
+
|
+
|
||||||
+ size_t pool_get_alloc_size() {
|
+ size_t pool_get_alloc_size(int stream_no) {
|
||||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
+ if (pools[device][stream_no] == nullptr) {
|
||||||
+ return 0;
|
+ return 0;
|
||||||
+ }
|
+ }
|
||||||
+
|
+
|
||||||
+ return pools[device][curr_stream_no]->alloc_size();
|
+ return pools[device][stream_no]->alloc_size();
|
||||||
+ }
|
+ }
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_cuda_mm_fusion_args_host {
|
struct ggml_cuda_mm_fusion_args_host {
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index 78fb2d8b3..fe0da71ca 100644
|
index 78fb2d8b3..f1c178f31 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||||
@@ -583,7 +587,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||||||
|
|
||||||
ggml_cuda_set_device(cuda_ctx->device);
|
ggml_cuda_set_device(cuda_ctx->device);
|
||||||
|
|
||||||
@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
@@ -3766,6 +3836,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -644,18 +648,24 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||||||
+
|
+
|
||||||
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
|
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
|
||||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
+ return ctx->pool_get_alloc_size();
|
+ size_t allocs = 0;
|
||||||
|
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||||
|
+ allocs += ctx->pool_get_alloc_size(i);
|
||||||
|
+ }
|
||||||
|
+ return allocs;
|
||||||
+}
|
+}
|
||||||
+
|
+
|
||||||
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
||||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
+ ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
|
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||||
|
+ ctx->pools[ctx->device][i] = NULL;
|
||||||
|
+ }
|
||||||
+}
|
+}
|
||||||
+
|
+
|
||||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
@@ -4035,6 +4170,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
@@ -4035,6 +4176,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||||
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
|
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ index 74b7f070c..8d2cc167f 100644
|
|||||||
GGML_ASSERT(device);
|
GGML_ASSERT(device);
|
||||||
return device->iface.get_buffer_type(device);
|
return device->iface.get_buffer_type(device);
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index fe0da71ca..0787e443c 100644
|
index f1c178f31..1110ca372 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
|
@@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
|
||||||
@@ -77,7 +77,7 @@ index fe0da71ca..0787e443c 100644
|
|||||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
@@ -4380,7 +4385,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
@@ -4386,7 +4391,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||||
props->id = ggml_backend_cuda_device_get_id(dev);
|
props->id = ggml_backend_cuda_device_get_id(dev);
|
||||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||||
@@ -89,7 +89,7 @@ index fe0da71ca..0787e443c 100644
|
|||||||
|
|
||||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
@@ -4835,6 +4843,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
@@ -4841,6 +4849,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
||||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ index fe0da71ca..0787e443c 100644
|
|||||||
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
||||||
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
||||||
@@ -4851,6 +4864,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
@@ -4857,6 +4870,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||||
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
||||||
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
||||||
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ index 6d493a4ff..ac8f38464 100644
|
|||||||
|
|
||||||
set_target_properties(ggml-base PROPERTIES
|
set_target_properties(ggml-base PROPERTIES
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index 0787e443c..736d47c07 100644
|
index 1110ca372..c1bfadb3e 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
@@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
@@ -90,7 +90,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||||
@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
|
@@ -4255,6 +4270,11 @@ struct ggml_backend_cuda_device_context {
|
||||||
std::string description;
|
std::string description;
|
||||||
std::string pci_bus_id;
|
std::string pci_bus_id;
|
||||||
std::string id;
|
std::string id;
|
||||||
@@ -102,7 +102,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||||
@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
@@ -4351,6 +4371,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
@@ -131,7 +131,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||||
|
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
|
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
|
||||||
@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
@@ -4383,6 +4425,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,7 +139,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||||
|
|
||||||
@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
@@ -4396,6 +4439,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||||
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
||||||
props->memory_total = props->memory_free = 0;
|
props->memory_total = props->memory_free = 0;
|
||||||
|
|
||||||
@@ -159,7 +159,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
bool events = false;
|
bool events = false;
|
||||||
@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
@@ -4980,6 +5036,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||||
@@ -167,7 +167,7 @@ index 0787e443c..736d47c07 100644
|
|||||||
|
|
||||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||||
@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
@@ -4995,6 +5052,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||||
dev_ctx->pci_bus_id = pci_bus_id;
|
dev_ctx->pci_bus_id = pci_bus_id;
|
||||||
|
|
||||||
|
|||||||
@@ -10,10 +10,10 @@ fallback to cpu
|
|||||||
1 file changed, 3 insertions(+)
|
1 file changed, 3 insertions(+)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index 736d47c07..7350f6758 100644
|
index c1bfadb3e..16c166a08 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -4564,6 +4564,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
@@ -4570,6 +4570,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
12
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
vendored
12
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
vendored
@@ -1221,7 +1221,11 @@ struct ggml_backend_cuda_context {
|
|||||||
|
|
||||||
ggml_cuda_pool & pool(int device) {
|
ggml_cuda_pool & pool(int device) {
|
||||||
if (pools[device][curr_stream_no] == nullptr) {
|
if (pools[device][curr_stream_no] == nullptr) {
|
||||||
pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
|
bool alloc = true;
|
||||||
|
if (pools[device][0] != nullptr) {
|
||||||
|
alloc = pools[device][0]->alloc_memory();
|
||||||
|
}
|
||||||
|
pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
|
||||||
}
|
}
|
||||||
return *pools[device][curr_stream_no];
|
return *pools[device][curr_stream_no];
|
||||||
}
|
}
|
||||||
@@ -1238,12 +1242,12 @@ struct ggml_backend_cuda_context {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t pool_get_alloc_size() {
|
size_t pool_get_alloc_size(int stream_no) {
|
||||||
if (pools[device][curr_stream_no] == nullptr) {
|
if (pools[device][stream_no] == nullptr) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return pools[device][curr_stream_no]->alloc_size();
|
return pools[device][stream_no]->alloc_size();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
10
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
10
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
@@ -3913,12 +3913,18 @@ static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend,
|
|||||||
|
|
||||||
static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
|
static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
return ctx->pool_get_alloc_size();
|
size_t allocs = 0;
|
||||||
|
for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||||
|
allocs += ctx->pool_get_alloc_size(i);
|
||||||
|
}
|
||||||
|
return allocs;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
|
for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||||
|
ctx->pools[ctx->device][i] = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||||
|
|||||||
Reference in New Issue
Block a user