diff --git a/llama/patches/0020-ggml-No-alloc-mode.patch b/llama/patches/0020-ggml-No-alloc-mode.patch
index 0dff5573..01a42690 100644
--- a/llama/patches/0020-ggml-No-alloc-mode.patch
+++ b/llama/patches/0020-ggml-No-alloc-mode.patch
@@ -10,10 +10,10 @@ must be recreated with no-alloc set to false before loading data.
 ---
  ggml/include/ggml-backend.h     |   1 +
  ggml/src/ggml-backend-impl.h    |  16 +++
- ggml/src/ggml-backend.cpp       |  72 ++++++++++-
- ggml/src/ggml-cuda/common.cuh   |  58 ++++++++-
- ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
- 5 files changed, 321 insertions(+), 44 deletions(-)
+ ggml/src/ggml-backend.cpp       |  72 +++++++++-
+ ggml/src/ggml-cuda/common.cuh   |  62 ++++++++-
+ ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
+ 5 files changed, 331 insertions(+), 44 deletions(-)
 
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
 index 2763f2bd6..b3b5b356a 100644
@@ -219,7 +219,7 @@ index f511e8d76..74b7f070c 100644
   void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 
 diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index 611341deb..c3f8ca914 100644
+index 611341deb..ee463af9c 100644
 --- a/ggml/src/ggml-cuda/common.cuh
 +++ b/ggml/src/ggml-cuda/common.cuh
 @@ -37,6 +37,41 @@
@@ -274,7 +274,7 @@ index 611341deb..c3f8ca914 100644
   };
 
   template
-@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
+@@ -1179,11 +1217,15 @@ struct ggml_backend_cuda_context {
       // pool
       std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
 
@@ -284,11 +284,15 @@
       ggml_cuda_pool & pool(int device) {
           if (pools[device][curr_stream_no] == nullptr) {
 -            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
-+            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
++            bool alloc = true;
++            if (pools[device][0] != nullptr) {
++                alloc = pools[device][0]->alloc_memory();
++            }
++            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
           }
           return *pools[device][curr_stream_no];
       }
-@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
+@@ -1191,6 +1233,22 @@ struct ggml_backend_cuda_context {
       ggml_cuda_pool & pool() {
           return pool(device);
       }
@@ -301,18 +305,18 @@ index 611341deb..c3f8ca914 100644
 +        }
 +    }
 +
-+    size_t pool_get_alloc_size() {
-+        if (pools[device][curr_stream_no] == nullptr) {
++    size_t pool_get_alloc_size(int stream_no) {
++        if (pools[device][stream_no] == nullptr) {
 +            return 0;
 +        }
 +
-+        return pools[device][curr_stream_no]->alloc_size();
++        return pools[device][stream_no]->alloc_size();
 +    }
   };
 
   struct ggml_cuda_mm_fusion_args_host {
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 78fb2d8b3..fe0da71ca 100644
+index 78fb2d8b3..f1c178f31 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
       ggml_cuda_set_device(cuda_ctx->device);
-@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3766,6 +3836,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
       return GGML_STATUS_SUCCESS;
   }
 
 +static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
 +    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-+    return ctx->pool_get_alloc_size();
++    size_t allocs = 0;
++    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
++        allocs += ctx->pool_get_alloc_size(i);
++    }
++    return allocs;
 +}
 +
 +static void ggml_backend_cuda_reset(ggml_backend_t backend) {
 +    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-+    ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
++    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
++        ctx->pools[ctx->device][i] = NULL;
++    }
 +}
 +
   static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
       ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-@@ -4035,6 +4170,9 @@
+@@ -4035,6 +4176,9 @@
       /* .event_record   = */ ggml_backend_cuda_event_record,
       /* .event_wait     = */ ggml_backend_cuda_event_wait,
       /* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
diff --git a/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch b/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch
index c65d84f7..04a6b0be 100644
--- a/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch
+++ b/llama/patches/0022-ggml-Enable-resetting-backend-devices.patch
@@ -62,7 +62,7 @@ index 74b7f070c..8d2cc167f 100644
       GGML_ASSERT(device);
       return device->iface.get_buffer_type(device);
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index fe0da71ca..0787e443c 100644
+index f1c178f31..1110ca372 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
   static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
       ggml_cuda_set_device(device);
       cudaError_t err;
-@@ -4380,7 +4385,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4386,7 +4391,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
       props->id        = ggml_backend_cuda_device_get_id(dev);
       props->type      = ggml_backend_cuda_device_get_type(dev);
       props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
 
       bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
   #ifdef GGML_CUDA_NO_PEER_COPY
-@@ -4835,6 +4843,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+@@ -4841,6 +4849,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
       CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
   }
 
   static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
       /* .get_name        = */ ggml_backend_cuda_device_get_name,
       /* .get_description = */ ggml_backend_cuda_device_get_description,
-@@ -4851,6 +4864,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+@@ -4857,6 +4870,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
       /* .event_new         = */ ggml_backend_cuda_device_event_new,
       /* .event_free        = */ ggml_backend_cuda_device_event_free,
       /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
diff --git a/llama/patches/0024-GPU-discovery-enhancements.patch b/llama/patches/0024-GPU-discovery-enhancements.patch
index c372f0bc..e4cebfae 100644
--- a/llama/patches/0024-GPU-discovery-enhancements.patch
+++ b/llama/patches/0024-GPU-discovery-enhancements.patch
@@ -58,7 +58,7 @@ index 6d493a4ff..ac8f38464 100644
       set_target_properties(ggml-base PROPERTIES
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0787e443c..736d47c07 100644
+index 1110ca372..c1bfadb3e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
       GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
+@@ -4255,6 +4270,11 @@ struct ggml_backend_cuda_device_context {
       std::string description;
       std::string pci_bus_id;
       std::string id;
 
   static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -4351,6 +4371,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
   static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
       ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
       ggml_cuda_set_device(ctx->device);
       CUDA_CHECK(cudaMemGetInfo(free, total));
 
       // ref: https://github.com/ggml-org/llama.cpp/pull/17368
-@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -4383,6 +4425,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
       return GGML_BACKEND_DEVICE_TYPE_GPU;
   }
 
   static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
       ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4396,6 +4439,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
       // If you need the memory data, call ggml_backend_dev_memory() explicitly.
       props->memory_total = props->memory_free = 0;
 
       bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
   #ifdef GGML_CUDA_NO_PEER_COPY
       bool events = false;
-@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4980,6 +5036,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
           std::lock_guard<std::mutex> lock(mutex);
           if (!initialized) {
               ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
 
           for (int i = 0; i < ggml_cuda_info().device_count; i++) {
               ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4995,6 +5052,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
               snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
               dev_ctx->pci_bus_id = pci_bus_id;
diff --git a/llama/patches/0029-ggml-cuda-skip-large-batches.patch b/llama/patches/0029-ggml-cuda-skip-large-batches.patch
index d1d1addd..834b6e9d 100644
--- a/llama/patches/0029-ggml-cuda-skip-large-batches.patch
+++ b/llama/patches/0029-ggml-cuda-skip-large-batches.patch
@@ -10,10 +10,10 @@ fallback to cpu
  1 file changed, 3 insertions(+)
 
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 736d47c07..7350f6758 100644
+index c1bfadb3e..16c166a08 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -4564,6 +4564,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+@@ -4570,6 +4570,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
       if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
           return false;
       }
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
index c3f8ca91..ee463af9 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@@ -1221,7 +1221,11 @@ struct ggml_backend_cuda_context {
 
     ggml_cuda_pool & pool(int device) {
         if (pools[device][curr_stream_no] == nullptr) {
-            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
+            bool alloc = true;
+            if (pools[device][0] != nullptr) {
+                alloc = pools[device][0]->alloc_memory();
+            }
+            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
         }
         return *pools[device][curr_stream_no];
     }
@@ -1238,12 +1242,12 @@ struct ggml_backend_cuda_context {
         }
     }
 
-    size_t pool_get_alloc_size() {
-        if (pools[device][curr_stream_no] == nullptr) {
+    size_t pool_get_alloc_size(int stream_no) {
+        if (pools[device][stream_no] == nullptr) {
             return 0;
         }
 
-        return pools[device][curr_stream_no]->alloc_size();
+        return pools[device][stream_no]->alloc_size();
     }
 };
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index 7350f675..16c166a0 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3913,12 +3913,18 @@ static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend,
 static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
     ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-    return ctx->pool_get_alloc_size();
+    size_t allocs = 0;
+    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
+        allocs += ctx->pool_get_alloc_size(i);
+    }
+    return allocs;
 }
 
 static void ggml_backend_cuda_reset(ggml_backend_t backend) {
     ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-    ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
+    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
+        ctx->pools[ctx->device][i] = NULL;
+    }
 }
 
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {