mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-23 15:08:27 +00:00
ggml update to b7108 (#12992)
* Revert "vulkan: temporary cary of vulkan fixes (#12971)"
This reverts commit 3a9e8e9fd4.
* ggml update to b7087
* fix argsort on metal
* update to b7108
* fix bakllava regression
This model lacks the metadata for the projector type.
* update to b7209
* fix TopK perf
* only build arm code on arm
This commit is contained in:
@@ -12,8 +12,8 @@ must be recreated with no-alloc set to false before loading data.
|
||||
ggml/src/ggml-backend-impl.h | 16 +++
|
||||
ggml/src/ggml-backend.cpp | 72 ++++++++++-
|
||||
ggml/src/ggml-cuda/common.cuh | 58 ++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
|
||||
5 files changed, 320 insertions(+), 44 deletions(-)
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
|
||||
5 files changed, 321 insertions(+), 44 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 2763f2bd6..b3b5b356a 100644
|
||||
@@ -75,7 +75,7 @@ index 0f5b03cef..7bdf9d81f 100644
|
||||
|
||||
struct ggml_backend {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 41eef3b5f..c81a2e48a 100644
|
||||
index f511e8d76..74b7f070c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
|
||||
@@ -134,7 +134,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1614,6 +1640,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
@@ -152,7 +152,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1655,11 +1692,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
||||
}
|
||||
}
|
||||
@@ -167,7 +167,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1674,6 +1714,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
}
|
||||
@@ -178,7 +178,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
@@ -1719,6 +1763,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -203,7 +203,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
return true;
|
||||
@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
@@ -1824,7 +1886,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
@@ -219,10 +219,10 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
||||
index 41ff89c4d..2931c15ca 100644
|
||||
index 611341deb..c3f8ca914 100644
|
||||
--- a/ggml/src/ggml-cuda/common.cuh
|
||||
+++ b/ggml/src/ggml-cuda/common.cuh
|
||||
@@ -35,6 +35,41 @@
|
||||
@@ -37,6 +37,41 @@
|
||||
#include "vendors/cuda.h"
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -264,7 +264,7 @@ index 41ff89c4d..2931c15ca 100644
|
||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
|
||||
@@ -856,6 +891,9 @@ struct ggml_cuda_pool {
|
||||
@@ -891,6 +926,9 @@ struct ggml_cuda_pool {
|
||||
|
||||
virtual void * alloc(size_t size, size_t * actual_size) = 0;
|
||||
virtual void free(void * ptr, size_t size) = 0;
|
||||
@@ -274,46 +274,48 @@ index 41ff89c4d..2931c15ca 100644
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
@@ -992,11 +1030,11 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
|
||||
// pool
|
||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
|
||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
|
||||
|
||||
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
|
||||
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);
|
||||
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
|
||||
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no, bool alloc);
|
||||
|
||||
ggml_cuda_pool & pool(int device) {
|
||||
if (pools[device] == nullptr) {
|
||||
- pools[device] = new_pool_for_device(device);
|
||||
+ pools[device] = new_pool_for_device(device, true);
|
||||
if (pools[device][curr_stream_no] == nullptr) {
|
||||
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
|
||||
}
|
||||
return *pools[device];
|
||||
return *pools[device][curr_stream_no];
|
||||
}
|
||||
@@ -1004,4 +1042,20 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
|
||||
ggml_cuda_pool & pool() {
|
||||
return pool(device);
|
||||
}
|
||||
+
|
||||
+ void pool_set_alloc(bool alloc) {
|
||||
+ GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
|
||||
+ GGML_ASSERT(pools[device][curr_stream_no] == nullptr || pools[device][curr_stream_no]->alloc_memory() == alloc);
|
||||
+
|
||||
+ if (pools[device] == nullptr) {
|
||||
+ pools[device] = new_pool_for_device(device, alloc);
|
||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ size_t pool_get_alloc_size() {
|
||||
+ if (pools[device] == nullptr) {
|
||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ return pools[device]->alloc_size();
|
||||
+ return pools[device][curr_stream_no]->alloc_size();
|
||||
+ }
|
||||
};
|
||||
|
||||
struct ggml_cuda_mm_fusion_args_host {
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 02d413467..f79e5d65c 100644
|
||||
index 78fb2d8b3..fe0da71ca 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -359,6 +359,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
|
||||
@@ -322,7 +324,7 @@ index 02d413467..f79e5d65c 100644
|
||||
// buffer pool for cuda (legacy)
|
||||
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
static const int MAX_BUFFERS = 256;
|
||||
@@ -371,9 +373,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -373,9 +375,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
|
||||
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
|
||||
size_t pool_size = 0;
|
||||
@@ -337,7 +339,7 @@ index 02d413467..f79e5d65c 100644
|
||||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
@@ -381,7 +386,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -383,7 +388,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
@@ -348,7 +350,7 @@ index 02d413467..f79e5d65c 100644
|
||||
pool_size -= b.size;
|
||||
}
|
||||
}
|
||||
@@ -429,8 +436,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -431,8 +438,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
void * ptr;
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
@@ -366,7 +368,7 @@ index 02d413467..f79e5d65c 100644
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -450,10 +464,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -452,10 +466,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||
@@ -389,7 +391,7 @@ index 02d413467..f79e5d65c 100644
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
@@ -465,18 +489,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -467,18 +491,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
CUdeviceptr pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
@@ -417,7 +419,7 @@ index 02d413467..f79e5d65c 100644
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
|
||||
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
|
||||
@@ -503,35 +533,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -505,35 +535,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
||||
|
||||
@@ -493,7 +495,7 @@ index 02d413467..f79e5d65c 100644
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
@@ -564,16 +608,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -566,17 +610,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
||||
}
|
||||
@@ -505,11 +507,14 @@ index 02d413467..f79e5d65c 100644
|
||||
+ size_t alloc_size() override {
|
||||
+ return pool_size + last_alloc;
|
||||
+ }
|
||||
+
|
||||
};
|
||||
#endif // defined(GGML_USE_VMM)
|
||||
|
||||
-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device,
|
||||
- [[maybe_unused]] int stream_no) {
|
||||
+ [[maybe_unused]] int stream_no,
|
||||
+ bool alloc) {
|
||||
#if defined(GGML_USE_VMM)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
@@ -521,7 +526,7 @@ index 02d413467..f79e5d65c 100644
|
||||
}
|
||||
|
||||
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
|
||||
@@ -757,11 +809,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
@@ -760,11 +814,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
@@ -543,7 +548,7 @@ index 02d413467..f79e5d65c 100644
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
@@ -785,6 +846,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
@@ -788,6 +851,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL,
|
||||
@@ -551,7 +556,7 @@ index 02d413467..f79e5d65c 100644
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
@@ -2986,6 +3048,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
@@ -3258,6 +3322,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
@@ -559,7 +564,7 @@ index 02d413467..f79e5d65c 100644
|
||||
// flag used to determine whether it is an integrated_gpu
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
|
||||
@@ -3001,6 +3064,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3347,6 +3412,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -567,11 +572,10 @@ index 02d413467..f79e5d65c 100644
|
||||
+ if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
|
||||
@@ -3140,6 +3208,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
@@ -3691,6 +3760,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
@@ -579,7 +583,7 @@ index 02d413467..f79e5d65c 100644
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
@@ -3215,6 +3284,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -645,16 +649,16 @@ index 02d413467..f79e5d65c 100644
|
||||
+
|
||||
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
+ ctx->pools[ctx->device] = NULL;
|
||||
+ ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
@@ -3255,6 +3389,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
@@ -4035,6 +4170,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .graph_optimize = */ NULL,
|
||||
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
|
||||
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
|
||||
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
|
||||
+ /* .reset = */ ggml_backend_cuda_reset,
|
||||
|
||||
Reference in New Issue
Block a user