Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-25 07:58:01 +00:00)
Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
@@ -15,18 +15,18 @@ problem.
ggml/src/ggml-backend.cpp | 9 +++++++--
ggml/src/ggml-cann/ggml-cann.cpp | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
ggml/src/ggml-metal/ggml-metal.m | 1 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 ++
ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
ggml/src/ggml-rpc/ggml-rpc.cpp | 1 +
ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
8 files changed, 20 insertions(+), 2 deletions(-)
8 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b9d29e9..97f47abd 100644
index ff9135fe..8ba86f82 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
@@ -34,7 +34,7 @@ index 1b9d29e9..97f47abd 100644
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -529,6 +528,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)

free(ctx->buffers);
free(ctx);
@@ -42,9 +42,9 @@ index 1b9d29e9..97f47abd 100644
}

static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1890,6 +1890,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
+ delete buffer;
+}
@@ -54,7 +54,7 @@ index 1b9d29e9..97f47abd 100644
}

static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1937,7 +1942,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index 1b9d29e9..97f47abd 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cf575b36..ca1addfa 100755
index b51b554e..3ba0f5a6 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -826,6 +826,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
@@ -75,7 +75,7 @@ index cf575b36..ca1addfa 100755
}

/**
@@ -1572,6 +1573,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,7 +84,7 @@ index cf575b36..ca1addfa 100755

/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d9110491..37ee2a6d 100644
index b7e81b21..fdf8c63d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -111,23 +111,31 @@ index d9110491..37ee2a6d 100644
}

static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cb8eff4a..7bccc7bf 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6032,6 +6032,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e11555a7..909e17de 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));

free(ctx);
+ free(buffer);
ggml_metal_buffer_free(ctx);
+ delete buffer;
}

static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));

ggml_metal_buffer_free(ctx);
+ delete buffer;
}

static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 8ba1e00d..8163e8dc 100644
index 0cf3b924..09d706b5 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2745,6 +2745,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -136,10 +144,10 @@ index 8ba1e00d..8163e8dc 100644

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index df6ba540..2e395968 100644
index f99681c8..59591770 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -486,6 +486,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@@ -148,7 +156,7 @@ index df6ba540..2e395968 100644

static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3992dad0..67503951 100644
index 4ac919ea..447ea3c4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -176,10 +184,10 @@ index 3992dad0..67503951 100644

static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4070e248..394a2839 100644
index 2608cbd0..061cd078 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10209,6 +10209,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -187,7 +195,7 @@ index 4070e248..394a2839 100644
}

static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -10352,6 +10353,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);

@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f7e03e70..8ebe11cf 100644
index da938af0..2a38abf4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1804,16 +1804,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -31,8 +31,8 @@ index f7e03e70..8ebe11cf 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1975,7 +1966,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 20c21733..f4f69cfc 100644
index 210ecc88..355219a9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
@@ -33,7 +33,7 @@ index 20c21733..f4f69cfc 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

enum ffn_op_type {
@@ -2597,7 +2610,29 @@ struct clip_model_loader {
@@ -2759,7 +2772,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 20c21733..f4f69cfc 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2624,7 +2659,11 @@ struct clip_model_loader {
@@ -2786,7 +2821,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
|
||||
7 files changed, 248 insertions(+)
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 18dcc6dd..4b285646 100644
|
||||
index 4e8d54c4..f98a3574 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -78,6 +78,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@@ -26,15 +26,15 @@ index 18dcc6dd..4b285646 100644
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -164,6 +165,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
@@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -1794,6 +1796,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -59,7 +59,7 @@ index 18dcc6dd..4b285646 100644
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -2219,6 +2239,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
@@ -68,10 +68,10 @@ index 18dcc6dd..4b285646 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 7af587e7..3ea994c7 100644
|
||||
index b5c6f3d7..aa8e0e7b 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -82,6 +82,7 @@ enum llm_arch {
|
||||
@@ -85,6 +85,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_GRANITE_HYBRID,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@@ -79,15 +79,15 @@ index 7af587e7..3ea994c7 100644
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -168,6 +169,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
@@ -181,6 +182,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -394,6 +396,7 @@ enum llm_tensor {
|
||||
@@ -417,6 +419,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
@@ -96,10 +96,10 @@ index 7af587e7..3ea994c7 100644
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
index 7a06368d..35fc054f 100644
|
||||
index c04ac58f..24a515a0 100644
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -146,6 +146,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
@@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
|
||||
}
|
||||
|
||||
@@ -115,10 +115,10 @@ index 7a06368d..35fc054f 100644
|
||||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index bd231224..29bd9056 100644
|
||||
index 0fe4b569..eb13709f 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -62,6 +62,8 @@ struct llama_hparams {
|
||||
@@ -64,6 +64,8 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
@@ -127,7 +127,7 @@ index bd231224..29bd9056 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -220,6 +222,9 @@ struct llama_hparams {
|
||||
@@ -236,6 +238,9 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
@@ -135,10 +135,10 @@ index bd231224..29bd9056 100644
|
||||
+ bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||
+
|
||||
bool is_swa(uint32_t il) const;
|
||||
};
|
||||
|
||||
bool has_kv(uint32_t il) const;
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index f71c40f8..7eab9b68 100644
|
||||
index 8182a9ad..daef900c 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -465,6 +465,7 @@ namespace GGUFMeta {
|
||||
@@ -150,10 +150,10 @@ index f71c40f8..7eab9b68 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 58ca7df7..280129e1 100644
|
||||
index 2470f878..0398b553 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1706,6 +1706,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -175,7 +175,7 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -4793,6 +4808,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@@ -210,7 +210,7 @@ index 58ca7df7..280129e1 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -15495,6 +15538,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||
@@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -229,7 +229,7 @@ index 58ca7df7..280129e1 100644
|
||||
+ struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
+
|
||||
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
+ auto * inp_attn = build_attn_inp_kv_unified();
|
||||
+ auto * inp_attn = build_attn_inp_kv();
|
||||
+
|
||||
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
+
|
||||
@@ -316,7 +316,7 @@ index 58ca7df7..280129e1 100644
|
||||
+
|
||||
+ cur = build_attn(inp_attn,
|
||||
+ model.layers[il].wo, model.layers[il].bo,
|
||||
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
||||
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
+ cb(cur, "attn_out", il);
|
||||
+ }
|
||||
+
|
||||
@@ -376,7 +376,7 @@ index 58ca7df7..280129e1 100644
|
||||
// ref: https://github.com/facebookresearch/chameleon
|
||||
// based on the original build_llama() function, changes:
|
||||
// * qk-norm
|
||||
@@ -18439,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
@@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
@@ -387,7 +387,7 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -18652,6 +18858,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -396,10 +396,10 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 6fcd74d5..09964533 100644
|
||||
index d73ce969..c086f94e 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -70,6 +70,7 @@ enum llm_type {
|
||||
@@ -76,6 +76,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
@@ -407,7 +407,7 @@ index 6fcd74d5..09964533 100644
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
@@ -367,6 +368,8 @@ struct llama_layer {
|
||||
@@ -380,6 +381,8 @@ struct llama_layer {
|
||||
// openai-moe
|
||||
struct ggml_tensor * attn_sinks = nullptr;
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 8ebe11cf..c011008f 100644
|
||||
index 2a38abf4..26fa9fad 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index 637891f5..98b8280f 100644
|
||||
index db1f0b23..f4de7e34 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -307,7 +307,7 @@ private:
|
||||
@@ -308,7 +308,7 @@ private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
||||
@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 6c315137..3040b2aa 100644
|
||||
index 136afec7..f794d9cf 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -162,7 +162,7 @@ struct ggml_backend_reg_entry {
|
||||
@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_entry> backends;
|
||||
@@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
|
||||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -207,7 +207,7 @@ struct ggml_backend_registry {
|
||||
@@ -223,7 +223,7 @@ struct ggml_backend_registry {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
@@ -218,15 +218,20 @@ struct ggml_backend_registry {
|
||||
@@ -234,15 +234,20 @@ struct ggml_backend_registry {
|
||||
#endif
|
||||
backends.push_back({ reg, std::move(handle) });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
@@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||
@@ -270,7 +275,7 @@ struct ggml_backend_registry {
|
||||
@@ -286,7 +291,7 @@ struct ggml_backend_registry {
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||
|
||||
@@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
|
||||
|
||||
return reg;
|
||||
}
|
||||
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
|
||||
@@ -309,7 +314,7 @@ struct ggml_backend_registry {
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
@@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
|
||||
devices.end());
|
||||
|
||||
// remove backend
|
||||
@@ -351,7 +356,7 @@ size_t ggml_backend_dev_count() {
|
||||
@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 177fb282..f5a5079a 100644
|
||||
index c8f3d859..ff6229a0 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -304,6 +304,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
@@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -314,6 +315,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index f5a5079a..5158acd6 100644
|
||||
index ff6229a0..33b3a15f 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -324,10 +324,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
|
||||
@@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
|
||||
// get ith C string from array with given key_id
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
||||
index 53504399..0f71d5f3 100644
|
||||
index 8cc4ef1c..d950dbdf 100644
|
||||
--- a/ggml/src/gguf.cpp
|
||||
+++ b/ggml/src/gguf.cpp
|
||||
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
||||
@@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index c011008f..fa388b03 100644
|
||||
index 26fa9fad..64c78a16 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1760,9 +1760,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||
if (precompiled_charsmap_keyidx != -1) {
|
||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||
@@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
|
||||
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index d89cd8f4..a5689c18 100644
|
||||
index dbc07301..f8574d01 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2858,6 +2860,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
|
||||
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index bfbf5fa2..11f93f42 100644
|
||||
index 2186f827..8fb86009 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
@@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
|
||||
@@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
|
||||
Subject: [PATCH] add argsort and cuda copy for i32
|
||||
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++
|
||||
4 files changed, 192 insertions(+), 2 deletions(-)
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
|
||||
5 files changed, 256 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 854f1c2b..a2924757 100644
|
||||
index 14f7dcf4..f7f8da35 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -8146,6 +8146,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -8157,6 +8196,10 @@ void ggml_compute_forward_argsort(
|
||||
@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
@@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
|
||||
+ }
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
index 410c12b7..b8e9e107 100644
|
||||
index e621cb98..597c0c8b 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
@@ -223,3 +223,9 @@ template<typename src_t, typename dst_t>
|
||||
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
+
|
||||
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
@@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
|
||||
+ *dst = *src;
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||
index f9bb0256..9c3774e5 100644
|
||||
index 746f4396..911220e9 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||
@@ -278,6 +278,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
@@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
@@ -369,6 +410,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
@@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 96df6f0c..44dc31c0 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
+typedef void (i32_argsort_t)(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||
+
|
||||
+template<ggml_sort_order order>
|
||||
+kernel void kernel_argsort_i32_i32(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
|
||||
+ // bitonic sort
|
||||
+ int col = tpitg[0];
|
||||
+ int row = tgpig[1];
|
||||
+
|
||||
+ if (col >= args.ncols_pad) return;
|
||||
+
|
||||
+ device const int32_t * x_row = x + row * args.ncols;
|
||||
+ threadgroup int32_t * dst_row = shared_values;
|
||||
+
|
||||
+ // initialize indices
|
||||
+ dst_row[col] = col;
|
||||
+
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+
|
||||
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
|
||||
+ for (int j = k / 2; j > 0; j /= 2) {
|
||||
+ int ixj = col ^ j;
|
||||
+ if (ixj > col) {
|
||||
+ if ((col & k) == 0) {
|
||||
+ if (dst_row[col] >= args.ncols ||
|
||||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (dst_row[ixj] >= args.ncols ||
|
||||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // copy the result to dst without the padding
|
||||
+ if (col < args.ncols) {
|
||||
+ dst[row * args.ncols + col] = dst_row[col];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
|
||||
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
|
||||
|
||||
kernel void kernel_leaky_relu_f32(
|
||||
constant ggml_metal_kargs_leaky_relu & args,
|
||||
|
||||
@@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
|
||||
---
|
||||
ggml/include/ggml-alloc.h | 1 +
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++----
|
||||
ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-backend.cpp | 7 +++++++
|
||||
4 files changed, 41 insertions(+), 4 deletions(-)
|
||||
4 files changed, 40 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
||||
index 2cb150fd2..7ab3f0192 100644
|
||||
index 2cb150fd..7ab3f019 100644
|
||||
--- a/ggml/include/ggml-alloc.h
|
||||
+++ b/ggml/include/ggml-alloc.h
|
||||
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
||||
@@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index a2977ea2e..e8cf30841 100644
|
||||
index 62b6d65e..fe20dca3 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -303,6 +303,7 @@ extern "C" {
|
||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
||||
@@ -316,6 +316,7 @@ extern "C" {
|
||||
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index 8b6e60283..b58bd671d 100644
|
||||
index fa46f3b4..421ff7c7 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -350,6 +350,7 @@ struct node_alloc {
|
||||
@@ -492,6 +492,7 @@ struct node_alloc {
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||
struct vbuffer ** buffers; // [n_buffers]
|
||||
+ size_t *buffer_sizes; // [n_buffers]
|
||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -373,6 +374,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||
@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
||||
@@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
@@ -439,6 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
ggml_hash_set_free(&galloc->hash_set);
|
||||
free(galloc->hash_values);
|
||||
free(galloc->bufts);
|
||||
@@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
|
||||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -734,6 +739,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
|
||||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -755,15 +762,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
+ if (galloc->buffers[i]) {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
+ } else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
- return false;
|
||||
+ galloc->buffer_sizes[i] = new_size;
|
||||
+ success = false;
|
||||
}
|
||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ } else {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||
@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
@@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
|
||||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 97f47abd2..d02a40e60 100644
|
||||
index 8ba86f82..cb2b9956 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
@@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
|
||||
+}
|
||||
+
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
GGML_ASSERT(sched);
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
@@ -6,28 +6,28 @@ Subject: [PATCH] ggml: Export GPU UUIDs
|
||||
This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
|
||||
3 files changed, 63 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 8a91b381..9424394e 100644
|
||||
index fe20dca3..48777212 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -152,6 +152,7 @@ extern "C" {
|
||||
struct ggml_backend_dev_props {
|
||||
const char * name;
|
||||
@@ -158,6 +158,7 @@ extern "C" {
|
||||
const char * description;
|
||||
+ const char * id;
|
||||
// device free memory in bytes
|
||||
size_t memory_free;
|
||||
+ const char * id;
|
||||
// device total memory in bytes
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
// device type
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 37ee2a6d..57eae461 100644
|
||||
index fdf8c63d..ad389ece 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -179,6 +179,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
|
||||
+}
|
||||
+
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -267,22 +312,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
@@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
|
||||
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
@@ -3144,6 +3191,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
+ std::string id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3156,6 +3204,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
@@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -3170,6 +3223,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3767,6 +3821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 7bccc7bf..fe7b2f0a 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -6522,6 +6522,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index 909e17de..08ab4fc9 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
+ props->id = "0";
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = (struct ggml_backend_dev_caps) {
|
||||
|
||||
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
||||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||
index a05373d5..6f70f7f4 100644
|
||||
index cd022c5e..3d680945 100644
|
||||
--- a/tools/mtmd/mtmd.cpp
|
||||
+++ b/tools/mtmd/mtmd.cpp
|
||||
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index a5689c18..85af19a3 100644
|
||||
index f8574d01..530efce0 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -2412,7 +2412,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
@@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||
// all our threads onto the first 4 cores which results in terrible performance with
|
||||
// n_threads > 4
|
||||
|
||||
@@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
|
||||
|
||||
Only enable BF16 on supported MacOS versions (v14+)
|
||||
---
|
||||
ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
|
||||
1 file changed, 5 insertions(+), 1 deletion(-)
|
||||
ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
|
||||
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index fe7b2f0a..e4c31268 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -106,7 +106,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
index 052efb7a..b47dc787 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||
|
||||
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
- res->use_bfloat = props_dev->has_bfloat;
|
||||
+ if (@available(macOS 14.0, *)) {
|
||||
+ res->use_bfloat = props_dev->has_bfloat;
|
||||
+ } else {
|
||||
+ res->use_bfloat = false;
|
||||
+ }
|
||||
+
|
||||
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
- ctx->use_bfloat = ctx->has_bfloat;
|
||||
+ if (@available(macOS 14.0, *)) {
|
||||
+ ctx->use_bfloat = ctx->has_bfloat;
|
||||
+ } else {
|
||||
+ ctx->use_bfloat = false;
|
||||
+ }
|
||||
#else
|
||||
ctx->use_bfloat = false;
|
||||
#endif
|
||||
|
||||
@@ -13,10 +13,10 @@ checks.
|
||||
1 file changed, 18 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 57eae461..c7f9dc3a 100644
|
||||
index ad389ece..e51c5035 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
@@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
||||
|
||||
@@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
|
||||
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
||||
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
||||
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
||||
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
|
||||
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
|
||||
|
||||
+
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2700,6 +2712,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
@@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
|
||||
if (node->op == GGML_OP_ADD &&
|
||||
node->src[1] && node->src[1]->ne[1] > 1 &&
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)

diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index aeac2e57..40738d5b 100644
index 5b888cdd..2a9ff7f6 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};

ggml_backend_reg_t ggml_backend_blas_reg(void) {

@@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644
index 48777212..d4352663 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" {
@@ -303,6 +303,7 @@ extern "C" {

// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
@@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644

// Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644
index 07784d6f..869dc07d 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
};

GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
@@ -117,6 +123,16 @@ extern "C" {

// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // (optional) reserves intermediate buffers needed for the compution
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
@@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644

struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644
index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf;
+ }
+
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}

@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
@@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
};

return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}

@@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
void * base = buffer->iface.get_base(buffer);

GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched {
@@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload;

int debug;
@@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
};

#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644

ggml_backend_sched_reset(sched);

@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}

@@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched);

return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

@@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644

void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644
index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
@@ -880,6 +905,9 @@ struct ggml_cuda_pool {

virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
@@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
};

template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
@@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

@@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
}
return *pools[device];
}
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
@@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
@@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644
index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
};

ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

@@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

@@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644

ggml_cuda_set_device(cuda_ctx->device);

@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}

@@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
@@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset,

@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;


@@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644
index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644
index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" {
@@ -195,6 +195,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
@@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644

struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644
index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params);
}

@@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+}
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644
index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
return id;
}

@@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
@@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644

bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}

@@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
};

// backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);

@@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644
index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@
@@ -41,6 +41,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t

@@ -6,25 +6,25 @@ Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available.
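For orientation only, a minimal sketch in C of how a caller could read the extra discovery fields once this patch is applied; the enumeration loop and the particular fields printed are illustrative assumptions, not part of the patch:

#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_load_all();                               // load any dynamically built backends
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        // memory_free/memory_total are intentionally left at 0 by the patched
        // CUDA backend; query them explicitly when the numbers are needed
        size_t mem_free = 0, mem_total = 0;
        ggml_backend_dev_memory(dev, &mem_free, &mem_total);
        printf("%s [%s] driver %d.%d pci %04x:%02x:%02x integrated=%d free=%zu total=%zu\n",
               props.name, props.library, props.driver_major, props.driver_minor,
               props.pci_domain_id, props.pci_bus_id, props.pci_device_id,
               props.integrated, mem_free, mem_total);
    }
    return 0;
}

The memory query stays a separate call because, as the ggml-cuda hunk further below notes, get_props no longer touches device memory.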
---
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++
8 files changed, 717 insertions(+), 1 deletion(-)
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
ggml/src/ggml-cuda/vendors/hip.h | 4 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 +++++++++++
8 files changed, 718 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
@@ -169,6 +169,15 @@ extern "C" {
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;
@@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644

GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
@@ -206,6 +206,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
@@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644

target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;

@@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
+
#endif // defined(GGML_USE_HIP)
}

@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
@@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ int pciBusID;
+ int pciDeviceID;
+ int pciDomainID;
};

static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
@@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
CUDA_CHECK(cudaMemGetInfo(free, total));
}

@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}

+#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;

@@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;

+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;
@@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644

for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
@@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;

+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
+ dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
@@ -5,6 +5,9 @@
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+

#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
@@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true;
@@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
}

+// Management libraries for fetching more accurate free VRAM data
@@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c31268f..ec6b385ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}

+#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0";
@@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
props->type = ggml_backend_metal_device_get_type(dev);

ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) {
/* .async = */ false,
props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
index 00000000..8ef19b8c
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
@@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
index 00000000..aa05e9dc
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@

@@ -1,57 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes

The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.

Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.

This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".

Fixes #10388
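To make the overflow concrete before the hunk below, a tiny standalone sketch in C (the tensor size is a made-up example; any element count above INT_MAX triggers the same wrap-around):

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical oversized activation tensor: 2560 * 1024 * 1024 floats
    int64_t nelements = 2560LL * 1024 * 1024;   // 2,684,354,560 elements
    int k = (int) nelements;                    // the old kernel parameter type
    printf("nelements = %" PRId64 "\n", nelements);
    printf("as 32-bit int = %d (INT_MAX = %d)\n", k, INT_MAX);
    // on typical two's-complement targets k comes out negative, so both the old
    // bounds check (i >= k) and the block-count arithmetic misbehave
    return 0;
}

The patched kernel avoids this by taking the element count as int64_t and using a grid-stride loop capped at MAX_GRIDDIM_X blocks, as the scale.cu hunk below shows.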
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
#include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF

- if (i >= k) {
- return;
- }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+ int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+ int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;

- dst[i] = scale * x[i] + bias;
+ for (int64_t i = tid; i < nelements; i += stride) {
+ dst[i] = scale * x[i] + bias;
+ }
}

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+ const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {