mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-26 00:18:02 +00:00
ggml update to b7108 (#12992)
* Revert "vulkan: temporary cary of vulkan fixes (#12971)"
This reverts commit 3a9e8e9fd4.
* ggml update to b7087
* fix argsort on metal
* update to b7108
* fix bakllava regression
This model lacks the metadata for the projector type.
* update to b7209
* fix TopK perf
* only build arm code on arm
This commit is contained in:
@@ -23,7 +23,7 @@ problem.
|
||||
8 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index ff9135fe2..8ba86f824 100644
|
||||
index 4cf377e7f..4882541c8 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
@@ -42,7 +42,7 @@ index ff9135fe2..8ba86f824 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -2079,6 +2079,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_ASSERT(buffer);
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
@@ -54,7 +54,7 @@ index ff9135fe2..8ba86f824 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
@@ -2131,7 +2136,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
@@ -64,10 +64,10 @@ index ff9135fe2..8ba86f824 100644
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
||||
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
index 8bd5449f1..01e2df61a 100644
|
||||
index df28d67fb..1f6a56ba2 100644
|
||||
--- a/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
|
||||
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
||||
delete ctx;
|
||||
@@ -75,7 +75,7 @@ index 8bd5449f1..01e2df61a 100644
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
|
||||
@@ -1570,6 +1571,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
|
||||
*/
|
||||
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||
@@ -84,10 +84,10 @@ index 8bd5449f1..01e2df61a 100644
|
||||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index bc396b521..aefc6935e 100644
|
||||
index fa7e1e13a..8f3b1c173 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@@ -579,6 +579,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -95,7 +95,7 @@ index bc396b521..aefc6935e 100644
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
@@ -834,6 +835,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -103,7 +103,7 @@ index bc396b521..aefc6935e 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
||||
@@ -1115,6 +1117,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
@@ -112,7 +112,7 @@ index bc396b521..aefc6935e 100644
|
||||
|
||||
static void * ggml_cuda_host_malloc(size_t size) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index 7afc881fa..bf0962274 100644
|
||||
index 70bf6f3d9..f2b7fe692 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
|
||||
@@ -132,10 +132,10 @@ index 7afc881fa..bf0962274 100644
|
||||
|
||||
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
index db33a4ab6..c42ee26e1 100644
|
||||
index e5302f455..43fa83e8f 100644
|
||||
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
|
||||
@@ -3412,6 +3412,7 @@ struct ggml_backend_opencl_buffer_context {
|
||||
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
delete ctx;
|
||||
@@ -144,10 +144,10 @@ index db33a4ab6..c42ee26e1 100644
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index a38df5a97..fd07e4a21 100644
|
||||
index 48fd99a76..da2aab3df 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -555,6 +555,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
delete ctx;
|
||||
@@ -156,10 +156,10 @@ index a38df5a97..fd07e4a21 100644
|
||||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index b695ba051..37e853120 100644
|
||||
index 3f1bdfb9f..a95c2f305 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
@@ -167,7 +167,7 @@ index b695ba051..37e853120 100644
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -175,7 +175,7 @@ index b695ba051..37e853120 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
@@ -184,10 +184,10 @@ index b695ba051..37e853120 100644
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index b783f7805..216dc167c 100644
|
||||
index 66dd0bfab..83cdec29e 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -12368,6 +12368,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
@@ -195,7 +195,7 @@ index b783f7805..216dc167c 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -12511,6 +12512,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
||||
@@ -10,10 +10,10 @@ logs instead of throwing an error
|
||||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 639fecbd3..a7ce6f8e1 100644
|
||||
index a73c4c448..b9f0631f4 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
add_space_prefix = false;
|
||||
clean_spaces = true;
|
||||
@@ -31,8 +31,8 @@ index 639fecbd3..a7ce6f8e1 100644
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -1993,7 +1984,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
|
||||
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
|
||||
@@ -10,11 +10,11 @@ filesystems for paths that include wide characters
|
||||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index f2abf8852..c984e6282 100644
|
||||
index 05777d2d9..f4c4d2c48 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -28,6 +28,19 @@
|
||||
#include <numeric>
|
||||
@@ -24,6 +24,19 @@
|
||||
#include <array>
|
||||
#include <functional>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
@@ -30,10 +30,10 @@ index f2abf8852..c984e6282 100644
|
||||
+#endif
|
||||
+#endif
|
||||
+
|
||||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
||||
enum ffn_op_type {
|
||||
@@ -2774,7 +2787,29 @@ struct clip_model_loader {
|
||||
@@ -3255,7 +3268,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
@@ -63,7 +63,7 @@ index f2abf8852..c984e6282 100644
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -2801,7 +2836,11 @@ struct clip_model_loader {
|
||||
@@ -3282,7 +3317,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,20 +5,36 @@ Subject: [PATCH] solar-pro
|
||||
|
||||
adds support for the Solar Pro architecture
|
||||
---
|
||||
src/llama-arch.cpp | 21 ++++
|
||||
src/CMakeLists.txt | 1 +
|
||||
src/llama-arch.cpp | 21 +++++
|
||||
src/llama-arch.h | 3 +
|
||||
src/llama-hparams.cpp | 8 ++
|
||||
src/llama-hparams.h | 5 +
|
||||
src/llama-hparams.h | 5 ++
|
||||
src/llama-model-loader.cpp | 2 +-
|
||||
src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
|
||||
src/llama-model.cpp | 48 +++++++++++
|
||||
src/llama-model.h | 3 +
|
||||
7 files changed, 248 insertions(+), 1 deletion(-)
|
||||
src/models/models.h | 5 ++
|
||||
src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++
|
||||
10 files changed, 253 insertions(+), 1 deletion(-)
|
||||
create mode 100644 src/models/solar.cpp
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 67c7807e0..fda881640 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -125,6 +125,7 @@ add_library(llama
|
||||
models/seed-oss.cpp
|
||||
models/smallthinker.cpp
|
||||
models/smollm3.cpp
|
||||
+ models/solar.cpp
|
||||
models/stablelm.cpp
|
||||
models/starcoder.cpp
|
||||
models/starcoder2.cpp
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 8ca769c5f..ab262ec0c 100644
|
||||
index 8571a2e02..b6bde25d5 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@@ -26,7 +42,7 @@ index 8ca769c5f..ab262ec0c 100644
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
@@ -34,7 +50,7 @@ index 8ca769c5f..ab262ec0c 100644
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -2023,6 +2025,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -59,7 +75,7 @@ index 8ca769c5f..ab262ec0c 100644
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -2681,6 +2701,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
@@ -68,10 +84,10 @@ index 8ca769c5f..ab262ec0c 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index dea725c1a..ea2b4ffb9 100644
|
||||
index 150646478..3936a4687 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -86,6 +86,7 @@ enum llm_arch {
|
||||
@@ -89,6 +89,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_GRANITE_HYBRID,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@@ -79,7 +95,7 @@ index dea725c1a..ea2b4ffb9 100644
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -187,6 +188,7 @@ enum llm_kv {
|
||||
@@ -208,6 +209,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
@@ -87,7 +103,7 @@ index dea725c1a..ea2b4ffb9 100644
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -436,6 +438,7 @@ enum llm_tensor {
|
||||
@@ -459,6 +461,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
@@ -96,11 +112,11 @@ index dea725c1a..ea2b4ffb9 100644
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
index db65d69ea..b6bf6bbf2 100644
|
||||
index 8cdbaf69f..41127bf91 100644
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
|
||||
@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
|
||||
}
|
||||
|
||||
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
|
||||
@@ -115,7 +131,7 @@ index db65d69ea..b6bf6bbf2 100644
|
||||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index 6fcf91b7d..24569a258 100644
|
||||
index c3a53be79..2ffe7dd30 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -64,6 +64,8 @@ struct llama_hparams {
|
||||
@@ -127,7 +143,7 @@ index 6fcf91b7d..24569a258 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -250,6 +252,9 @@ struct llama_hparams {
|
||||
@@ -256,6 +258,9 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
@@ -151,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 2a83d6627..54621ea39 100644
|
||||
index c2a545531..4468de2f9 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1961,6 +1961,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -176,7 +192,7 @@ index 2a83d6627..54621ea39 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -5350,6 +5365,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@@ -211,12 +227,71 @@ index 2a83d6627..54621ea39 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||
}
|
||||
@@ -7425,6 +7468,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
+ {
|
||||
+ llm = std::make_unique<llm_build_solar>(*this, params);
|
||||
+ } break;
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -7684,6 +7731,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index f8342cf2c..cbf4e1bfa 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -76,6 +76,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
+ LLM_TYPE_22B,
|
||||
LLM_TYPE_26B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
@@ -404,6 +405,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_act_beta = nullptr;
|
||||
struct ggml_tensor * ffn_act_eps = nullptr;
|
||||
|
||||
+ struct ggml_tensor * bskcn_tv = nullptr;
|
||||
+
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
diff --git a/src/models/models.h b/src/models/models.h
|
||||
index 7ba225b47..71fea796d 100644
|
||||
--- a/src/models/models.h
|
||||
+++ b/src/models/models.h
|
||||
@@ -510,6 +510,11 @@ struct llm_build_smollm3 : public llm_graph_context {
|
||||
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
+struct llm_build_solar : public llm_graph_context {
|
||||
+ llm_build_solar(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
+ llm_build_solar(const llama_model & model, const llm_graph_params & params);
|
||||
+};
|
||||
+
|
||||
+
|
||||
struct llm_build_stablelm : public llm_graph_context {
|
||||
llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
diff --git a/src/models/solar.cpp b/src/models/solar.cpp
|
||||
new file mode 100644
|
||||
index 000000000..97383928c
|
||||
--- /dev/null
|
||||
+++ b/src/models/solar.cpp
|
||||
@@ -0,0 +1,158 @@
|
||||
+#include "models.h"
|
||||
+
|
||||
+llm_build_solar::llm_build_solar(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
+ const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
@@ -285,7 +360,7 @@ index 2a83d6627..54621ea39 100644
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+ if (model.layers[il].bk) {
|
||||
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
@@ -371,49 +446,4 @@ index 2a83d6627..54621ea39 100644
|
||||
+ res->t_logits = cur;
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
// ref: https://github.com/facebookresearch/chameleon
|
||||
// based on the original build_llama() function, changes:
|
||||
// * qk-norm
|
||||
@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
+ {
|
||||
+ llm = std::make_unique<llm_build_solar>(*this, params);
|
||||
+ } break;
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 248f85410..4a7924aaa 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -76,6 +76,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
+ LLM_TYPE_22B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
@@ -390,6 +391,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_act_beta = nullptr;
|
||||
struct ggml_tensor * ffn_act_eps = nullptr;
|
||||
|
||||
+ struct ggml_tensor * bskcn_tv = nullptr;
|
||||
+
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
+}
|
||||
|
||||
@@ -12,7 +12,7 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index a7ce6f8e1..8064dc197 100644
|
||||
index b9f0631f4..1525283d7 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
@@ -25,7 +25,7 @@ index a7ce6f8e1..8064dc197 100644
|
||||
"\\s+$",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
diff --git a/src/unicode.cpp b/src/unicode.cpp
|
||||
index 65f366517..ce336a228 100644
|
||||
index 77ba4fc46..040518e1e 100644
|
||||
--- a/src/unicode.cpp
|
||||
+++ b/src/unicode.cpp
|
||||
@@ -2,6 +2,11 @@
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index dd9b51a9e..d88f43209 100644
|
||||
index c8421e1e8..cb659915d 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -308,7 +308,7 @@ private:
|
||||
@@ -310,7 +310,7 @@ private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index ba281b8e6..ead235878 100644
|
||||
index d93664b8b..800f98b65 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -314,6 +314,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -349,6 +349,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
@@ -19,7 +19,7 @@ index ba281b8e6..ead235878 100644
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -324,6 +325,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -359,6 +360,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index ead235878..f9a6587f1 100644
|
||||
index 800f98b65..6d493a4ff 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -334,10 +334,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -369,10 +369,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
|
||||
@@ -53,10 +53,10 @@ index 8cc4ef1cf..d950dbdf5 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 8064dc197..31f49801c 100644
|
||||
index 1525283d7..ea450c361 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||
if (precompiled_charsmap_keyidx != -1) {
|
||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 9ec485cfa..4b2f8b7bd 100644
|
||||
index 3247af8bb..5be08d6f4 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@@ -20,7 +20,7 @@ index 9ec485cfa..4b2f8b7bd 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2891,6 +2893,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2922,6 +2924,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(&params, node);
|
||||
|
||||
|
||||
@@ -6,14 +6,14 @@ Subject: [PATCH] add ollama vocab for grammar support
|
||||
---
|
||||
src/llama-grammar.cpp | 49 ++++++++++++++++++++++++++++++++++++------
|
||||
src/llama-grammar.h | 14 ++++++++++++
|
||||
src/llama-sampling.cpp | 4 ++--
|
||||
3 files changed, 58 insertions(+), 9 deletions(-)
|
||||
src/llama-sampling.cpp | 6 +++---
|
||||
3 files changed, 59 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
|
||||
index bed706bb2..b51cee090 100644
|
||||
index b3c5eb571..a7307c47f 100644
|
||||
--- a/src/llama-grammar.cpp
|
||||
+++ b/src/llama-grammar.cpp
|
||||
@@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
@@ -915,6 +915,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
@@ -21,7 +21,7 @@ index bed706bb2..b51cee090 100644
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index) {
|
||||
@@ -962,6 +963,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
@@ -970,6 +971,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
@@ -29,7 +29,7 @@ index bed706bb2..b51cee090 100644
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -975,6 +977,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
@@ -983,6 +985,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
@@ -37,7 +37,7 @@ index bed706bb2..b51cee090 100644
|
||||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
@@ -1067,6 +1070,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
@@ -1075,6 +1078,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
@@ -45,7 +45,7 @@ index bed706bb2..b51cee090 100644
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -1089,6 +1093,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
@@ -1097,6 +1101,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
|
||||
auto * result = new llama_grammar {
|
||||
grammar.vocab,
|
||||
@@ -53,7 +53,7 @@ index bed706bb2..b51cee090 100644
|
||||
grammar.rules,
|
||||
grammar.stacks,
|
||||
grammar.partial_utf8,
|
||||
@@ -1116,7 +1121,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
@@ -1124,7 +1129,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
}
|
||||
|
||||
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
|
||||
@@ -61,7 +61,7 @@ index bed706bb2..b51cee090 100644
|
||||
|
||||
if (grammar.awaiting_trigger) {
|
||||
return;
|
||||
@@ -1138,9 +1142,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
@@ -1146,9 +1150,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const llama_token id = cur_p->data[i].id;
|
||||
@@ -77,7 +77,7 @@ index bed706bb2..b51cee090 100644
|
||||
if (!allow_eog) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
@@ -1159,9 +1167,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
@@ -1167,9 +1175,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
}
|
||||
|
||||
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
|
||||
@@ -90,7 +90,7 @@ index bed706bb2..b51cee090 100644
|
||||
|
||||
if (grammar.awaiting_trigger) {
|
||||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
@@ -1201,13 +1210,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
@@ -1209,13 +1218,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,7 +107,7 @@ index bed706bb2..b51cee090 100644
|
||||
}
|
||||
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
@@ -1227,3 +1237,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
||||
@@ -1235,3 +1245,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
|
||||
}
|
||||
}
|
||||
@@ -184,10 +184,10 @@ index f8c291de9..2a3a62db3 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index 55d2e355f..da34526b1 100644
|
||||
index 3f4a729bc..38a30ea05 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
@@ -196,12 +196,15 @@ index 55d2e355f..da34526b1 100644
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
trigger_pattern += ")[\\s\\S]*";
|
||||
|
||||
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
|
||||
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
|
||||
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
|
||||
} else {
|
||||
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
|
||||
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
|
||||
}
|
||||
*ctx = {
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
- /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
|
||||
+ /* .grammar = */ llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
|
||||
};
|
||||
if (!ctx->grammar) {
|
||||
delete ctx;
|
||||
|
||||
@@ -8,14 +8,14 @@ Subject: [PATCH] add argsort and cuda copy for i32
|
||||
ggml/src/ggml-cuda/argsort.cu | 122 ++++++++++++++++++++++++---
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 40 +++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++
|
||||
5 files changed, 263 insertions(+), 12 deletions(-)
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 69 +++++++++++++++
|
||||
5 files changed, 268 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index b52f0f847..902fdad69 100644
|
||||
index 2745fc54e..40666bab6 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
@@ -7846,6 +7846,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,7 +61,7 @@ index b52f0f847..902fdad69 100644
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -7900,6 +7939,10 @@ void ggml_compute_forward_argsort(
|
||||
@@ -7857,6 +7896,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
@@ -73,7 +73,7 @@ index b52f0f847..902fdad69 100644
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
|
||||
index 6e7b90d42..08dd30525 100644
|
||||
index da9652c3b..b82be371c 100644
|
||||
--- a/ggml/src/ggml-cuda/argsort.cu
|
||||
+++ b/ggml/src/ggml-cuda/argsort.cu
|
||||
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
|
||||
@@ -220,11 +220,11 @@ index 6e7b90d42..08dd30525 100644
|
||||
+ }
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
index e621cb981..597c0c8b3 100644
|
||||
index 7697c292d..00d773dd3 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
+
|
||||
@@ -234,10 +234,10 @@ index e621cb981..597c0c8b3 100644
|
||||
+ *dst = *src;
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||
index 12d5bf776..a0e34030e 100644
|
||||
index c4ceb4fc5..0e53ecc39 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||
@@ -251,6 +251,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
@@ -281,73 +281,76 @@ index 12d5bf776..a0e34030e 100644
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
@@ -332,6 +369,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_scalar_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
+ // TODO consider converting to template
|
||||
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 2c2f01415..50b8071de 100644
|
||||
index 73b45c762..aed013a9d 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -4467,8 +4467,72 @@ kernel void kernel_argsort_f32_i32(
|
||||
@@ -4721,8 +4721,77 @@ kernel void kernel_argsort_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
+typedef void (i32_argsort_t)(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device const int32_t * src0,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
+ ushort3 ntg[[threads_per_threadgroup]]);
|
||||
+
|
||||
+template<ggml_sort_order order>
|
||||
+kernel void kernel_argsort_i32_i32(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device const int32_t * src0,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
|
||||
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
+ ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
+ // bitonic sort
|
||||
+ int col = tpitg[0];
|
||||
+ int row = tgpig[1];
|
||||
+ const int col = tpitg[0];
|
||||
+
|
||||
+ if (col >= args.ncols_pad) return;
|
||||
+ const int i00 = (tgpig[0]/args.ne01)*ntg.x;
|
||||
+ const int i01 = tgpig[0]%args.ne01;
|
||||
+ const int i02 = tgpig[1];
|
||||
+ const int i03 = tgpig[2];
|
||||
+
|
||||
+ device const int32_t * x_row = x + row * args.ncols;
|
||||
+ threadgroup int32_t * dst_row = shared_values;
|
||||
+ device const int32_t * src0_row = (device const int32_t *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03);
|
||||
+
|
||||
+ // initialize indices
|
||||
+ dst_row[col] = col;
|
||||
+ shmem_i32[col] = i00 + col;
|
||||
+
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+
|
||||
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
|
||||
+ for (int k = 2; k <= ntg.x; k *= 2) {
|
||||
+ for (int j = k / 2; j > 0; j /= 2) {
|
||||
+ int ixj = col ^ j;
|
||||
+ if (ixj > col) {
|
||||
+ if ((col & k) == 0) {
|
||||
+ if (dst_row[col] >= args.ncols ||
|
||||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||
+ if (shmem_i32[col] >= args.ne00 ||
|
||||
+ (shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] :
|
||||
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (dst_row[ixj] >= args.ncols ||
|
||||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||
+ if (shmem_i32[ixj] >= args.ne00 ||
|
||||
+ (shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] :
|
||||
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
@@ -356,8 +359,10 @@ index 2c2f01415..50b8071de 100644
|
||||
+ }
|
||||
+
|
||||
+ // copy the result to dst without the padding
|
||||
+ if (col < args.ncols) {
|
||||
+ dst[row * args.ncols + col] = dst_row[col];
|
||||
+ if (i00 + col < args.ne00) {
|
||||
+ dst += i00 + args.ne00*i01 + args.ne00*args.ne01*i02 + args.ne00*args.ne01*args.ne02*i03;
|
||||
+
|
||||
+ dst[col] = shmem_i32[col];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
@@ -366,5 +371,5 @@ index 2c2f01415..50b8071de 100644
|
||||
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
|
||||
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
|
||||
|
||||
kernel void kernel_leaky_relu_f32(
|
||||
constant ggml_metal_kargs_leaky_relu & args,
|
||||
typedef void (argsort_merge_t)(
|
||||
constant ggml_metal_kargs_argsort_merge & args,
|
||||
|
||||
@@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index c830c0965..363853873 100644
|
||||
index 218222ece..06ee502ab 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -486,6 +486,7 @@ struct node_alloc {
|
||||
@@ -493,6 +493,7 @@ struct node_alloc {
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
struct vbuffer ** buffers; // [n_buffers]
|
||||
@@ -46,7 +46,7 @@ index c830c0965..363853873 100644
|
||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -509,6 +510,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
@@ -516,6 +517,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
@@ -56,7 +56,7 @@ index c830c0965..363853873 100644
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
@@ -576,6 +580,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
@@ -583,6 +587,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
ggml_hash_set_free(&galloc->hash_set);
|
||||
free(galloc->hash_values);
|
||||
free(galloc->bufts);
|
||||
@@ -64,7 +64,7 @@ index c830c0965..363853873 100644
|
||||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -891,6 +896,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -898,6 +903,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,8 +73,8 @@ index c830c0965..363853873 100644
|
||||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -920,14 +927,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
|
||||
@@ -932,14 +939,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
#endif
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
@@ -96,7 +96,7 @@ index c830c0965..363853873 100644
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -1082,6 +1094,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
@@ -1094,6 +1106,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
@@ -120,10 +120,10 @@ index c830c0965..363853873 100644
|
||||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 8ba86f824..cb2b99562 100644
|
||||
index 4882541c8..ff41c7712 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
@@ -1813,6 +1813,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Thu, 24 Apr 2025 14:48:51 -0700
|
||||
From: Daniel Hiltgen <daniel@ollama.com>
|
||||
Date: Sun, 30 Nov 2025 11:05:56 -0800
|
||||
Subject: [PATCH] ggml: Export GPU UUIDs
|
||||
|
||||
This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
|
||||
@@ -24,10 +22,10 @@ index c54ff98bf..229bf387b 100644
|
||||
size_t memory_total;
|
||||
// device type
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index aefc6935e..cc201afff 100644
|
||||
index 8f3b1c173..e803f4af6 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
@@ -185,6 +185,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -79,7 +77,7 @@ index aefc6935e..cc201afff 100644
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -251,22 +296,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
@@ -110,7 +108,7 @@ index aefc6935e..cc201afff 100644
|
||||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -3268,6 +3315,7 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -4048,6 +4095,7 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
@@ -118,9 +116,9 @@ index aefc6935e..cc201afff 100644
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3280,6 +3328,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
@@ -4136,6 +4184,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
|
||||
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
@@ -130,7 +128,7 @@ index aefc6935e..cc201afff 100644
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -3296,6 +3349,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4176,6 +4229,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
@@ -138,7 +136,7 @@ index aefc6935e..cc201afff 100644
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
@@ -3869,6 +3923,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4767,6 +4821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
@@ -147,10 +145,10 @@ index aefc6935e..cc201afff 100644
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index bf0962274..f2ff9f322 100644
|
||||
index f2b7fe692..8fc1c2fb5 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
|
||||
@@ -10,10 +10,10 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
||||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||
index 4d487581a..35a0d25ed 100644
|
||||
index dfad9cd79..9858de630 100644
|
||||
--- a/tools/mtmd/mtmd.cpp
|
||||
+++ b/tools/mtmd/mtmd.cpp
|
||||
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
|
||||
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
|
||||
MTMD_SLICE_TMPL_IDEFICS3,
|
||||
};
|
||||
|
||||
@@ -31,7 +31,7 @@ index 4d487581a..35a0d25ed 100644
|
||||
return "<__media__>";
|
||||
}
|
||||
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
|
||||
index f4ea07d3a..cf287224b 100644
|
||||
index 015119be8..8d3fa5d34 100644
|
||||
--- a/tools/mtmd/mtmd.h
|
||||
+++ b/tools/mtmd/mtmd.h
|
||||
@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 4b2f8b7bd..046646282 100644
|
||||
index 5be08d6f4..7a0df30c3 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -2441,7 +2441,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
@@ -2463,7 +2463,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||
// all our threads onto the first 4 cores which results in terrible performance with
|
||||
// n_threads > 4
|
||||
|
||||
@@ -58,7 +58,7 @@ index 6792ba986..0f5b03cef 100644
|
||||
// (optional) event synchronization
|
||||
// record an event on this stream
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index cb2b99562..41eef3b5f 100644
|
||||
index ff41c7712..f511e8d76 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
@@ -97,7 +97,7 @@ index cb2b99562..41eef3b5f 100644
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1556,7 +1558,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
@@ -106,7 +106,7 @@ index cb2b99562..41eef3b5f 100644
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1578,7 +1580,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||
|
||||
@@ -115,7 +115,7 @@ index cb2b99562..41eef3b5f 100644
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1657,6 +1659,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
@@ -123,7 +123,7 @@ index cb2b99562..41eef3b5f 100644
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1688,6 +1691,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
free(sched);
|
||||
}
|
||||
|
||||
@@ -178,10 +178,10 @@ index 3191faaa4..32f14c811 100644
|
||||
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cc201afff..02d413467 100644
|
||||
index e803f4af6..78fb2d8b3 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2693,7 +2693,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
@@ -2885,7 +2885,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
@@ -190,7 +190,7 @@ index cc201afff..02d413467 100644
|
||||
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
|
||||
@@ -2726,24 +2726,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
@@ -2918,24 +2918,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -241,7 +241,7 @@ index cc201afff..02d413467 100644
|
||||
}
|
||||
|
||||
if (!use_cuda_graph) {
|
||||
@@ -3128,7 +3138,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3679,7 +3689,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,7 +250,7 @@ index cc201afff..02d413467 100644
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
@@ -3166,7 +3176,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
@@ -3717,7 +3727,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
if (use_cuda_graph) {
|
||||
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
|
||||
@@ -260,10 +260,10 @@ index cc201afff..02d413467 100644
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index f2ff9f322..05ff6a5a6 100644
|
||||
index 8fc1c2fb5..ba95b4acc 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
|
||||
@@ -419,10 +419,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
@@ -278,10 +278,10 @@ index f2ff9f322..05ff6a5a6 100644
|
||||
|
||||
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 216dc167c..3a6bbe564 100644
|
||||
index 83cdec29e..a36c6560c 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -12357,7 +12357,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
@@ -13103,7 +13103,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
return num_adds;
|
||||
}
|
||||
|
||||
@@ -290,7 +290,7 @@ index 216dc167c..3a6bbe564 100644
|
||||
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
@@ -12561,6 +12561,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
@@ -13320,6 +13320,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
UNUSED(backend);
|
||||
|
||||
@@ -12,8 +12,8 @@ must be recreated with no-alloc set to false before loading data.
|
||||
ggml/src/ggml-backend-impl.h | 16 +++
|
||||
ggml/src/ggml-backend.cpp | 72 ++++++++++-
|
||||
ggml/src/ggml-cuda/common.cuh | 58 ++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
|
||||
5 files changed, 320 insertions(+), 44 deletions(-)
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
|
||||
5 files changed, 321 insertions(+), 44 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 2763f2bd6..b3b5b356a 100644
|
||||
@@ -75,7 +75,7 @@ index 0f5b03cef..7bdf9d81f 100644
|
||||
|
||||
struct ggml_backend {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 41eef3b5f..c81a2e48a 100644
|
||||
index f511e8d76..74b7f070c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
|
||||
@@ -134,7 +134,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1614,6 +1640,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
@@ -152,7 +152,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1655,11 +1692,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
||||
}
|
||||
}
|
||||
@@ -167,7 +167,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1674,6 +1714,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
}
|
||||
@@ -178,7 +178,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
@@ -1719,6 +1763,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -203,7 +203,7 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
return true;
|
||||
@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
@@ -1824,7 +1886,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
@@ -219,10 +219,10 @@ index 41eef3b5f..c81a2e48a 100644
|
||||
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
||||
index 41ff89c4d..2931c15ca 100644
|
||||
index 611341deb..c3f8ca914 100644
|
||||
--- a/ggml/src/ggml-cuda/common.cuh
|
||||
+++ b/ggml/src/ggml-cuda/common.cuh
|
||||
@@ -35,6 +35,41 @@
|
||||
@@ -37,6 +37,41 @@
|
||||
#include "vendors/cuda.h"
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -264,7 +264,7 @@ index 41ff89c4d..2931c15ca 100644
|
||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
|
||||
@@ -856,6 +891,9 @@ struct ggml_cuda_pool {
|
||||
@@ -891,6 +926,9 @@ struct ggml_cuda_pool {
|
||||
|
||||
virtual void * alloc(size_t size, size_t * actual_size) = 0;
|
||||
virtual void free(void * ptr, size_t size) = 0;
|
||||
@@ -274,46 +274,48 @@ index 41ff89c4d..2931c15ca 100644
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
@@ -992,11 +1030,11 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
|
||||
// pool
|
||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
|
||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
|
||||
|
||||
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
|
||||
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);
|
||||
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
|
||||
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no, bool alloc);
|
||||
|
||||
ggml_cuda_pool & pool(int device) {
|
||||
if (pools[device] == nullptr) {
|
||||
- pools[device] = new_pool_for_device(device);
|
||||
+ pools[device] = new_pool_for_device(device, true);
|
||||
if (pools[device][curr_stream_no] == nullptr) {
|
||||
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
|
||||
}
|
||||
return *pools[device];
|
||||
return *pools[device][curr_stream_no];
|
||||
}
|
||||
@@ -1004,4 +1042,20 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
|
||||
ggml_cuda_pool & pool() {
|
||||
return pool(device);
|
||||
}
|
||||
+
|
||||
+ void pool_set_alloc(bool alloc) {
|
||||
+ GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
|
||||
+ GGML_ASSERT(pools[device][curr_stream_no] == nullptr || pools[device][curr_stream_no]->alloc_memory() == alloc);
|
||||
+
|
||||
+ if (pools[device] == nullptr) {
|
||||
+ pools[device] = new_pool_for_device(device, alloc);
|
||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ size_t pool_get_alloc_size() {
|
||||
+ if (pools[device] == nullptr) {
|
||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ return pools[device]->alloc_size();
|
||||
+ return pools[device][curr_stream_no]->alloc_size();
|
||||
+ }
|
||||
};
|
||||
|
||||
struct ggml_cuda_mm_fusion_args_host {
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 02d413467..f79e5d65c 100644
|
||||
index 78fb2d8b3..fe0da71ca 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -359,6 +359,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
|
||||
@@ -322,7 +324,7 @@ index 02d413467..f79e5d65c 100644
|
||||
// buffer pool for cuda (legacy)
|
||||
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
static const int MAX_BUFFERS = 256;
|
||||
@@ -371,9 +373,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -373,9 +375,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
|
||||
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
|
||||
size_t pool_size = 0;
|
||||
@@ -337,7 +339,7 @@ index 02d413467..f79e5d65c 100644
|
||||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
@@ -381,7 +386,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -383,7 +388,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
@@ -348,7 +350,7 @@ index 02d413467..f79e5d65c 100644
|
||||
pool_size -= b.size;
|
||||
}
|
||||
}
|
||||
@@ -429,8 +436,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -431,8 +438,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
void * ptr;
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
@@ -366,7 +368,7 @@ index 02d413467..f79e5d65c 100644
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -450,10 +464,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -452,10 +466,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||
@@ -389,7 +391,7 @@ index 02d413467..f79e5d65c 100644
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
@@ -465,18 +489,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -467,18 +491,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
CUdeviceptr pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
@@ -417,7 +419,7 @@ index 02d413467..f79e5d65c 100644
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
|
||||
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
|
||||
@@ -503,35 +533,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -505,35 +535,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
||||
|
||||
@@ -493,7 +495,7 @@ index 02d413467..f79e5d65c 100644
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
@@ -564,16 +608,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -566,17 +610,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
||||
}
|
||||
@@ -505,11 +507,14 @@ index 02d413467..f79e5d65c 100644
|
||||
+ size_t alloc_size() override {
|
||||
+ return pool_size + last_alloc;
|
||||
+ }
|
||||
+
|
||||
};
|
||||
#endif // defined(GGML_USE_VMM)
|
||||
|
||||
-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device,
|
||||
- [[maybe_unused]] int stream_no) {
|
||||
+ [[maybe_unused]] int stream_no,
|
||||
+ bool alloc) {
|
||||
#if defined(GGML_USE_VMM)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
@@ -521,7 +526,7 @@ index 02d413467..f79e5d65c 100644
|
||||
}
|
||||
|
||||
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
|
||||
@@ -757,11 +809,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
@@ -760,11 +814,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
@@ -543,7 +548,7 @@ index 02d413467..f79e5d65c 100644
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
@@ -785,6 +846,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
@@ -788,6 +851,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL,
|
||||
@@ -551,7 +556,7 @@ index 02d413467..f79e5d65c 100644
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
@@ -2986,6 +3048,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
@@ -3258,6 +3322,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
@@ -559,7 +564,7 @@ index 02d413467..f79e5d65c 100644
|
||||
// flag used to determine whether it is an integrated_gpu
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
|
||||
@@ -3001,6 +3064,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3347,6 +3412,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -567,11 +572,10 @@ index 02d413467..f79e5d65c 100644
|
||||
+ if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
|
||||
@@ -3140,6 +3208,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
@@ -3691,6 +3760,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
@@ -579,7 +583,7 @@ index 02d413467..f79e5d65c 100644
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
@@ -3215,6 +3284,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -645,16 +649,16 @@ index 02d413467..f79e5d65c 100644
|
||||
+
|
||||
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
+ ctx->pools[ctx->device] = NULL;
|
||||
+ ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
@@ -3255,6 +3389,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
@@ -4035,6 +4170,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .graph_optimize = */ NULL,
|
||||
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
|
||||
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
|
||||
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
|
||||
+ /* .reset = */ ggml_backend_cuda_reset,
|
||||
|
||||
@@ -8,12 +8,12 @@ Subject: [PATCH] decode: disable output_all
|
||||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index bd348bcad..8b4a89d38 100644
|
||||
index e04f0fc4f..1359c614b 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
@@ -999,8 +999,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
|
||||
- // when computing embeddings, all tokens are output
|
||||
- const bool output_all = cparams.embeddings;
|
||||
|
||||
@@ -43,7 +43,7 @@ index 7bdf9d81f..21b35ac5c 100644
|
||||
|
||||
struct ggml_backend_device {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index c81a2e48a..9b0a9b91f 100644
|
||||
index 74b7f070c..8d2cc167f 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
|
||||
@@ -62,10 +62,10 @@ index c81a2e48a..9b0a9b91f 100644
|
||||
GGML_ASSERT(device);
|
||||
return device->iface.get_buffer_type(device);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index f79e5d65c..c9333689f 100644
|
||||
index fe0da71ca..0787e443c 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
|
||||
@@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
|
||||
return id;
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ index f79e5d65c..c9333689f 100644
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
cudaError_t err;
|
||||
@@ -3499,7 +3504,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4380,7 +4385,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
@@ -89,7 +89,7 @@ index f79e5d65c..c9333689f 100644
|
||||
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
@@ -3936,6 +3944,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
||||
@@ -4835,6 +4843,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||
}
|
||||
|
||||
@@ -101,7 +101,7 @@ index f79e5d65c..c9333689f 100644
|
||||
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
||||
@@ -3952,6 +3965,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
@@ -4851,6 +4864,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
||||
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
||||
@@ -110,7 +110,7 @@ index f79e5d65c..c9333689f 100644
|
||||
|
||||
// backend reg
|
||||
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
|
||||
index 890c10364..1f06be80e 100644
|
||||
index b7d6edf7f..b987d7aeb 100644
|
||||
--- a/ggml/src/ggml-cuda/vendors/hip.h
|
||||
+++ b/ggml/src/ggml-cuda/vendors/hip.h
|
||||
@@ -45,6 +45,7 @@
|
||||
|
||||
@@ -20,10 +20,10 @@ fix vulkan PCI ID and ID handling
|
||||
ggml/src/ggml-cuda/vendors/hip.h | 3 +
|
||||
ggml/src/ggml-impl.h | 8 +
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 209 +++++++++--
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 169 ++++++++-
|
||||
ggml/src/mem_hip.cpp | 529 +++++++++++++++++++++++++++
|
||||
ggml/src/mem_nvml.cpp | 209 +++++++++++
|
||||
9 files changed, 1003 insertions(+), 30 deletions(-)
|
||||
9 files changed, 976 insertions(+), 17 deletions(-)
|
||||
create mode 100644 ggml/src/mem_hip.cpp
|
||||
create mode 100644 ggml/src/mem_nvml.cpp
|
||||
|
||||
@@ -45,7 +45,7 @@ index 69223c488..6510e0cba 100644
|
||||
|
||||
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index f9a6587f1..03f359ae9 100644
|
||||
index 6d493a4ff..ac8f38464 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -209,6 +209,8 @@ add_library(ggml-base
|
||||
@@ -56,12 +56,12 @@ index f9a6587f1..03f359ae9 100644
|
||||
+ mem_nvml.cpp
|
||||
gguf.cpp)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index c9333689f..f1a20e7fe 100644
|
||||
index 0787e443c..736d47c07 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
@@ -78,7 +78,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
#if defined(GGML_USE_VMM)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -316,6 +326,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
@@ -90,7 +90,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
@@ -3468,6 +3483,11 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
std::string id;
|
||||
@@ -102,7 +102,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3488,6 +3508,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -129,9 +129,9 @@ index c9333689f..f1a20e7fe 100644
|
||||
+ }
|
||||
+#endif
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
@@ -3496,6 +3538,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
|
||||
@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
@@ -139,7 +139,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
@@ -3509,6 +3552,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
||||
props->memory_total = props->memory_free = 0;
|
||||
|
||||
@@ -159,7 +159,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
bool events = false;
|
||||
@@ -4075,6 +4131,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
@@ -167,7 +167,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
@@ -4090,6 +4147,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
|
||||
@@ -183,7 +183,7 @@ index c9333689f..f1a20e7fe 100644
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ &reg,
|
||||
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
|
||||
index 1f06be80e..2f9ef2dc0 100644
|
||||
index b987d7aeb..5ad5623ae 100644
|
||||
--- a/ggml/src/ggml-cuda/vendors/hip.h
|
||||
+++ b/ggml/src/ggml-cuda/vendors/hip.h
|
||||
@@ -5,6 +5,8 @@
|
||||
@@ -204,7 +204,7 @@ index 1f06be80e..2f9ef2dc0 100644
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
|
||||
index e9201cdc6..44ae76d66 100644
|
||||
index fe57d4c58..1c07e767a 100644
|
||||
--- a/ggml/src/ggml-impl.h
|
||||
+++ b/ggml/src/ggml-impl.h
|
||||
@@ -677,6 +677,14 @@ static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
|
||||
@@ -223,10 +223,10 @@ index e9201cdc6..44ae76d66 100644
|
||||
}
|
||||
#endif
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index 05ff6a5a6..032dee76d 100644
|
||||
index ba95b4acc..f6f8f7a10 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -537,6 +537,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
@@ -546,6 +546,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
@@ -234,7 +234,7 @@ index 05ff6a5a6..032dee76d 100644
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
@@ -545,6 +546,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
|
||||
@@ -554,6 +555,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
|
||||
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -243,18 +243,18 @@ index 05ff6a5a6..032dee76d 100644
|
||||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 3a6bbe564..ca02ea079 100644
|
||||
index a36c6560c..a234eda2e 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -229,6 +229,7 @@ class vk_memory_logger;
|
||||
#endif
|
||||
@@ -236,6 +236,7 @@ class vk_memory_logger;
|
||||
class vk_perf_logger;
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
|
||||
+static std::string ggml_vk_get_device_id(int device);
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -11813,6 +11814,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
@@ -12353,6 +12354,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
snprintf(description, description_size, "%s", props.deviceName.data());
|
||||
}
|
||||
|
||||
@@ -284,7 +284,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
@@ -12761,31 +12785,102 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
@@ -13614,15 +13638,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
@@ -312,24 +312,23 @@ index 3a6bbe564..ca02ea079 100644
|
||||
+ int driver_major;
|
||||
+ int driver_minor;
|
||||
+};
|
||||
+
|
||||
|
||||
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
|
||||
+
|
||||
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
|
||||
|
||||
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
|
||||
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
|
||||
- bool membudget_supported = vk_instance.device_supports_membudget[device];
|
||||
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
|
||||
vk::PhysicalDeviceMemoryProperties2 memprops = {};
|
||||
- const bool membudget_supported = vk_instance.device_supports_membudget[device];
|
||||
+ const bool membudget_supported = vk_instance.device_supports_membudget[ctx->device];
|
||||
const bool is_integrated_gpu = vkdev.getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
+
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
+ vkdev.getProperties2(&props2);
|
||||
|
||||
- if (membudget_supported) {
|
||||
- memprops.pNext = &budgetprops;
|
||||
+ if (!ctx->is_integrated_gpu)
|
||||
+
|
||||
+ if (!is_integrated_gpu)
|
||||
+ {
|
||||
+ // Use vendor specific management libraries for best VRAM reporting if available
|
||||
+ switch (props2.properties.vendorID) {
|
||||
@@ -356,55 +355,13 @@ index 3a6bbe564..ca02ea079 100644
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
- vkdev.getMemoryProperties2(&memprops);
|
||||
+ }
|
||||
+ // else fallback to memory budget if supported
|
||||
+
|
||||
|
||||
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
|
||||
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
|
||||
+ *total = 0;
|
||||
+ *free = 0;
|
||||
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
|
||||
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
|
||||
+ memprops2.pNext = &mem_budget_props;
|
||||
+ vkdev.getMemoryProperties2(&memprops2);
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ // Include shared memory on iGPUs
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ }
|
||||
+ }
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ }
|
||||
+ }
|
||||
+ if (*total > 0 && *free > 0) {
|
||||
+ return;
|
||||
+ } else if (*total > 0) {
|
||||
+ *free = *total;
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
+ // else just report the physical memory
|
||||
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
-
|
||||
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
|
||||
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
|
||||
- } else {
|
||||
- *free = heap.size;
|
||||
- }
|
||||
+ *free = heap.size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -12818,8 +12913,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
@@ -13674,8 +13755,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -419,7 +376,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
@@ -12836,19 +12936,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
@@ -13692,19 +13778,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
@@ -453,7 +410,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -12860,9 +12965,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
@@ -13716,9 +13807,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
@@ -469,7 +426,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -12886,8 +12996,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13742,8 +13838,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
@@ -480,7 +437,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -12895,6 +13006,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13751,6 +13848,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
@@ -494,7 +451,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -13365,6 +13483,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14319,6 +14423,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
@@ -503,7 +460,7 @@ index 3a6bbe564..ca02ea079 100644
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -13373,12 +13493,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14327,12 +14433,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
||||
@@ -6,108 +6,101 @@ Subject: [PATCH] interleave multi rope
|
||||
since ollama doesn't use mrope for anything else, change it to mean the
|
||||
interleaved version used for qwen3vl
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 7 ++-----
|
||||
ggml/src/ggml-cuda/rope.cu | 12 +++---------
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 10 +++-------
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++---------
|
||||
4 files changed, 11 insertions(+), 30 deletions(-)
|
||||
ggml/src/ggml-cpu/ops.cpp | 8 ++++----
|
||||
ggml/src/ggml-cuda/rope.cu | 8 ++++----
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 8 ++++----
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl | 8 ++++----
|
||||
4 files changed, 16 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 902fdad69..70955347d 100644
|
||||
index 40666bab6..3155cb4bb 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init(
|
||||
}
|
||||
@@ -5599,14 +5599,14 @@ static void ggml_mrope_cache_init(
|
||||
|
||||
float theta = theta_t;
|
||||
- if (sector >= sections[0] && sector < sec_w) {
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
|
||||
theta = theta_h;
|
||||
}
|
||||
- else if (sector >= sec_w && sector < sec_w + sections[2]) {
|
||||
+ else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
|
||||
theta = theta_w;
|
||||
}
|
||||
- else if (sector >= sec_w + sections[2]) {
|
||||
- theta = theta_e;
|
||||
- }
|
||||
|
||||
rope_yarn(
|
||||
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
||||
if (is_imrope) { // qwen3vl apply interleaved mrope
|
||||
- if (sector % 3 == 1 && sector < 3 * sections[1]) {
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
|
||||
theta = theta_h;
|
||||
- } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
|
||||
+ } else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
|
||||
theta = theta_w;
|
||||
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
|
||||
theta = theta_t;
|
||||
- } else {
|
||||
- theta = theta_e;
|
||||
+ // } else {
|
||||
+ // theta = theta_e;
|
||||
}
|
||||
} else {
|
||||
if (sector >= sections[0] && sector < sec_w) {
|
||||
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
|
||||
index d058504cd..287fe9d2c 100644
|
||||
index 88ed79111..71ca60214 100644
|
||||
--- a/ggml/src/ggml-cuda/rope.cu
|
||||
+++ b/ggml/src/ggml-cuda/rope.cu
|
||||
@@ -151,19 +151,13 @@ static __global__ void rope_multi(
|
||||
const int sec_w = sections.v[1] + sections.v[0];
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
|
||||
- float theta_base = 0.0;
|
||||
- if (sector < sections.v[0]) {
|
||||
- theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
- }
|
||||
- else if (sector >= sections.v[0] && sector < sec_w) {
|
||||
+ float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) {
|
||||
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
- else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
|
||||
+ else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) {
|
||||
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
- else if (sector >= sec_w + sections.v[2]) {
|
||||
- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
|
||||
- }
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
@@ -200,14 +200,14 @@ static __global__ void rope_multi(
|
||||
|
||||
float theta_base = 0.0;
|
||||
if (is_imrope) {
|
||||
- if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { // h
|
||||
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
|
||||
- } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
|
||||
+ } else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { // w
|
||||
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
|
||||
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
|
||||
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
- } else {
|
||||
- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
|
||||
+ // } else {
|
||||
+ // theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
} else {
|
||||
if (sector < sections.v[0]) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 50b8071de..65a3183c8 100644
|
||||
index aed013a9d..a489de435 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -3888,15 +3888,11 @@ kernel void kernel_rope_multi(
|
||||
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
|
||||
const int sector = ic % sect_dims;
|
||||
@@ -4009,14 +4009,14 @@ kernel void kernel_rope_multi(
|
||||
|
||||
- float theta_base;
|
||||
- if (sector < args.sect_0) {
|
||||
- theta_base = (float) pos[i2];
|
||||
- } else if (sector < sec_w01) {
|
||||
+ float theta_base = (float) pos[i2];
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) {
|
||||
theta_base = (float) pos[i2 + args.ne02];
|
||||
- } else if (sector < sec_w012) {
|
||||
+ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 2];
|
||||
- } else {
|
||||
- theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
}
|
||||
// end of mrope
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
|
||||
index 111286b49..633dc20ff 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
|
||||
@@ -31,19 +31,13 @@ void main() {
|
||||
const int sec_w = p.sections[1] + p.sections[0];
|
||||
const uint sector = (i0 / 2) % sect_dims;
|
||||
|
||||
- float theta_base = 0.0;
|
||||
- if (sector < p.sections[0]) {
|
||||
- theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
|
||||
- }
|
||||
- else if (sector >= p.sections[0] && sector < sec_w) {
|
||||
+ float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
|
||||
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
|
||||
}
|
||||
- else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
|
||||
+ else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
|
||||
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
|
||||
}
|
||||
- else if (sector >= sec_w + p.sections[2]) {
|
||||
- theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
|
||||
- }
|
||||
|
||||
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
|
||||
float theta_base;
|
||||
if (FC_rope_is_imrope) {
|
||||
- if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { // h
|
||||
theta_base = (float) pos[i2 + args.ne02 * 1];
|
||||
- } else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w
|
||||
+ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { // w
|
||||
theta_base = (float) pos[i2 + args.ne02 * 2];
|
||||
} else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t
|
||||
theta_base = (float) pos[i2 + args.ne02 * 0];
|
||||
- } else { // e
|
||||
- theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
+ // } else { // e
|
||||
+ // theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
}
|
||||
} else {
|
||||
if (sector < args.sect_0) {
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
|
||||
index 9726b722d..1c8c69422 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
|
||||
@@ -148,14 +148,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
|
||||
|
||||
float theta_base = 0.0;
|
||||
if (p.is_imrope != 0) {
|
||||
- if (sector % 3 == 1 && sector < 3 * p.sections[1]) {
|
||||
+ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
|
||||
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
|
||||
- } else if (sector % 3 == 2 && sector < 3 * p.sections[2]) {
|
||||
+ } else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
|
||||
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
|
||||
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
|
||||
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
|
||||
- } else {
|
||||
- theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
|
||||
+ //} else {
|
||||
+ // theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
|
||||
}
|
||||
} else {
|
||||
if (sector < p.sections[0]) {
|
||||
|
||||
@@ -6,13 +6,13 @@ Subject: [PATCH] Add memory detection using DXGI + PDH
|
||||
---
|
||||
ggml/src/CMakeLists.txt | 1 +
|
||||
ggml/src/ggml-impl.h | 3 +
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 29 ++-
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 26 ++-
|
||||
ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++++++++++++++++
|
||||
4 files changed, 327 insertions(+), 3 deletions(-)
|
||||
4 files changed, 325 insertions(+), 2 deletions(-)
|
||||
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 03f359ae9..4b3e5efb5 100644
|
||||
index ac8f38464..faa1beed2 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -211,6 +211,7 @@ add_library(ggml-base
|
||||
@@ -22,9 +22,9 @@ index 03f359ae9..4b3e5efb5 100644
|
||||
+ mem_dxgi_pdh.cpp
|
||||
gguf.cpp)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
|
||||
index 44ae76d66..639d551a2 100644
|
||||
index 1c07e767a..0da3e065b 100644
|
||||
--- a/ggml/src/ggml-impl.h
|
||||
+++ b/ggml/src/ggml-impl.h
|
||||
@@ -684,6 +684,9 @@ GGML_API void ggml_nvml_release();
|
||||
@@ -38,10 +38,10 @@ index 44ae76d66..639d551a2 100644
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index ca02ea079..c12b069e5 100644
|
||||
index a234eda2e..c98f98c73 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16"
|
||||
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
|
||||
#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000)
|
||||
@@ -49,7 +49,7 @@ index ca02ea079..c12b069e5 100644
|
||||
|
||||
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
|
||||
VkStructureType sType;
|
||||
@@ -12802,6 +12803,7 @@ struct ggml_backend_vk_device_context {
|
||||
@@ -13655,6 +13656,7 @@ struct ggml_backend_vk_device_context {
|
||||
std::string pci_id;
|
||||
std::string id;
|
||||
std::string uuid;
|
||||
@@ -57,8 +57,8 @@ index ca02ea079..c12b069e5 100644
|
||||
int major;
|
||||
int minor;
|
||||
int driver_major;
|
||||
@@ -12817,8 +12819,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
@@ -13673,6 +13675,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vkdev.getProperties2(&props2);
|
||||
+ GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str());
|
||||
@@ -76,22 +76,17 @@ index ca02ea079..c12b069e5 100644
|
||||
+ ggml_dxgi_pdh_release();
|
||||
+ }
|
||||
|
||||
- if (!ctx->is_integrated_gpu)
|
||||
+ if (!ctx->is_integrated_gpu)
|
||||
if (!is_integrated_gpu)
|
||||
{
|
||||
// Use vendor specific management libraries for best VRAM reporting if available
|
||||
switch (props2.properties.vendorID) {
|
||||
@@ -12846,8 +12862,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
break;
|
||||
}
|
||||
@@ -13704,7 +13720,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
}
|
||||
- // else fallback to memory budget if supported
|
||||
// else fallback to memory budget if supported
|
||||
|
||||
+ // else fallback to memory budget if supported
|
||||
*total = 0;
|
||||
*free = 0;
|
||||
vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
|
||||
@@ -13500,7 +13516,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
-
|
||||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
}
|
||||
@@ -14440,7 +14455,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
@@ -99,7 +94,7 @@ index ca02ea079..c12b069e5 100644
|
||||
// Gather additional information about the device
|
||||
int dev_idx = vk_instance.device_indices[i];
|
||||
vk::PhysicalDeviceProperties props1;
|
||||
@@ -13523,6 +13538,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14463,6 +14477,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
}
|
||||
}
|
||||
ctx->uuid = oss.str();
|
||||
|
||||
@@ -10,10 +10,10 @@ fallback to cpu
|
||||
1 file changed, 3 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index f1a20e7fe..1a71e07c9 100644
|
||||
index 736d47c07..7350f6758 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
@@ -4564,6 +4564,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jeff Bolz <jbolz@nvidia.com>
|
||||
Date: Wed, 29 Oct 2025 03:53:04 -0500
|
||||
Subject: [PATCH] vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy
|
||||
(#16793)
|
||||
|
||||
This lets the copy to the destination device use the host-visible
|
||||
vidmem optimization.
|
||||
---
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +----
|
||||
1 file changed, 1 insertion(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index c12b069e5..76c78c2ea 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
|
||||
// Copy device to device
|
||||
ggml_vk_ensure_sync_staging_buffer(src->device, size);
|
||||
- ggml_vk_ensure_sync_staging_buffer(dst->device, size);
|
||||
|
||||
// Copy to src staging buffer
|
||||
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
|
||||
- // memcpy to dst staging buffer
|
||||
- memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
|
||||
// Copy to dst buffer
|
||||
- ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
|
||||
+ ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ Subject: [PATCH] win: exit instead of abort
|
||||
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 9be35c1be..923c33d05 100644
|
||||
index b99345a2e..1c9e0bc05 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
25
llama/patches/0031-fix-bakllava-regression.patch
Normal file
25
llama/patches/0031-fix-bakllava-regression.patch
Normal file
@@ -0,0 +1,25 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Hiltgen <daniel@ollama.com>
|
||||
Date: Tue, 11 Nov 2025 11:39:43 -0800
|
||||
Subject: [PATCH] fix bakllava regression
|
||||
|
||||
Revert to prior logic of assuming an empty projector type is mlp
|
||||
---
|
||||
tools/mtmd/clip.cpp | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index f4c4d2c48..3334ff25b 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -2648,6 +2648,10 @@ struct clip_model_loader {
|
||||
if (proj_type.empty()) {
|
||||
if (modality == CLIP_MODALITY_VISION) {
|
||||
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
|
||||
+ if (proj_type.empty()) {
|
||||
+ // Assume MLP if no projector type listed
|
||||
+ proj_type = "mlp";
|
||||
+ }
|
||||
} else if (modality == CLIP_MODALITY_AUDIO) {
|
||||
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
|
||||
} else {
|
||||
@@ -1,657 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jeff Bolz <jbolz@nvidia.com>
|
||||
Date: Wed, 29 Oct 2025 08:44:29 -0500
|
||||
Subject: [PATCH] vulkan: Update topk_moe fusion to handle gpt's late softmax
|
||||
(#16656)
|
||||
|
||||
* vulkan: Update topk_moe fusion to handle gpt's late softmax
|
||||
|
||||
Based on #16649.
|
||||
|
||||
* Add ggml_check_edges
|
||||
|
||||
* Add sync logging to show fusion effects
|
||||
|
||||
* handle clamp added in #16655
|
||||
|
||||
* Update ggml/src/ggml-impl.h
|
||||
|
||||
Co-authored-by: Diego Devesa <slarengh@gmail.com>
|
||||
---
|
||||
ggml/src/ggml-impl.h | 16 +
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 304 +++++++++++-------
|
||||
.../ggml-vulkan/vulkan-shaders/topk_moe.comp | 90 ++++--
|
||||
3 files changed, 272 insertions(+), 138 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
|
||||
index 639d551a2..e5c446d1d 100644
|
||||
--- a/ggml/src/ggml-impl.h
|
||||
+++ b/ggml/src/ggml-impl.h
|
||||
@@ -693,6 +693,7 @@ GGML_API void ggml_dxgi_pdh_release();
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
+#include <array>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
@@ -708,6 +709,21 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
|
||||
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
|
||||
}
|
||||
|
||||
+// Return true if the edges in the graph match expectations.
|
||||
+inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
|
||||
+ int start_idx,
|
||||
+ std::initializer_list<std::array<int, 3>> edges) {
|
||||
+ for (const auto & edge : edges) {
|
||||
+ int dst_node = edge[0];
|
||||
+ int src_idx = edge[1];
|
||||
+ int src_node = edge[2];
|
||||
+ if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
// expose GGUF internals for test code
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 7669ed206..63a762ec2 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;
|
||||
static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
|
||||
static constexpr uint32_t num_topk_moe_pipelines = 10;
|
||||
|
||||
-static constexpr std::array topk_moe_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
- GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
- GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
|
||||
-static constexpr std::array topk_moe { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
- GGML_OP_VIEW, GGML_OP_GET_ROWS };
|
||||
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
+ GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
+ GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
|
||||
+ GGML_OP_RESHAPE };
|
||||
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
+ GGML_OP_VIEW, GGML_OP_GET_ROWS };
|
||||
+static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW,
|
||||
+ GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
+ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
|
||||
+
|
||||
+//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
|
||||
+//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
+//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
+//node #981 ( VIEW): ffn_moe_topk-15 ( 0K) [Vulka ] use=4: ffn_moe_argsort-15 ( 0K) [Vulka ]
|
||||
+//node #982 ( GET_ROWS): ffn_moe_weights-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 (re ( 0K) [Vulka ] ffn_moe_topk-15 ( 0K) [Vulka ]
|
||||
+//node #983 ( RESHAPE): ffn_moe_weights-15 ( ( 0K) [Vulka ] use=2: ffn_moe_weights-15 ( 0K) [Vulka ]
|
||||
+//node #984 ( SUM_ROWS): ffn_moe_weights_sum- ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ]
|
||||
+//node #985 ( CLAMP): ffn_moe_weights_sum_ ( 0K) [Vulka ] use=1: ffn_moe_weights_sum- ( 0K) [Vulka ]
|
||||
+//node #986 ( DIV): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] ffn_moe_weights_sum_ ( 0K) [Vulka ]
|
||||
+//node #987 ( RESHAPE): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights_norm ( 0K) [Vulka ]
|
||||
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_norm_edges {
|
||||
+ { 1, 0, 0 }, // reshape->src[0] == softmax
|
||||
+ { 2, 0, 0 }, // argsort->src[0] == softmax
|
||||
+ { 3, 0, 2 }, // view->src[0] == argsort
|
||||
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
|
||||
+ { 4, 1, 3 }, // get_rows->src[1] == view
|
||||
+ { 5, 0, 4 }, // reshape->src[0] == get_rows
|
||||
+ { 6, 0, 5 }, // sum_rows->src[0] == reshape
|
||||
+ { 7, 0, 6 }, // clamp->src[0] == sum_rows
|
||||
+ { 8, 0, 5 }, // div->src[0] == reshape
|
||||
+ { 8, 1, 7 }, // div->src[1] == clamp
|
||||
+ { 9, 0, 8 }, // reshape->src[0] == div
|
||||
+};
|
||||
+
|
||||
+// same as early_softmax_norm but ending after the get_rows
|
||||
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_edges {
|
||||
+ { 1, 0, 0 }, // reshape->src[0] == softmax
|
||||
+ { 2, 0, 0 }, // argsort->src[0] == softmax
|
||||
+ { 3, 0, 2 }, // view->src[0] == argsort
|
||||
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
|
||||
+ { 4, 1, 3 }, // get_rows->src[1] == view
|
||||
+};
|
||||
|
||||
+//node #652 ( ARGSORT): ffn_moe_argsort-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 ( 0K) [Vulka ]
|
||||
+//node #653 ( VIEW): ffn_moe_topk-11 ( 0K) [Vulka ] use=7: ffn_moe_argsort-11 ( 0K) [Vulka ]
|
||||
+//node #654 ( GET_ROWS): ffn_moe_weights-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 (re ( 0K) [Vulka ] ffn_moe_topk-11 ( 0K) [Vulka ]
|
||||
+//node #655 ( RESHAPE): ffn_moe_weights-11 ( ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( 0K) [Vulka ]
|
||||
+//node #656 ( SOFT_MAX): node_656 ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( ( 0K) [Vulka ]
|
||||
+//node #657 ( RESHAPE): ffn_moe_weights_soft ( 0K) [Vulka ] use=1: node_656 ( 0K) [Vulka ]
|
||||
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_late_softmax_edges {
|
||||
+ { 1, 0, 0 }, // view->src[0] == argsort
|
||||
+ { 2, 1, 1 }, // get_rows->src[1] == view
|
||||
+ { 3, 0, 2 }, // reshape->src[0] == get_rows
|
||||
+ { 4, 0, 3 }, // soft_max->src[0] == reshape
|
||||
+ { 5, 0, 4 }, // reshape->src[0] == soft_max
|
||||
+};
|
||||
+
|
||||
+enum topk_moe_mode {
|
||||
+ TOPK_MOE_EARLY_SOFTMAX,
|
||||
+ TOPK_MOE_EARLY_SOFTMAX_NORM,
|
||||
+ TOPK_MOE_LATE_SOFTMAX,
|
||||
+ TOPK_MOE_COUNT,
|
||||
+};
|
||||
+
|
||||
+static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
|
||||
+ topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
|
||||
+ num == topk_moe_early_softmax.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX :
|
||||
+ TOPK_MOE_LATE_SOFTMAX;
|
||||
+ return mode;
|
||||
+}
|
||||
|
||||
struct vk_device_struct {
|
||||
std::recursive_mutex mutex;
|
||||
@@ -607,8 +671,7 @@ struct vk_device_struct {
|
||||
|
||||
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
||||
|
||||
- // [2] is {!norm, norm}
|
||||
- vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
|
||||
+ vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
|
||||
|
||||
std::vector<vk_pipeline_ref> all_pipelines;
|
||||
|
||||
@@ -956,6 +1019,8 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
|
||||
struct vk_op_topk_moe_push_constants {
|
||||
uint32_t n_rows;
|
||||
uint32_t n_expert_used;
|
||||
+ float clamp_min;
|
||||
+ float clamp_max;
|
||||
};
|
||||
|
||||
struct vk_op_add_id_push_constants {
|
||||
@@ -3806,8 +3871,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
|
||||
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][0], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0}, 1, true, true);
|
||||
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][1], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1}, 1, true, true);
|
||||
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true);
|
||||
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true);
|
||||
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true);
|
||||
}
|
||||
|
||||
for (auto &c : compiles) {
|
||||
@@ -8085,8 +8151,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
if (ctx->num_additional_fused_ops) {
|
||||
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
||||
GGML_ASSERT(idx < num_topk_moe_pipelines);
|
||||
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
|
||||
- return ctx->device->pipeline_topk_moe[idx][with_norm];
|
||||
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
|
||||
+ return ctx->device->pipeline_topk_moe[idx][mode];
|
||||
}
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
||||
@@ -8141,6 +8207,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return nullptr;
|
||||
}
|
||||
case GGML_OP_ARGSORT:
|
||||
+ if (ctx->num_additional_fused_ops) {
|
||||
+ uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
||||
+ GGML_ASSERT(idx < num_topk_moe_pipelines);
|
||||
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
|
||||
+ return ctx->device->pipeline_topk_moe[idx][mode];
|
||||
+ }
|
||||
+
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
|
||||
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
||||
return ctx->device->pipeline_argsort_f32[idx];
|
||||
@@ -9676,10 +9749,12 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
|
||||
static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
|
||||
|
||||
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
|
||||
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
|
||||
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
|
||||
- ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
|
||||
- ggml_tensor * ids = cgraph->nodes[node_idx + 3];
|
||||
+ ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
|
||||
+ (mode == TOPK_MOE_EARLY_SOFTMAX) ? cgraph->nodes[node_idx + 4] :
|
||||
+ cgraph->nodes[node_idx + 5];
|
||||
+ ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];
|
||||
|
||||
GGML_ASSERT(logits->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(weights->type == GGML_TYPE_F32);
|
||||
@@ -9738,9 +9813,14 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
GGML_ASSERT(d_ids != nullptr);
|
||||
}
|
||||
|
||||
- vk_op_topk_moe_push_constants pc;
|
||||
+ vk_op_topk_moe_push_constants pc {};
|
||||
pc.n_rows = n_rows;
|
||||
pc.n_expert_used = n_expert_used;
|
||||
+ if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
|
||||
+ ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
|
||||
+ pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
|
||||
+ pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
|
||||
+ }
|
||||
|
||||
GGML_ASSERT(n_expert_used <= n_experts);
|
||||
|
||||
@@ -11335,7 +11415,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
}
|
||||
}
|
||||
}
|
||||
+
|
||||
+#define ENABLE_SYNC_LOGGING 0
|
||||
+
|
||||
if (need_sync) {
|
||||
+#if ENABLE_SYNC_LOGGING
|
||||
+ std::cerr << "sync" << std::endl;
|
||||
+#endif
|
||||
ctx->unsynced_nodes_written.clear();
|
||||
ctx->unsynced_nodes_read.clear();
|
||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||
@@ -11353,6 +11439,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
}
|
||||
}
|
||||
}
|
||||
+#if ENABLE_SYNC_LOGGING
|
||||
+ if (!dryrun) {
|
||||
+ for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
|
||||
+ auto *n = cgraph->nodes[node_idx + i];
|
||||
+ std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
|
||||
+ if (n->op == GGML_OP_GLU) {
|
||||
+ std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
|
||||
+ }
|
||||
+ std::cerr << std::endl;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -11531,7 +11629,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
- ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
|
||||
+ if (ctx->num_additional_fused_ops) {
|
||||
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
|
||||
+ } else {
|
||||
+ ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
|
||||
+ }
|
||||
|
||||
break;
|
||||
case GGML_OP_SUM:
|
||||
@@ -12329,30 +12431,27 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st
|
||||
}
|
||||
|
||||
static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
|
||||
- int node_idx, bool with_norm) {
|
||||
+ int node_idx, topk_moe_mode mode) {
|
||||
|
||||
- if (with_norm) {
|
||||
- if (node_idx + (int)topk_moe_norm.size() > cgraph->n_nodes) {
|
||||
- return false;
|
||||
- }
|
||||
- for (size_t i = 0; i < topk_moe_norm.size(); ++i) {
|
||||
- if (cgraph->nodes[node_idx + i]->op != topk_moe_norm[i]) {
|
||||
- return false;
|
||||
- }
|
||||
- }
|
||||
- } else {
|
||||
- if (node_idx + (int)topk_moe.size() > cgraph->n_nodes) {
|
||||
- return false;
|
||||
- }
|
||||
- for (size_t i = 0; i < topk_moe.size(); ++i) {
|
||||
- if (cgraph->nodes[node_idx + i]->op != topk_moe[i]) {
|
||||
- return false;
|
||||
- }
|
||||
- }
|
||||
- }
|
||||
+ const ggml_tensor * softmax;
|
||||
+ const ggml_tensor * weights;
|
||||
|
||||
- const ggml_tensor * softmax = cgraph->nodes[node_idx + 0];
|
||||
- const ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
|
||||
+ switch (mode) {
|
||||
+ case TOPK_MOE_EARLY_SOFTMAX_NORM:
|
||||
+ softmax = cgraph->nodes[node_idx + 0];
|
||||
+ weights = cgraph->nodes[node_idx + 9];
|
||||
+ break;
|
||||
+ case TOPK_MOE_EARLY_SOFTMAX:
|
||||
+ softmax = cgraph->nodes[node_idx + 0];
|
||||
+ weights = cgraph->nodes[node_idx + 4];
|
||||
+ break;
|
||||
+ case TOPK_MOE_LATE_SOFTMAX:
|
||||
+ softmax = cgraph->nodes[node_idx + 4];
|
||||
+ weights = cgraph->nodes[node_idx + 5];
|
||||
+ break;
|
||||
+ default:
|
||||
+ return false;
|
||||
+ }
|
||||
|
||||
const float * op_params = (const float *)softmax->op_params;
|
||||
|
||||
@@ -12378,60 +12477,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
|
||||
return false;
|
||||
}
|
||||
|
||||
- // Check that the nodes don't have any unexpected uses
|
||||
- const ggml_tensor * reshape1 = cgraph->nodes[node_idx + 1];
|
||||
- const ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
|
||||
- const ggml_tensor * view = cgraph->nodes[node_idx + 3];
|
||||
- const ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
|
||||
- const ggml_tensor * reshape5 = with_norm ? cgraph->nodes[node_idx + 5] : nullptr;
|
||||
- const ggml_tensor * sum_rows = with_norm ? cgraph->nodes[node_idx + 6] : nullptr;
|
||||
- const ggml_tensor * div = with_norm ? cgraph->nodes[node_idx + 7] : nullptr;
|
||||
- const ggml_tensor * reshape8 = with_norm ? cgraph->nodes[node_idx + 8] : nullptr;
|
||||
-
|
||||
- // softmax is used by reshape and argsort
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx) != 2 ||
|
||||
- reshape1->src[0] != softmax ||
|
||||
- argsort->src[0] != softmax) {
|
||||
- return false;
|
||||
- }
|
||||
- // reshape is used by get_rows
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx + 1) != 1 ||
|
||||
- get_rows->src[0] != reshape1) {
|
||||
- return false;
|
||||
- }
|
||||
- // argsort is used by view
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx + 2) != 1 ||
|
||||
- view->src[0] != argsort) {
|
||||
- return false;
|
||||
- }
|
||||
- // view is written (via argsort), we can skip checking it
|
||||
-
|
||||
- if (with_norm) {
|
||||
- // get_rows is used by reshape
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx + 4) != 1 ||
|
||||
- reshape5->src[0] != get_rows) {
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- // reshape is used by sum_rows and div
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx + 5) != 2 ||
|
||||
- sum_rows->src[0] != reshape5 ||
|
||||
- div->src[0] != reshape5) {
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- // sum_rows is used by div
|
||||
- if (ggml_node_get_use_count(cgraph, node_idx + 6) != 1 ||
|
||||
- div->src[1] != sum_rows) {
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- // div/reshape are written
|
||||
- if (reshape8->src[0] != div) {
|
||||
- return false;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
if (!ctx->device->subgroup_arithmetic ||
|
||||
!ctx->device->subgroup_shuffle ||
|
||||
!ctx->device->subgroup_require_full_support ||
|
||||
@@ -12517,10 +12562,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->num_additional_fused_ops = num_adds - 1;
|
||||
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
|
||||
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
|
||||
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
|
||||
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
|
||||
}
|
||||
}
|
||||
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
|
||||
@@ -12618,10 +12671,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->num_additional_fused_ops = num_adds - 1;
|
||||
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
|
||||
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
|
||||
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
|
||||
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
|
||||
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
|
||||
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
|
||||
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
|
||||
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12754,25 +12815,44 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
while (first_unused < graph->n_nodes) {
|
||||
std::vector<int> current_set;
|
||||
|
||||
- // Avoid reordering topk_moe_norm
|
||||
- if (first_unused + (int)topk_moe_norm.size() <= graph->n_nodes) {
|
||||
- bool is_topk_moe_norm = true;
|
||||
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
|
||||
- if (graph->nodes[first_unused + j]->op != topk_moe_norm[j] || used[first_unused + j]) {
|
||||
- is_topk_moe_norm = false;
|
||||
+ // Check for fusion patterns and avoid reordering them
|
||||
+ auto const &match_pattern = [&](const std::initializer_list<ggml_op> &pattern, int start) -> bool {
|
||||
+ if (start + (int)pattern.size() <= graph->n_nodes) {
|
||||
+ bool is_pattern = true;
|
||||
+ for (size_t j = 0; j < pattern.size(); ++j) {
|
||||
+ if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) {
|
||||
+ is_pattern = false;
|
||||
+ }
|
||||
}
|
||||
+ return is_pattern;
|
||||
}
|
||||
- if (is_topk_moe_norm) {
|
||||
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
|
||||
+ return false;
|
||||
+ };
|
||||
+
|
||||
+ auto const &keep_pattern = [&](const std::initializer_list<ggml_op> &pattern) -> bool {
|
||||
+ if (match_pattern(pattern, first_unused)) {
|
||||
+ for (size_t j = 0; j < pattern.size(); ++j) {
|
||||
new_order.push_back(graph->nodes[first_unused + j]);
|
||||
used[first_unused + j] = true;
|
||||
}
|
||||
while (first_unused < graph->n_nodes && used[first_unused]) {
|
||||
first_unused++;
|
||||
}
|
||||
- continue;
|
||||
+ return true;
|
||||
}
|
||||
+ return false;
|
||||
+ };
|
||||
+
|
||||
+ if (keep_pattern(topk_moe_early_softmax_norm)) {
|
||||
+ continue;
|
||||
+ }
|
||||
+ if (keep_pattern(topk_moe_early_softmax)) {
|
||||
+ continue;
|
||||
}
|
||||
+ if (keep_pattern(topk_moe_late_softmax)) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
// First, grab the next unused node.
|
||||
current_set.push_back(first_unused);
|
||||
|
||||
@@ -12790,6 +12870,12 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
if (is_empty(graph->nodes[j])) {
|
||||
continue;
|
||||
}
|
||||
+ // Don't pull forward nodes from fusion patterns
|
||||
+ if (match_pattern(topk_moe_early_softmax_norm, j) ||
|
||||
+ match_pattern(topk_moe_early_softmax, j) ||
|
||||
+ match_pattern(topk_moe_late_softmax, j)) {
|
||||
+ continue;
|
||||
+ }
|
||||
bool ok = true;
|
||||
for (int c = first_unused; c < j; ++c) {
|
||||
if (!used[c] &&
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
|
||||
index 9e56d5f8a..bc1c278bf 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
|
||||
@@ -11,6 +11,8 @@ layout (push_constant) uniform parameter
|
||||
{
|
||||
uint n_rows;
|
||||
uint n_expert_used;
|
||||
+ float clamp_min;
|
||||
+ float clamp_max;
|
||||
};
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
|
||||
@@ -18,6 +20,7 @@ layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
|
||||
layout(constant_id = 0) const uint WARP_SIZE = 32;
|
||||
layout(constant_id = 1) const uint n_experts = 512;
|
||||
layout(constant_id = 2) const bool with_norm = true;
|
||||
+layout(constant_id = 3) const bool late_softmax = false;
|
||||
|
||||
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
|
||||
|
||||
@@ -25,53 +28,72 @@ layout (binding = 0, std430) readonly buffer Logits {float logits[];};
|
||||
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
|
||||
layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};
|
||||
|
||||
-void main() {
|
||||
- const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
|
||||
- if (row >= n_rows) {
|
||||
- return;
|
||||
- }
|
||||
+const float INFINITY = 1.0 / 0.0;
|
||||
|
||||
- const uint logits_offset = n_experts * row;
|
||||
- const uint weights_offset = n_expert_used * row;
|
||||
- const uint ids_offset = n_experts * row;
|
||||
-
|
||||
- float logits_r[experts_per_thread];
|
||||
-
|
||||
- const float INFINITY = 1.0 / 0.0;
|
||||
+// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
|
||||
+void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit, const uint lane, const bool use_limit) {
|
||||
+ float max_val = -INFINITY;
|
||||
|
||||
[[unroll]]
|
||||
- for (uint i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
- const uint expert = i + gl_LocalInvocationID.x;
|
||||
- logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[logits_offset + expert] : -INFINITY;
|
||||
+ for (int i = 0; i < experts_per_thread; i++) {
|
||||
+ const uint idx = lane + i * WARP_SIZE;
|
||||
+ const bool is_active = !use_limit || (idx < limit);
|
||||
+ if (is_active) {
|
||||
+ max_val = max(max_val, vals[i]);
|
||||
+ }
|
||||
}
|
||||
|
||||
- float max_val = logits_r[0];
|
||||
+ max_val = subgroupMax(max_val);
|
||||
+
|
||||
+ float sum = 0.f;
|
||||
|
||||
[[unroll]]
|
||||
- for (int i = 1; i < experts_per_thread; i++) {
|
||||
- const float val = logits_r[i];
|
||||
- max_val = max(val, max_val);
|
||||
+ for (int i = 0; i < experts_per_thread; i++) {
|
||||
+ const uint idx = lane + i * WARP_SIZE;
|
||||
+ const bool is_active = !use_limit || (idx < limit);
|
||||
+ if (is_active) {
|
||||
+ const float val = exp(vals[i] - max_val);
|
||||
+ vals[i] = val;
|
||||
+ sum += val;
|
||||
+ } else {
|
||||
+ vals[i] = 0.f;
|
||||
+ }
|
||||
}
|
||||
|
||||
- max_val = subgroupMax(max_val);
|
||||
+ sum = subgroupAdd(sum);
|
||||
|
||||
- float wt[experts_per_thread];
|
||||
- float tmp = 0.f;
|
||||
+ const float inv_sum = 1.0f / sum;
|
||||
|
||||
[[unroll]]
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
- const float val = logits_r[i];
|
||||
- wt[i] = exp(val - max_val);
|
||||
- tmp += wt[i];
|
||||
+ const uint idx = lane + i * WARP_SIZE;
|
||||
+ const bool is_active = !use_limit || (idx < limit);
|
||||
+ if (is_active) {
|
||||
+ vals[i] *= inv_sum;
|
||||
+ }
|
||||
}
|
||||
+}
|
||||
|
||||
- tmp = subgroupAdd(tmp);
|
||||
+void main() {
|
||||
+ const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
|
||||
+ if (row >= n_rows) {
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
- const float inv_sum = 1.0f / tmp;
|
||||
+ const uint logits_offset = n_experts * row;
|
||||
+ const uint weights_offset = n_expert_used * row;
|
||||
+ const uint ids_offset = n_experts * row;
|
||||
+
|
||||
+ float wt[experts_per_thread];
|
||||
|
||||
[[unroll]]
|
||||
- for (int i = 0; i < experts_per_thread; i++) {
|
||||
- wt[i] = wt[i] * inv_sum;
|
||||
+ for (uint i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
+ const uint expert = i + gl_LocalInvocationID.x;
|
||||
+ wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
|
||||
+ }
|
||||
+
|
||||
+ if (!late_softmax) {
|
||||
+ softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
|
||||
}
|
||||
|
||||
// at this point, each thread holds a portion of softmax,
|
||||
@@ -82,6 +104,11 @@ void main() {
|
||||
|
||||
float output_weights[experts_per_thread];
|
||||
|
||||
+ [[unroll]]
|
||||
+ for (int i = 0; i < experts_per_thread; i++) {
|
||||
+ output_weights[i] = 0.f;
|
||||
+ }
|
||||
+
|
||||
for (int k = 0; k < n_expert_used; k++) {
|
||||
float max_val = wt[0];
|
||||
uint max_expert = gl_LocalInvocationID.x;
|
||||
@@ -121,6 +148,7 @@ void main() {
|
||||
|
||||
if (with_norm) {
|
||||
wt_sum = subgroupAdd(wt_sum);
|
||||
+ wt_sum = clamp(wt_sum, clamp_min, clamp_max);
|
||||
const float inv_sum = 1.0f / wt_sum;
|
||||
|
||||
[[unroll]]
|
||||
@@ -129,6 +157,10 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
+ if (late_softmax) {
|
||||
+ softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
|
||||
+ }
|
||||
+
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < experts_per_thread; ++i) {
|
||||
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,85 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jeff Bolz <jbolz@nvidia.com>
|
||||
Date: Thu, 30 Oct 2025 01:27:41 -0500
|
||||
Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
|
||||
|
||||
---
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 ++++
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp | 16 ++++++++++++----
|
||||
2 files changed, 16 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index db92a7901..e959674d1 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {
|
||||
|
||||
struct vk_op_argsort_push_constants {
|
||||
uint32_t ncols;
|
||||
+ uint32_t nrows;
|
||||
int32_t order;
|
||||
};
|
||||
|
||||
@@ -8710,6 +8711,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
|
||||
+ elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
break;
|
||||
case GGML_OP_IM2COL:
|
||||
{
|
||||
@@ -9952,9 +9954,11 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
int32_t * op_params = (int32_t *)dst->op_params;
|
||||
|
||||
uint32_t ncols = src0->ne[0];
|
||||
+ uint32_t nrows = ggml_nrows(src0);
|
||||
|
||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
||||
ncols,
|
||||
+ nrows,
|
||||
op_params[0],
|
||||
}, dryrun);
|
||||
}
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
|
||||
index c81b84452..c4e68bc02 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
|
||||
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
+ uint nrows;
|
||||
uint order;
|
||||
} p;
|
||||
|
||||
@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
|
||||
dst_row[idx1] = tmp;
|
||||
}
|
||||
|
||||
-void argsort(bool needs_bounds_check) {
|
||||
+void argsort(bool needs_bounds_check, const uint row) {
|
||||
// bitonic sort
|
||||
const int col = int(gl_LocalInvocationID.x);
|
||||
- const uint row = gl_WorkGroupID.y;
|
||||
|
||||
const uint row_offset = row * p.ncols;
|
||||
|
||||
@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
|
||||
|
||||
void main() {
|
||||
if (p.ncols == BLOCK_SIZE) {
|
||||
- argsort(false);
|
||||
+ uint row = gl_WorkGroupID.y;
|
||||
+ while (row < p.nrows) {
|
||||
+ argsort(false, row);
|
||||
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
|
||||
+ }
|
||||
} else {
|
||||
- argsort(true);
|
||||
+ uint row = gl_WorkGroupID.y;
|
||||
+ while (row < p.nrows) {
|
||||
+ argsort(true, row);
|
||||
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
@@ -1,77 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Ruben Ortlam <picard12@live.de>
|
||||
Date: Fri, 31 Oct 2025 08:14:49 +0100
|
||||
Subject: [PATCH] vulkan: fix shmem overrun in mmq id shader (#16873)
|
||||
|
||||
* vulkan: fix shmem overrun in mmq id shader
|
||||
|
||||
* metal : fix mul_mm_id
|
||||
|
||||
---------
|
||||
|
||||
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
|
||||
---
|
||||
ggml/src/ggml-metal/ggml-metal-device.cpp | 2 +-
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp | 4 ++++
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl | 2 +-
|
||||
tests/test-backend-ops.cpp | 3 +++
|
||||
4 files changed, 9 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
|
||||
index 758116342..c78082ac3 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
|
||||
@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
|
||||
- snprintf(name, 256, "%s", base);
|
||||
+ snprintf(name, 256, "%s_ne02=%d", base, ne02);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (res) {
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
|
||||
index 8b238ac4b..d955b4fc7 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
|
||||
@@ -82,9 +82,13 @@ layout (constant_id = 10) const uint WARP = 32;
|
||||
|
||||
#include "mul_mmq_shmem_types.glsl"
|
||||
|
||||
+#ifdef MUL_MAT_ID
|
||||
+#define BK_STEP 1
|
||||
+#else
|
||||
#ifndef BK_STEP
|
||||
#define BK_STEP 4
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
// Shared memory cache
|
||||
shared block_a_cache buf_a[BM * BK_STEP];
|
||||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
|
||||
index 72fec4404..1c0f5306f 100644
|
||||
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
|
||||
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
|
||||
@@ -27,7 +27,7 @@ struct block_a_cache {
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
#define QUANT_R_MMQ 1
|
||||
// AMD likes 4, Intel likes 1 and Nvidia likes 2
|
||||
-#define BK_STEP 1
|
||||
+// #define BK_STEP 1
|
||||
struct block_a_cache {
|
||||
int32_t qs[32/4];
|
||||
FLOAT_TYPE dm;
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index 657b6cc2f..1f8dda383 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -6722,6 +6722,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
|
||||
|
||||
+ // gpt-oss issue with Vulkan mmq_id
|
||||
+ test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
+
|
||||
for (ggml_type type_a : base_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
||||
for (int n_mats : {4, 8}) {
|
||||
@@ -1,80 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Masato Nakasaka <masato.nakasaka@intel.com>
|
||||
Date: Fri, 31 Oct 2025 16:18:59 +0900
|
||||
Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
|
||||
supported (#16796)
|
||||
|
||||
* Experimenting crash fix
|
||||
|
||||
* added assert for aborting and fixed comment
|
||||
|
||||
* changed to check if a pipeline is empty or not
|
||||
|
||||
* Moved function in class definition
|
||||
|
||||
* replaced with is_empty
|
||||
|
||||
* Modified is_empty to check only unaligned pipelines
|
||||
---
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 +++++++++++++-------
|
||||
1 file changed, 13 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index e959674d1..903050b0b 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
|
||||
struct vk_matmul_pipeline_struct {
|
||||
vk_pipeline l, m, s;
|
||||
vk_pipeline a_l, a_m, a_s;
|
||||
+ // Returns true when all unaligned pipelines are null.
|
||||
+ // We only check for unaligned variants since one of the unaligned pipelines must exist
|
||||
+ // while aligned pipelines are optional
|
||||
+ bool is_empty() const {
|
||||
+ return l == nullptr && m == nullptr && s == nullptr;
|
||||
+ }
|
||||
};
|
||||
-
|
||||
typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
|
||||
|
||||
struct vk_matmul_pipeline2 {
|
||||
@@ -5080,7 +5085,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
||||
if (src1_type == GGML_TYPE_Q8_1) {
|
||||
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc;
|
||||
|
||||
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
|
||||
+ if (pipelines->is_empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -5229,7 +5234,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||
if (src1_type == GGML_TYPE_Q8_1) {
|
||||
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc;
|
||||
|
||||
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
|
||||
+ if (pipelines->is_empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -5264,16 +5269,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
+ vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
|
||||
// XXX TODO 'prec' is not actually allowed in mul_mat_id.
|
||||
bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/;
|
||||
- bool support_fp16acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc != nullptr;
|
||||
- bool support_fp32acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc != nullptr;
|
||||
+ bool support_fp16acc = !mmp.f16acc->is_empty();
|
||||
+ bool support_fp32acc = !mmp.f32acc->is_empty();
|
||||
|
||||
if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) {
|
||||
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc;
|
||||
+ return mmp.f16acc;
|
||||
} else {
|
||||
GGML_ASSERT(support_fp32acc);
|
||||
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc;
|
||||
+ return mmp.f32acc;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user