feat: llama.cpp bump (17f7f4) for SSM performance improvements (#13408)

* feat: Bump llama.cpp to the latest master (17f7f4b)

This brings in significant improvements to prefill performance on Apple
Metal for all models that use the SSM_CONV and SSM_SCAN ops (granite4,
jamba, falcon-h, nemotron-h, Qwen3 Next).

See https://github.com/ggml-org/llama.cpp/pull/17876

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
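
For context on why these two ops dominate prefill: an SSM scan carries a
hidden state forward one token at a time, so a naive kernel serializes
over the whole prompt. Below is a minimal reference sketch of the
recurrence (diagonal A; all names, layouts, and simplifications are
illustrative assumptions, not llama.cpp's actual SSM_SCAN implementation):

    #include <math.h>

    // Reference SSM scan: h = exp(dt*A) * h + dt*B*x per state channel,
    // then y = C . h. The loop is sequential over tokens, which is what
    // makes prefill expensive without a fused/batched kernel.
    static void ssm_scan_ref(const float *x, const float *dt, const float *A,
                             const float *B, const float *C, float *y,
                             int n_tok, int d_state) {
        float h[64] = {0}; // hidden state; assumes d_state <= 64
        for (int t = 0; t < n_tok; t++) {
            float acc = 0.0f;
            for (int s = 0; s < d_state; s++) {
                float dA = expf(dt[t] * A[s]);                        // decay
                h[s] = dA * h[s] + dt[t] * B[t * d_state + s] * x[t]; // update
                acc += C[t * d_state + s] * h[s];                     // project
            }
            y[t] = acc;
        }
    }

The upstream PR's Metal kernels target this per-token loop for the
batched (prefill) case.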

* feat: Update patches 1-4

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Update patches 5-12

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches 13-18

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patch 20

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches 21-31

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Sync vendored code

The two files I'm not sure about here are the swap from gemma3-iswa.cpp to
gemma3.cpp (which I included because I believe it's required) and
`ggml-zendnn.h` (which I chose to omit).

Branch: LlamaCPPMetalSSMImprovements

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Author: Gabe Goodhart
Date: 2025-12-10 13:59:27 -07:00
Committed by: GitHub
Parent: c34fc64688
Commit: b95693056c
115 changed files with 5176 additions and 2585 deletions


@@ -22,10 +22,10 @@ index c54ff98bf..229bf387b 100644
size_t memory_total;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 8f3b1c173..e803f4af6 100644
+index 5145c1e88..f641c1016 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -185,6 +185,51 @@ static int ggml_cuda_parse_id(char devName[]) {
+@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
@@ -77,7 +77,7 @@ index 8f3b1c173..e803f4af6 100644
static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
-@@ -251,22 +296,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
+@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
@@ -108,7 +108,7 @@ index 8f3b1c173..e803f4af6 100644
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
-@@ -4048,6 +4095,7 @@ struct ggml_backend_cuda_device_context {
+@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
@@ -116,7 +116,7 @@ index 8f3b1c173..e803f4af6 100644
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -4136,6 +4184,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
+@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
}
#endif // defined(__linux__)
@@ -128,7 +128,7 @@ index 8f3b1c173..e803f4af6 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
-@@ -4176,6 +4229,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -136,7 +136,7 @@ index 8f3b1c173..e803f4af6 100644
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -4767,6 +4821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4833,6 +4887,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
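
Note that every hunk above changes only the patch's `index` line and `@@`
offsets: the vendored patch is being renumbered because ggml-cuda.cu
drifted by a few lines upstream, not because its payload changed. For
readability, here is the code the patch anchors to, reassembled from the
context lines shown above (not standalone-compilable; it relies on ggml's
backend headers, and the lines the patch itself adds are elided since they
do not appear above):

    struct ggml_backend_cuda_device_context {
        int         device;
        std::string name;
        std::string description;
        std::string pci_bus_id;
        // the rebased hunk inserts one additional member here
    };

    static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev,
                                                   ggml_backend_dev_props * props) {
        ggml_backend_cuda_device_context * ctx =
            (ggml_backend_cuda_device_context *) dev->context;
        props->name        = ggml_backend_cuda_device_get_name(dev);
        props->description = ggml_backend_cuda_device_get_description(dev);
        props->type        = ggml_backend_cuda_device_get_type(dev);
        props->device_id   = ctx->pci_bus_id.empty() ? nullptr
                                                     : ctx->pci_bus_id.c_str();
        ggml_backend_cuda_device_get_memory(dev, &props->memory_free,
                                            &props->memory_total);
    }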