mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-21 22:33:56 +00:00
feat: llama.cpp bump (17f7f4) for SSM performance improvements (#13408)
* feat: Bump llama.cpp to the latest master (17f7f4b) This brings in significant improvements to prefill performance for all models using the SSM_CONV and SSM_SCAN ops (granite4, jamba, falcon-h, nemotron-h, Qwen3 Next) on Apple Metal. See https://github.com/ggml-org/llama.cpp/pull/17876 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update patches 1-4 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Update patches 5-12 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update patches 13-18 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update patch 20 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update patches 21-31 Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Sync vendored code The two files I'm not sure about here are the swap from gemma3-iswa.cpp to gemma3.cpp (I chose to include this because I think it's required), and the inclusion of `ggml-zendnn.h` which I chose to omit. Branch: LlamaCPPMetalSSMImprovements Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
This commit is contained in:
@@ -22,10 +22,10 @@ index c54ff98bf..229bf387b 100644
|
||||
size_t memory_total;
|
||||
// device type
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 8f3b1c173..e803f4af6 100644
|
||||
index 5145c1e88..f641c1016 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -185,6 +185,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -77,7 +77,7 @@ index 8f3b1c173..e803f4af6 100644
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
@@ -251,22 +296,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
@@ -108,7 +108,7 @@ index 8f3b1c173..e803f4af6 100644
|
||||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -4048,6 +4095,7 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
@@ -116,7 +116,7 @@ index 8f3b1c173..e803f4af6 100644
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -4136,6 +4184,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
|
||||
@@ -128,7 +128,7 @@ index 8f3b1c173..e803f4af6 100644
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -4176,6 +4229,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
@@ -136,7 +136,7 @@ index 8f3b1c173..e803f4af6 100644
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
@@ -4767,6 +4821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4833,6 +4887,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
Reference in New Issue
Block a user