diff --git a/llama/patches/0022-ggml-No-alloc-mode.patch b/llama/patches/0022-ggml-No-alloc-mode.patch index d03c6c84..6e2599b3 100644 --- a/llama/patches/0022-ggml-No-alloc-mode.patch +++ b/llama/patches/0022-ggml-No-alloc-mode.patch @@ -11,9 +11,9 @@ must be recreated with no-alloc set to false before loading data. ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend-impl.h | 16 +++ ggml/src/ggml-backend.cpp | 72 ++++++++++- - ggml/src/ggml-cuda/common.cuh | 48 ++++++- + ggml/src/ggml-cuda/common.cuh | 58 ++++++++- ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------ - 5 files changed, 310 insertions(+), 44 deletions(-) + 5 files changed, 320 insertions(+), 44 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 2763f2bd6..b3b5b356a 100644 @@ -219,10 +219,10 @@ index 41eef3b5f..c81a2e48a 100644 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh -index e0abde542..28d6bcd71 100644 +index e0abde542..e98044bd8 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh -@@ -35,6 +35,31 @@ +@@ -35,6 +35,41 @@ #include "vendors/cuda.h" #endif // defined(GGML_USE_HIP) @@ -246,15 +246,25 @@ index e0abde542..28d6bcd71 100644 + } +} + ++static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) { ++ if (!reserving_graph) { ++ return cudaMemsetAsync(devPtr, value, count, stream); ++ } else { ++ return cudaSuccess; ++ } ++} ++ +#undef cudaMemcpyAsync +#define cudaMemcpyAsync cudaMemcpyAsyncReserve +#undef cudaMemcpy2DAsync +#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve ++#undef cudaMemsetAsync ++#define cudaMemsetAsync cudaMemsetAsyncReserve + #define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) -@@ -856,6 +881,9 @@ struct ggml_cuda_pool { +@@ -856,6 +891,9 @@ struct ggml_cuda_pool { virtual void * alloc(size_t size, size_t * actual_size) = 0; virtual void free(void * ptr, size_t size) = 0; @@ -264,7 +274,7 @@ index e0abde542..28d6bcd71 100644 }; template -@@ -999,11 +1027,11 @@ struct ggml_backend_cuda_context { +@@ -999,11 +1037,11 @@ struct ggml_backend_cuda_context { // pool std::unique_ptr pools[GGML_CUDA_MAX_DEVICES]; @@ -278,7 +288,7 @@ index e0abde542..28d6bcd71 100644 } return *pools[device]; } -@@ -1011,4 +1039,20 @@ struct ggml_backend_cuda_context { +@@ -1011,4 +1049,20 @@ struct ggml_backend_cuda_context { ggml_cuda_pool & pool() { return pool(device); } @@ -300,7 +310,7 @@ index e0abde542..28d6bcd71 100644 + } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index f4d4a4267..ac70dcac8 100644 +index c555cd30f..eb3db0f19 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 28d6bcd7..e98044bd 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -55,10 +55,20 @@ static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const vo } } +static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) { + if (!reserving_graph) { + return cudaMemsetAsync(devPtr, value, count, stream); + } else { + return cudaSuccess; + } +} + #undef cudaMemcpyAsync #define cudaMemcpyAsync cudaMemcpyAsyncReserve #undef cudaMemcpy2DAsync #define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve +#undef cudaMemsetAsync +#define cudaMemsetAsync cudaMemsetAsyncReserve #define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)