Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-21 14:26:30 +00:00)
ggml: handle all streams (#13350)
Follow-up from #12992. Free all streams, and keep the alloc logic aligned across streams.
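For orientation, here is a minimal sketch of the per-stream pool bookkeeping the diffs below touch. It is illustrative only, not the vendored code: the simplified pool/context types and the total_alloc_size/reset helpers are stand-ins for ggml_cuda_pool, ggml_backend_cuda_context, ggml_backend_cuda_buffer_size, and ggml_backend_cuda_reset, and the constants stand in for GGML_CUDA_MAX_DEVICES / GGML_CUDA_MAX_STREAMS. The point it illustrates is that pools are created lazily per (device, stream), so accounting and cleanup have to walk every stream slot, not just curr_stream_no.

#include <cstddef>

constexpr int MAX_DEVICES = 16; // stand-in for GGML_CUDA_MAX_DEVICES
constexpr int MAX_STREAMS = 8;  // stand-in for GGML_CUDA_MAX_STREAMS

struct pool {                   // stand-in for ggml_cuda_pool
    size_t allocated = 0;
    size_t alloc_size() const { return allocated; }
};

struct context {                // stand-in for ggml_backend_cuda_context
    int device = 0;
    pool * pools[MAX_DEVICES][MAX_STREAMS] = {};

    // Size accounting must sum every stream's pool: several streams may have
    // lazily created pools at the same time, not only the current one.
    size_t total_alloc_size() const {
        size_t total = 0;
        for (int s = 0; s < MAX_STREAMS; s++) {
            if (pools[device][s] != nullptr) {
                total += pools[device][s]->alloc_size();
            }
        }
        return total;
    }

    // Reset must clear every stream slot; clearing only the current stream
    // would leave the other streams' pools behind.
    void reset() {
        for (int s = 0; s < MAX_STREAMS; s++) {
            delete pools[device][s];
            pools[device][s] = nullptr;
        }
    }
};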
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh (vendored), 12 lines changed
@@ -1221,7 +1221,11 @@ struct ggml_backend_cuda_context {
     ggml_cuda_pool & pool(int device) {
         if (pools[device][curr_stream_no] == nullptr) {
-            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
+            bool alloc = true;
+            if (pools[device][0] != nullptr) {
+                alloc = pools[device][0]->alloc_memory();
+            }
+            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
         }
         return *pools[device][curr_stream_no];
     }
@@ -1238,12 +1242,12 @@ struct ggml_backend_cuda_context {
         }
     }
 
-    size_t pool_get_alloc_size() {
-        if (pools[device][curr_stream_no] == nullptr) {
+    size_t pool_get_alloc_size(int stream_no) {
+        if (pools[device][stream_no] == nullptr) {
             return 0;
         }
 
-        return pools[device][curr_stream_no]->alloc_size();
+        return pools[device][stream_no]->alloc_size();
     }
 };
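The first hunk above is where the alloc logic is kept aligned across streams. Below is a small sketch of that decision, under the assumption that alloc_memory() reports whether a pool actually allocates device memory (as opposed to only tracking sizes); the stream_pool type and the helper name are illustrative, not the vendored API.

// Illustrative only: choose the alloc flag for a lazily created per-stream
// pool so it follows stream 0's mode instead of hard-coding `true`.
struct stream_pool {
    bool allocates = true;                            // stand-in for ggml_cuda_pool::alloc_memory()
    bool alloc_memory() const { return allocates; }
};

bool new_stream_pool_alloc_flag(stream_pool * const pools_for_device[]) {
    bool alloc = true;                                // no stream-0 pool yet: default to allocating
    if (pools_for_device[0] != nullptr) {
        alloc = pools_for_device[0]->alloc_memory();  // otherwise stay aligned with stream 0
    }
    return alloc;
}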
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu (vendored), 10 lines changed
@@ -3913,12 +3913,18 @@ static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend,
 
 static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
     ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-    return ctx->pool_get_alloc_size();
+    size_t allocs = 0;
+    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
+        allocs += ctx->pool_get_alloc_size(i);
+    }
+    return allocs;
 }
 
 static void ggml_backend_cuda_reset(ggml_backend_t backend) {
     ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
-    ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
+    for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
+        ctx->pools[ctx->device][i] = NULL;
+    }
 }
 
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {