Mirror of https://github.com/likelovewant/ollama-for-amd.git, synced 2025-12-21 22:33:56 +00:00
ggml: Remove allocation status reporting
For each memory allocation we report the size of the (attempted) allocation and whether it succeeded or failed. The status reporting has proved not very useful in practice: systems such as Windows can automatically overflow from VRAM into RAM, resulting in allocations that succeed even when there isn't enough memory where we wanted it. As a result, this information is only used for debug logging, which isn't worthwhile for the amount of code involved. It also isn't fully accurate, since a group of allocations may fail only partially.
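To make the effect of the change concrete, here is a small self-contained Go sketch (not part of the commit; the DeviceMemory fields and the sumMemory/Size helpers mirror the new code in the diff below, while the byte counts are made-up values purely for illustration) showing how memory requirements are totalled once per-allocation status is gone:

package main

import "fmt"

// After this change, DeviceMemory records only sizes in bytes; there is no
// longer an Unallocated/Failed/Allocated status attached to each entry.
type DeviceMemory struct {
	Weights []uint64 // per-layer memory for the model weights
	Cache   []uint64 // per-layer memory for the KV cache
	Graph   uint64   // compute graph size (not per-layer)
}

// sumMemory mirrors the helper introduced in the diff: it simply adds up
// sizes, with no filtering on allocation status.
func sumMemory(mem []uint64) uint64 {
	var sum uint64
	for _, m := range mem {
		sum += m
	}
	return sum
}

// Size mirrors the new DeviceMemory.Size: the total memory required by this
// device, regardless of whether any of it has actually been allocated yet.
func (m DeviceMemory) Size() uint64 {
	return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
}

func main() {
	// Hypothetical per-layer sizes, purely for illustration.
	m := DeviceMemory{
		Weights: []uint64{512 << 20, 512 << 20},
		Cache:   []uint64{64 << 20, 64 << 20},
		Graph:   128 << 20,
	}
	fmt.Printf("total required: %d bytes\n", m.Size())
}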
@@ -158,40 +158,6 @@ func (e ErrNoMem) Error() string {
 	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
 }
 
-type AllocationStatus int
-
-const (
-	// Unallocated memory - have not yet attempted to allocate
-	Unallocated AllocationStatus = iota
-
-	// Failed memory - tried to allocate the memory and did not succeed
-	Failed
-
-	// Allocated memory = tried and succeeded to allocate memory
-	Allocated
-)
-
-// Memory is the size of an allocation and whether it was successful.
-type Memory struct {
-	Size   uint64
-	Status AllocationStatus
-}
-
-func (m Memory) String() string {
-	s := fmt.Sprint(m.Size)
-
-	switch m.Status {
-	case Unallocated:
-		s += "U"
-	case Failed:
-		s += "F"
-	case Allocated:
-		s += "A"
-	}
-
-	return s
-}
-
 // DeviceMemory provides a breakdown of the memory needed
 // per device, such as a CPU or GPU.
 type DeviceMemory struct {
@@ -204,39 +170,32 @@ type DeviceMemory struct {
 	ID string
 
 	// Weights is the per-layer memory needed for the model weights.
-	Weights []Memory
+	Weights []uint64
 
 	// Cache is the per-layer memory needed for the KV cache.
-	Cache []Memory
+	Cache []uint64
 
 	// Graph is the size of the compute graph. It is not per-layer.
-	Graph Memory
+	Graph uint64
 }
 
-// Allocated returns the total size of the memory that has been successfully
-// allocated on this device
-func (m DeviceMemory) Allocated() uint64 {
-	var mem uint64
+func sumMemory(mem []uint64) uint64 {
+	var sum uint64
 
-	for _, w := range m.Weights {
-		if w.Status == Allocated {
-			mem += w.Size
-		}
-	}
-	for _, c := range m.Cache {
-		if c.Status == Allocated {
-			mem += c.Size
-		}
-	}
-	if m.Graph.Status == Allocated {
-		mem += m.Graph.Size
+	for _, m := range mem {
+		sum += m
 	}
 
-	return mem
+	return sum
 }
 
-func memoryPresent(mem []Memory) bool {
-	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
+// Size returns the total size of the memory required by this device
+func (m DeviceMemory) Size() uint64 {
+	return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
+}
+
+func memoryPresent(mem []uint64) bool {
+	return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
 }
 
 func (m DeviceMemory) LogValue() slog.Value {
@@ -249,7 +208,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 		attrs = append(attrs, slog.Any("Cache", m.Cache))
 	}
 
-	if m.Graph.Size != 0 {
+	if m.Graph != 0 {
 		attrs = append(attrs, slog.Any("Graph", m.Graph))
 	}
 
@@ -267,7 +226,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 // accommodate that to make forward progress.
 type BackendMemory struct {
 	// InputWeights are always located on the CPU and cannot be moved
-	InputWeights Memory
+	InputWeights uint64
 
 	// CPU model components are located in system memory. This does not
 	// include unified memory allocated through the GPU.
@@ -279,7 +238,7 @@ type BackendMemory struct {
 
 func (m BackendMemory) LogValue() slog.Value {
 	var attrs []slog.Attr
-	if m.InputWeights.Size != 0 {
+	if m.InputWeights != 0 {
 		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
 	}
 
@@ -291,17 +250,7 @@ func (m BackendMemory) LogValue() slog.Value {
 	return slog.GroupValue(attrs...)
 }
 
-func sumMemory(mem []Memory) uint64 {
-	var sum uint64
-
-	for _, m := range mem {
-		sum += m.Size
-	}
-
-	return sum
-}
-
-// Log prints a high level summary of the memory (allocated or not)
+// Log prints a high level summary of the memory
 func (m BackendMemory) Log(level slog.Level) {
 	var total uint64
 
@@ -311,7 +260,7 @@ func (m BackendMemory) Log(level slog.Level) {
 			total += sum
 		}
 	}
-	if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
+	if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
 		slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
 		total += sum
 	}
@@ -328,12 +277,12 @@ func (m BackendMemory) Log(level slog.Level) {
 	}
 
 	for _, gpu := range m.GPUs {
-		if sum := gpu.Graph.Size; sum > 0 {
+		if sum := gpu.Graph; sum > 0 {
 			slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
 			total += sum
 		}
 	}
-	if sum := m.CPU.Graph.Size; sum > 0 {
+	if sum := m.CPU.Graph; sum > 0 {
 		slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
 		total += sum
 	}
@@ -169,8 +169,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
 	requiredMemory.CPU.ID = C.GoString(props.id)
-	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
-	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
+	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
+	requiredMemory.CPU.Cache = make([]uint64, blocks+1)
 
 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
@@ -188,8 +188,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
 		requiredMemory.GPUs[i].ID = C.GoString(props.id)
-		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
-		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
+		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
+		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
 	}
 
 	// inputs always use cpu
@@ -275,13 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 
 		size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
 		if layer == -1 {
-			// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
-			if params.AllocMemory {
-				requiredMemory.InputWeights.Status = ml.Allocated
-			}
-			requiredMemory.InputWeights.Size += uint64(size)
+			requiredMemory.InputWeights += uint64(size)
 		} else {
-			btDeviceMemory[bt].Weights[layer].Size += uint64(size)
+			btDeviceMemory[bt].Weights[layer] += uint64(size)
 		}
 
 		//nolint:staticcheck // TODO: check if buffer type supports this tensor
@@ -349,18 +345,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 
 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
-		if params.AllocMemory {
-			for i := range btDeviceMemory[bt].Weights {
-				if btDeviceMemory[bt].Weights[i].Size != 0 {
-					if b != nil {
-						btDeviceMemory[bt].Weights[i].Status = ml.Allocated
-					} else {
-						btDeviceMemory[bt].Weights[i].Status = ml.Failed
-					}
-				}
-			}
-		}
-
 		if b == nil {
 			for _, b := range bbs {
 				C.ggml_backend_buffer_free(b)
@@ -795,24 +779,15 @@ func (c *Context) Reserve() {
 
 	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
 	for _, bt := range c.b.schedBufts {
-		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
+		c.b.btDeviceMemory[bt].Graph = 0
 	}
 
 	for i := range c.b.schedBackends {
-		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
-
-		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
-		graph.Size += uint64(bufferStatus.size)
-		if c.b.allocMemory {
-			if bufferStatus.allocated && graph.Status != ml.Failed {
-				graph.Status = ml.Allocated
-			} else {
-				graph.Status = ml.Failed
-			}
-		}
+		bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+		c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize)
 
 		logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
-			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
+			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize)))
 	}
 
 	if !reserved {
@@ -862,16 +837,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 
 	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
 	if c.layer >= 0 {
-		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
-
-		cache.Size += uint64(size)
-		if c.b.allocMemory {
-			if b != nil {
-				cache.Status = ml.Allocated
-			} else {
-				cache.Status = ml.Failed
-			}
-		}
+		c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size)
 	}
 
 	if b == nil {
ml/backend/ggml/ggml/include/ggml-alloc.h (vendored, 7 lines changed)
@@ -65,12 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
 GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-struct ggml_allocr_buffer_status {
-    size_t size;
-    bool allocated;
-};
-GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
ml/backend/ggml/ggml/include/ggml-backend.h (vendored, 7 lines changed)
@@ -306,12 +306,7 @@ extern "C" {
    GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-   struct ggml_backend_buffer_status {
-       size_t size;
-       bool allocated;
-   };
-   GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+   GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
    GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
ml/backend/ggml/ggml/src/ggml-alloc.c (vendored, 8 lines changed)
@@ -932,7 +932,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
-struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
 
     for (int i = 0; i < buffer_id; i++) {
@@ -941,13 +941,11 @@ struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gal
             // (See above.) However, we need a different check because multiple buffers might be NULL in our
             // case and we still want to know the attempted size.
 
-            struct ggml_allocr_buffer_status status = {0, true};
-            return status;
+            return 0;
         }
     }
 
-    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-    return status;
+    return galloc->buffer_sizes[buffer_id];
 }
 
 // utils
ml/backend/ggml/ggml/src/ggml-backend.cpp (vendored, 7 lines changed)
@@ -1656,14 +1656,11 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
-   struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-   struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-
-   return status;
+   return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {