llm: Use Ollama engine memory layouts for both old and new engines

Currently, both the old and new engines contain code to
calculate how much memory a model requires and to lay its
layers out onto GPUs. This change reuses the new engine's
layout code for the old engine as well, bringing the two
closer together. The old engine continues to use its existing
method of estimating required memory.

This reduces maintenance effort and improves consistency, since new
features only need to be implemented in one place. The newer code
is also more accurate, especially with multiple GPUs.
Jesse Gross
2025-11-05 14:17:09 -08:00
committed by Jesse Gross
parent 4372d0bfef
commit f560bd077f
5 changed files with 210 additions and 889 deletions
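In rough terms, the old engine now synthesizes the same ml.BackendMemory structure that the new engine gets from real allocations, then feeds it through the shared layout code. Below is a minimal sketch of that flow, condensed from the diff that follows; the helper name layoutSketch is made up for illustration (the real change does this inline in (*llamaServer).Load), and error handling, graph sizing, projector accounting, and mmap rules are omitted.

// Sketch only: mirrors the Load path added for llamaServer in this diff.
func (s *llamaServer) layoutSketch(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) (ml.GPULayersList, error) {
	// Synthesize memory data from the GGUF estimates, with every layer initially on the CPU.
	s.mem = &ml.BackendMemory{
		CPU:  ml.DeviceMemory{Name: "CPU", Weights: make([]uint64, s.totalLayers), Cache: make([]uint64, s.totalLayers)},
		GPUs: make([]ml.DeviceMemory, len(gpus)),
	}
	// ... fill s.mem.CPU.Weights and s.mem.CPU.Cache from tensor and KV cache sizes ...

	// Reuse the Ollama engine's createLayout. Iterate because the compute graph is
	// only charged to GPUs that end up receiving layers, which can change the fit.
	var gpuLayers ml.GPULayersList
	for {
		prev := gpuLayers
		var err error
		gpuLayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, 0)
		if err != nil {
			return nil, err
		}
		if len(gpuLayers) <= len(prev) {
			break
		}
		// ... add graph reservations to the GPUs in gpuLayers, then retry ...
	}

	// Move the assigned layers from the CPU columns to their GPU columns so that
	// memory is reported against the devices that will actually hold them
	// (see the loop over gpuLayers in the diff below).
	return gpuLayers, nil
}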


@@ -92,7 +92,8 @@ type llmServer struct {
numParallel int
modelPath string
loadRequest LoadRequest // Parameters used to initialize the runner
mem *ml.BackendMemory // Memory allocations for this model
// llamaModel is an instance of the cgo llama.cpp model definition
// nil if this server is running the new engine
@@ -113,15 +114,11 @@ type llmServer struct {
type llamaServer struct {
llmServer
ggml *ggml.GGML
gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate
estimate MemoryEstimate
ggml *ggml.GGML
}
type ollamaServer struct {
llmServer
mem *ml.BackendMemory
}
// LoadModel will load a model from disk. The model must be in the GGML format.
@@ -463,169 +460,226 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
if len(gpus) == 0 || s.options.NumGPU == 0 {
if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
// Synthesize memory allocation information based on our estimates
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Name: "CPU",
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range s.mem.GPUs {
s.mem.GPUs[i].Name = gpus[i].Name
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize),
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
for i := range gpus {
gpus[i].FreeMemory -= blk0.Size() + kv[0]
}
} else {
g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
if g == nil {
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return nil, ErrLoadRequiredFull
slog.Warn("model missing blk.0 layer size")
}
// Assign all the layers to the CPU for now, they will get reassigned later
for i := range s.ggml.KV().BlockCount() {
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
s.mem.CPU.Weights[i] = blk.Size()
s.mem.CPU.Cache[i] += kv[i]
}
}
// We historically haven't included InputWeights in the model size
var outputWeights uint64
if layer, ok := layers["output_norm"]; ok {
outputWeights += layer.Size()
}
if layer, ok := layers["output"]; ok {
outputWeights += layer.Size()
} else if layer, ok := layers["token_embd"]; ok {
outputWeights += layer.Size()
}
s.mem.CPU.Weights[s.totalLayers-1] = outputWeights
// The vision projector is always loaded on the first GPU if available.
// This can't be assigned by us, so just subtract it from free space
projectorGPU := -1
var projectorWeights uint64
if len(gpus) > 0 {
for _, projector := range s.loadRequest.LoraPath {
projectorWeights += projectorMemoryRequirements(projector)
}
// llama.cpp uses the first discrete GPU if available, otherwise the first iGPU
firstIntegrated := -1
for i := range gpus {
if !gpus[i].Integrated {
projectorGPU = i
break
}
if firstIntegrated == -1 {
firstIntegrated = i
}
}
gpus = g
}
s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel)
if len(gpus) >= 1 {
switch {
case s.options.NumGPU == 0:
gpus = []ml.DeviceInfo{}
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
gpus = []ml.DeviceInfo{}
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = []ml.DeviceInfo{}
case s.options.NumGPU < 0 && s.estimate.Layers > 0:
s.options.NumGPU = s.estimate.Layers
if projectorGPU == -1 {
projectorGPU = firstIntegrated
}
} else {
s.options.NumGPU = 0
gpus[projectorGPU].FreeMemory -= projectorWeights
}
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize
available := systemInfo.FreeMemory + systemInfo.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
var kvTotal uint64
for _, kvLayer := range kv {
kvTotal += kvLayer
}
if graphPartialOffload == 0 {
headsKV := s.ggml.KV().HeadCountKVMin()
if headsKV == 0 {
headsKV = 1
}
gqa := s.ggml.KV().HeadCountMax() / headsKV
graphPartialOffload = gqa * kvTotal / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
// On Metal there's no partial offload overhead
if len(gpus) > 0 && gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload
}
// Create a layout based on the memory data that we've built. The compute graph
// for GPUs is iteratively assigned based on the number of GPUs that are required.
var gpuLayers ml.GPULayersList
for {
prevGPULayers := gpuLayers
var err error
gpuLayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, 0)
if err != nil {
return nil, err
}
if len(gpuLayers) > len(prevGPULayers) {
for _, gl := range gpuLayers {
for i := range s.mem.GPUs {
if gl.DeviceID == s.mem.GPUs[i].DeviceID {
s.mem.GPUs[i].Graph = max(graphPartialOffload, graphFullOffload)
break
}
}
}
} else {
break
}
}
slog.Info("offload", "", s.estimate)
// This maintains the historical assignment of graph sizes, though it isn't fully accurate
graphSize := graphFullOffload
if gpuLayers.Sum() < int(s.totalLayers) {
graphSize = graphPartialOffload
}
s.gpus = gpus
s.loadRequest.GPULayers = createGPULayers(s.estimate, s.ggml, gpus, s.options.NumGPU)
// For all layers that we have assigned to GPUs, move them in the memory data so
// that it is reported accurately
for _, gl := range gpuLayers {
for i := range s.mem.GPUs {
if gl.DeviceID == s.mem.GPUs[i].DeviceID {
for _, l := range gl.Layers {
s.mem.GPUs[i].Weights[l] = s.mem.CPU.Weights[l]
s.mem.GPUs[i].Cache[l] = s.mem.CPU.Cache[l]
// Mmap is only supported on the llama engine
if s.textProcessor == nil {
s.loadRequest.UseMmap = true
s.mem.CPU.Weights[l] = 0
s.mem.CPU.Cache[l] = 0
}
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
s.options.UseMMap = new(bool)
*s.options.UseMMap = false
s.mem.GPUs[i].Graph = graphSize
break
}
}
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(len(gpus) == 0 && s.options.UseMMap == nil) ||
(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
s.loadRequest.UseMmap = false
if projectorGPU > 0 && len(s.mem.GPUs[projectorGPU].Weights) > 0 {
s.mem.GPUs[projectorGPU].Weights[s.totalLayers-1] += projectorWeights
}
slog.Debug("memory", "estimate", s.mem)
s.mem.Log(slog.LevelInfo)
// The llama engine uses mmap by default
s.loadRequest.UseMmap = true
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.totalLayers {
s.options.UseMMap = new(bool)
*s.options.UseMMap = false
}
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) ||
(len(gpus) == 0 && s.options.UseMMap == nil) ||
(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
s.loadRequest.UseMmap = false
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return nil, err
}
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return nil, err
}
// On the Ollama engine, we can print out a summary of the memory allocations.
// We don't have this for the llama engine but it does something similar itself.
if s.textProcessor != nil {
resp.Memory.Log(slog.LevelInfo)
}
if !resp.Success {
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
return nil, errors.New("failed to allocate memory for model")
}
// The llama engine does its memory allocations together with model loading, so we
// need to wait until it is done to ensure that we have accurate memory data before
// loading the next model
if s.textProcessor == nil {
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
} else {
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
}
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
}
// createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment
// of particular layers onto GPUs
func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList {
if numGPU <= 0 || len(gpus) == 0 {
return nil
func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, err := ggml.Decode(file, 1024)
if err != nil {
return 0
}
gpuLayers := make(ml.GPULayersList, len(gpus))
for i := range gpuLayers {
gpuLayers[i].DeviceID = gpus[i].DeviceID
for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}
var sum float32
splits := make([]float32, len(estimate.TensorSplit))
// cumulative sum of all splits
for i := range splits {
sum += float32(estimate.TensorSplit[i])
splits[i] = sum
}
if sum <= 0 {
return nil
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
blocks := int(ggml.KV().BlockCount())
gpuRangeStart := max(0, blocks-numGPU)
gpuRangeStop := min(gpuRangeStart+numGPU, blocks+1)
for i := range blocks + 1 {
if i < gpuRangeStart || i >= gpuRangeStop {
continue
}
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
if index < 0 || index >= len(gpus) {
continue
}
gpuLayers[index].Layers = append(gpuLayers[index].Layers, i)
}
return gpuLayers
return weights
}
// Load finds the optimal layout of layers to offload on GPUs based on no initial information about the size of the model
@@ -652,23 +706,6 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
for _, gpu := range gpus {
available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory()
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory()),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
}
pastAllocations := make(map[uint64]struct{})
var backoff float32
@@ -834,25 +871,22 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
// - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph
// - Assigning layers
// - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
if memory == nil {
memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}}
}
gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff)
if err != nil {
return nil, err
}
err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
if err != nil {
return nil, err
}
return gpuLayers, nil
}
func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) {
func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64) {
gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus)))
@@ -910,11 +944,11 @@ func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.Backen
gpuLayers = libraryGpuLayers
}
}
return gpuLayers, layers, nil
return gpuLayers, layers
}
// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
// These sizes will only increase as we go through additional iterations and get additional information.
cpuSize := memory.InputWeights + memory.CPU.Graph
var vramSize uint64
@@ -942,11 +976,13 @@ nextLayer:
if requireFull {
if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
return ErrLoadRequiredFull
}
if cpuSize > systemInfo.FreeMemory {
return ErrLoadRequiredFull
slog.Info("model requires more system memory than is currently available, evicting a model to make space", "required", cpuSize, "free", systemInfo.FreeMemory)
return fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
}
}
@@ -1734,31 +1770,12 @@ func (s *llmServer) Close() error {
return nil
}
func (s *llamaServer) VRAMSize() uint64 {
return s.estimate.VRAMSize
}
func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
}
}
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
func (s *llmServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
}
@@ -1786,7 +1803,7 @@ func (s *ollamaServer) VRAMSize() uint64 {
return mem
}
func (s *ollamaServer) TotalSize() uint64 {
func (s *llmServer) TotalSize() uint64 {
if s.mem == nil {
return 0
}
@@ -1800,7 +1817,7 @@ func (s *ollamaServer) TotalSize() uint64 {
return mem
}
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}