Mirror of https://github.com/likelovewant/ollama-for-amd.git, synced 2025-12-23 23:18:26 +00:00
ggml: Enable op_offload to improve partial offload performance
When a model is partially offloaded to system RAM, the calculations for the CPU-resident layers can either run on the CPU or the data can be temporarily transferred to the GPU and computed there. Small batches tend to be faster on the CPU, large batches on the GPU. The llamarunner used the GPU in most cases, while the ollamarunner used the CPU. As a result, the ollamarunner gained token generation performance but took a large hit in prompt processing (3-10x slower).

There is an existing heuristic to dynamically switch between the two modes, but in practice it doesn't have enough information to make that decision accurately. This change supplies the scheduler with an authoritative batch size so the check can work as intended and get the best of both worlds.

Fixes #12037
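To make the tradeoff concrete, here is a minimal, purely illustrative sketch of the kind of placement decision the scheduler has to make for a layer whose weights live in system RAM. The scheduler type, the backendFor helper, and the threshold of 32 are assumptions for this example only; they are not taken from ggml or from this patch.

package main

import "fmt"

// scheduler is a toy stand-in for the placement decision described above.
// The names and the threshold of 32 are assumptions for illustration only;
// this is not ggml's actual logic.
type scheduler struct {
	opOffload bool
}

// backendFor decides where an operation on CPU-resident weights should run.
func (s scheduler) backendFor(weightsInSystemRAM bool, batchSize int) string {
	if !weightsInSystemRAM {
		return "GPU" // fully offloaded layers always run on the GPU
	}
	if s.opOffload && batchSize >= 32 {
		return "GPU" // a large batch amortizes copying the weights over
	}
	return "CPU" // small batches (token generation) stay on the CPU
}

func main() {
	before := scheduler{opOffload: false} // the ollamarunner's old behavior
	after := scheduler{opOffload: true}   // with op_offload enabled

	fmt.Println(before.backendFor(true, 512)) // CPU: the prompt processing hit
	fmt.Println(after.backendFor(true, 512))  // GPU: large prompt batches offload
	fmt.Println(after.backendFor(true, 1))    // CPU: token generation stays fast
}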
@@ -386,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		C.int(len(schedBackends)),
 		C.size_t(maxGraphNodes),
 		C._Bool(false),
-		C._Bool(false),
+		C._Bool(true),
 		C._Bool(params.AllocMemory),
 	)
@@ -749,6 +749,9 @@ type Context struct {
 	ctx *C.struct_ggml_context
 	graph *C.struct_ggml_cgraph
 
+	// batchSize is a hint to optimize processing
+	batchSize int
+
 	// buft is the buffer type used for new tensors
 	buft C.ggml_backend_buffer_type_t
 
@@ -805,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 	return c
 }
 
+func (c *Context) SetBatchSize(batchSize int) {
+	c.batchSize = batchSize
+}
+
 func (c *Context) Compute(tensors ...ml.Tensor) {
 	c.ComputeWithNotify(nil, tensors...)
 }
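The SetBatchSize hint added above is meant to be called by the runner before the graph is computed. The sketch below shows the intended usage pattern; the computeContext interface and fakeContext type are stand-ins invented for this example (the real caller works against ollama's ml.Context), and only SetBatchSize itself comes from this commit.

package main

import "fmt"

// computeContext is a hypothetical slice of ollama's ml.Context interface,
// defined here only so the example compiles on its own.
type computeContext interface {
	SetBatchSize(batchSize int)
	Compute()
}

// fakeContext is a toy implementation standing in for the real ggml-backed Context.
type fakeContext struct{ batchSize int }

func (c *fakeContext) SetBatchSize(n int) { c.batchSize = n }
func (c *fakeContext) Compute() {
	fmt.Println("compute with batch size hint:", c.batchSize)
}

func main() {
	inputs := make([]int32, 512) // e.g. the tokens of a prompt batch

	var ctx computeContext = &fakeContext{}
	// Report how many tokens this forward pass actually carries so the
	// scheduler can decide whether offloading CPU-resident layers pays off.
	ctx.SetBatchSize(len(inputs))
	ctx.Compute()
}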
@@ -815,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
 	if cb != nil {
 		go cb()
 	}
+
+	if c.batchSize > 0 {
+		C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize))
+	}
+
 	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
 		panic(fmt.Errorf("error computing ggml graph: %v", status))
 	}
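Note the c.batchSize > 0 guard: a context that never calls SetBatchSize leaves the field at its zero value, and the scheduler then falls back to its existing heuristic. The sketch below illustrates that hint-or-fallback pattern in isolation; the effectiveBatchSize helper and the shape-based guess are assumptions for this sketch, not the actual ggml implementation behind ggml_backend_sched_set_batch_size.

package main

import "fmt"

// effectiveBatchSize illustrates the hint-or-fallback pattern used above:
// trust an explicit batch size when the caller set one (> 0), otherwise
// guess from a tensor dimension the way a shape-based heuristic would.
// The fallback and names are assumptions for this sketch, not ggml code.
func effectiveBatchSize(hint int, ne1 int64) int64 {
	if hint > 0 {
		return int64(hint)
	}
	return ne1
}

func main() {
	fmt.Println(effectiveBatchSize(0, 7))   // no hint: fall back to the guess
	fmt.Println(effectiveBatchSize(512, 7)) // hint set: use the authoritative value
}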
@@ -836,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
 }
 
 func (c *Context) Reserve() {
+	if c.batchSize > 0 {
+		C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize))
+	}
+
 	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
 
 	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
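Reserve pushes the same hint before ggml_backend_sched_reserve so that the reservation pass sees the same graph splits that Compute will later produce; otherwise the buffers reserved up front might not match the placement used at compute time. A toy illustration of that invariant, with an entirely hypothetical splitPlan helper and threshold:

package main

import "fmt"

// splitPlan is a hypothetical stand-in for the scheduler's batch-size-dependent
// decision about where CPU-resident layers run; the threshold is illustrative.
func splitPlan(batchSize int) string {
	if batchSize >= 32 {
		return "offload CPU-resident layers to the GPU"
	}
	return "keep CPU-resident layers on the CPU"
}

func main() {
	const hint = 512
	reservePlan := splitPlan(hint) // what Reserve sizes buffers for
	computePlan := splitPlan(hint) // what Compute actually runs
	fmt.Println(reservePlan == computePlan, "-", reservePlan)
}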