ollamarunner: Preallocate worst case graph at startup

Currently, the KV cache and graph are lazily allocated as needed.
The cache is fully allocated on first use of the corresponding
layer whereas the graph grows with the size of the context.

This can be a problem if another application allocates more VRAM
after we have done our calculations: Ollama will then crash in the
middle of inference. If we instead allocate the maximum needed
memory at runner startup, we will either succeed or fail at that
point rather than at some surprising time in the future.

Currently, this only generates a worst-case batch for text, so
vision models may get a partial allocation and continue to lazily
allocate the rest.
Author: Jesse Gross
Date: 2025-04-03 12:50:20 -07:00
Committer: Jesse Gross
Parent: a807985e59
Commit: dbb149e6f7
10 changed files with 156 additions and 55 deletions

kvcache/cache.go

@@ -56,8 +56,9 @@ type Cache interface {
 	// StartForward is called before the start of the model's forward pass.
 	// For each token in the coming batch, there must be a corresponding
-	// entry in positions and seqs.
-	StartForward(ctx ml.Context, batch input.Batch) error
+	// entry in positions and seqs. reserve is to preallocate memory
+	// without actually storing data in the cache.
+	StartForward(ctx ml.Context, batch input.Batch, reserve bool) error
 
 	// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
 	CopyPrefix(srcSeq, dstSeq int, len int32)

kvcache/causal.go

@@ -146,51 +146,60 @@ func (c *Causal) Close() {
 	}
 }
 
-func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error {
+func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil
 
-	c.updateSlidingWindow()
-
-	var err error
-	c.curLoc, err = c.findStartLoc()
-	if errors.Is(err, ErrKvCacheFull) {
-		c.defrag()
-		c.curLoc, err = c.findStartLoc()
-	}
-	if err != nil {
-		return err
-	}
-
-	c.curCellRange = newRange()
-	for i, pos := range batch.Positions {
-		seq := batch.Sequences[i]
-
-		c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
-
-		seqRange, ok := c.cellRanges[seq]
-		if !ok {
-			seqRange = newRange()
-		}
-
-		if c.curLoc+i > seqRange.max {
-			seqRange.max = c.curLoc + i
-		}
-		if seqRange.max > c.curCellRange.max {
-			c.curCellRange.max = seqRange.max
-		}
-
-		if c.curLoc+i < seqRange.min {
-			seqRange.min = c.curLoc + i
-		}
-		if seqRange.min < c.curCellRange.min {
-			c.curCellRange.min = seqRange.min
-		}
-		c.cellRanges[seq] = seqRange
-	}
+	if !reserve {
+		c.updateSlidingWindow()
+
+		var err error
+		c.curLoc, err = c.findStartLoc()
+		if errors.Is(err, ErrKvCacheFull) {
+			c.defrag()
+			c.curLoc, err = c.findStartLoc()
+		}
+		if err != nil {
+			return err
+		}
+
+		c.curCellRange = newRange()
+		for i, pos := range batch.Positions {
+			seq := batch.Sequences[i]
+
+			c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
+
+			seqRange, ok := c.cellRanges[seq]
+			if !ok {
+				seqRange = newRange()
+			}
+
+			if c.curLoc+i > seqRange.max {
+				seqRange.max = c.curLoc + i
+			}
+			if seqRange.max > c.curCellRange.max {
+				c.curCellRange.max = seqRange.max
+			}
+
+			if c.curLoc+i < seqRange.min {
+				seqRange.min = c.curLoc + i
+			}
+			if seqRange.min < c.curCellRange.min {
+				c.curCellRange.min = seqRange.min
+			}
+			c.cellRanges[seq] = seqRange
+		}
+	} else {
+		// If we are reserving memory, don't update any of the cache metadata but set the size
+		// to the worst case.
+		c.curLoc = 0
+		c.curCellRange.min = 0
+		c.curCellRange.max = len(c.cells) - 1
+	}
 
+	var err error
 	c.curMask, err = c.buildMask(ctx)
 
 	return err

kvcache/causal_test.go

@@ -281,7 +281,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			context := backend.NewContext()
 			defer context.Close()
 
-			err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs})
+			err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs}, false)
 			if err != nil {
 				panic(err)
 			}
@@ -315,7 +315,7 @@ func TestCanResume(t *testing.T) {
 	err := cache.StartForward(context, input.Batch{
 		Positions: []int32{0, 1, 2, 3},
 		Sequences: []int{0, 0, 0, 0},
-	})
+	}, false)
 	if err != nil {
 		t.Fatalf("StartForward failed: %v", err)
 	}
@@ -342,7 +342,7 @@ func TestCanResume(t *testing.T) {
 	err = cache.StartForward(context, input.Batch{
 		Positions: []int32{4, 5},
 		Sequences: []int{0, 0},
-	})
+	}, false)
 	if err != nil {
 		t.Fatalf("StartForward failed: %v", err)
 	}
@@ -440,6 +440,8 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
 
 func (c *testContext) Compute(...ml.Tensor) {}
 
+func (c *testContext) Reserve() error { return nil }
+
 func (c *testContext) MaxGraphNodes() int {
 	return 10
 }

kvcache/encoder.go

@@ -27,6 +27,11 @@ type EncoderCache struct {
 	// anything will be stored)
 	curPos int32
 
+	// curReserve indicates that this forward pass is only for
+	// memory reservation and we should not update our metadata
+	// based on it.
+	curReserve bool
+
 	// ** cache metadata **
 
 	// was something stored in the cache?
@@ -83,12 +88,14 @@ func (c *EncoderCache) Close() {
 	}
 }
 
-func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
+func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
 	// We work with the most recent image
 	if len(batch.Multimodal) > 0 {
 		c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
 	}
 
+	c.curReserve = reserve
+
 	return nil
 }
@@ -105,8 +112,10 @@ func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
 }
 
 func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
-	c.encoderPos = c.curPos
-	c.encoderCached = true
+	if !c.curReserve {
+		c.encoderPos = c.curPos
+		c.encoderCached = true
+	}
 
 	if c.config.PermutedV {
 		value = value.Permute(ctx, 1, 2, 0, 3)

kvcache/wrapper.go

@@ -41,9 +41,9 @@ func (c *WrapperCache) Close() {
 	}
 }
 
-func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch) error {
+func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
 	for i, cache := range c.caches {
-		err := cache.StartForward(ctx, batch)
+		err := cache.StartForward(ctx, batch, reserve)
 		if err != nil {
 			// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
 			for j := i - 1; j >= 0; j-- {