ggml: Enable op_offload to improve partial offload performance

When a model is partially offloaded to system RAM, we can either do
the calculations on the CPU or temporarily transfer the data to the
GPU and do them there. Small batches tend to be better on the CPU,
large batches on the GPU.

The llamarunner used the GPU in most cases, while the ollamarunner
used the CPU. Although the ollamarunner saw an improvement in token
generation performance, it took a large (3-10x) hit in prompt
processing.

There is an existing heuristic to dynamically switch between these
two modes, but in practice it doesn't have enough information to make
that decision accurately. This change provides the authoritative
batch size so the check works well, getting the best of both worlds.
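
As a rough illustration of that check, here is a minimal sketch in Go
with assumed names and an assumed threshold (neither is taken from this
commit): once the scheduler knows the real batch size, deciding whether
to offload an op whose weights sit in system RAM becomes a simple
comparison.

```go
// Illustrative sketch only: decide whether an op whose weights live in
// system RAM should be run on the GPU for a given batch. The threshold
// and names here are assumptions, not part of this change.
package main

import "fmt"

// minGPUBatch is an assumed cutover point: below it, copying the weights
// to the GPU costs more than just computing on the CPU.
const minGPUBatch = 32

// offloadToGPU reports whether the transfer is likely to pay off.
func offloadToGPU(batchSize int) bool {
	return batchSize >= minGPUBatch
}

func main() {
	for _, n := range []int{1, 8, 64, 512} {
		fmt.Printf("batch size %3d -> run on GPU: %v\n", n, offloadToGPU(n))
	}
}
```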

Fixes #12037
Author: Jesse Gross
Date: 2025-10-27 16:32:05 -07:00
Committed by: Jesse Gross
Commit: afaf7ce8c3 (parent: 26465fb85f)
15 changed files with 405 additions and 128 deletions


```diff
@@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten
 	computeCtx.Forward(tensors...)
 
 	entry.data = make([][]float32, len(entry.mm))
 
+	// Multimodal processing is computationally intensive, so treat it
+	// similarly to a large batch
+	computeCtx.SetBatchSize(512)
 	if !reserve {
 		computeCtx.Compute(tensors...)
```
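
The 512 above appears to mark multimodal work as a large batch so it
lands on the GPU. As a sketch of the plumbing (a hypothetical type, not
the real ml.Context), the context only needs to record the declared size
so the backend can consult it later:

```go
// Minimal sketch, not the real ml.Context: a compute context that records
// a declared batch size for a later placement decision.
package main

import "fmt"

type computeContext struct {
	batchSize int // 0 means "not declared"; fall back to a heuristic
}

// SetBatchSize declares the effective amount of work in this graph.
func (c *computeContext) SetBatchSize(n int) { c.batchSize = n }

func main() {
	ctx := &computeContext{}
	// Multimodal encoding is compute-heavy even with few text tokens,
	// so declare it as a large batch.
	ctx.SetBatchSize(512)
	fmt.Println("declared batch size:", ctx.batchSize)
}
```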


```diff
@@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 	// Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute
 	batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs))
 	batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs))
+	nextBatch.ctx.SetBatchSize(len(batchInputs))
 	nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch)
 	if err != nil {
 		err = fmt.Errorf("failed to build graph: %w", err)
@@ -1108,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
 		return err
 	}
 
+	ctx.SetBatchSize(batchSize)
 	ctx.Forward(t).Reserve()
 
 	return nil
```
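
The reservation hunk presumably exists so that worst-case memory
estimates assume the same op placement the real forward pass will use.
A sketch under that assumption, with hypothetical names:

```go
// Hypothetical sketch: declare the maximum batch size before reserving
// the worst-case graph so ops that would be offloaded to the GPU at
// runtime are also charged against GPU memory during reservation.
package main

import "fmt"

type reserveContext struct {
	batchSize int
}

func (c *reserveContext) SetBatchSize(n int) { c.batchSize = n }

// reserveWorstCase stands in for building and reserving the largest graph
// the server could ever schedule.
func (c *reserveContext) reserveWorstCase(maxBatch int) {
	c.SetBatchSize(maxBatch)
	fmt.Printf("reserving worst-case graph for batch size %d\n", c.batchSize)
}

func main() {
	ctx := &reserveContext{}
	ctx.reserveWorstCase(512) // assumed default maximum batch size
}
```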