llm: Clamp batch size to context size

The context must always be able to store the current batch, so
if the user requests a context smaller than the batch size, we
should shrink the batch to match. This also fixes the
TestLongInputContext test on the new engine. (The old engine
already has this behavior.)
commit e119783e66
parent 1a558f98e2
Author:    Jesse Gross
Committer: Jesse Gross
Date:      2025-09-08 17:33:31 -07:00

3 changed files with 5 additions and 3 deletions

@@ -173,6 +173,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		opts.NumCtx = int(trainCtx)
 	}
+	opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
 	loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
 	defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
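As a standalone illustration of the clamp, here is a minimal sketch. The clampBatch helper and the sample numbers are hypothetical, not the actual server code; it only mirrors the one-line change above and assumes Go 1.21+ for the builtin min:

```go
package main

import "fmt"

// clampBatch shrinks the requested batch size so that a full batch always
// fits inside the context window. A batch larger than the context could
// never be stored, so prompt processing would fail.
// Hypothetical helper mirroring: opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
func clampBatch(numBatch, numCtx int) int {
	return min(numBatch, numCtx) // builtin min, Go 1.21+
}

func main() {
	// Example: the user requests a 128-token context while the
	// default batch size is 512; the batch is clamped down to 128.
	fmt.Println(clampBatch(512, 128)) // -> 128
}
```

Clamping at load time keeps the invariant local to server setup rather than forcing every batching code path to handle a batch that exceeds the context.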