From 408c2f99d0522a0a74f522a913b987b3ceb31b75 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 5 Nov 2025 08:12:15 -0800
Subject: [PATCH] log: trace logging for scheduler (#12961)

---
 server/sched.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/server/sched.go b/server/sched.go
index 1c04047e..5ae42efd 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -143,6 +143,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
+			logutil.Trace("processing incoming request", "model", pending.model.ModelPath)
 
 			for {
 				var runnerToExpire *runnerRef
@@ -161,6 +162,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						runnerToExpire = runner
 					} else {
 						// Runner is usable, return it
+						logutil.Trace("using existing loaded runner", "model", pending.model.ModelPath)
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
@@ -174,13 +176,16 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					if pending.opts.NumGPU == 0 {
 						gpus = []ml.DeviceInfo{}
 					} else {
+						logutil.Trace("refreshing GPU list", "model", pending.model.ModelPath)
 						gpus = s.getGpuFn(ctx, runnersSnapshot)
 					}
+					logutil.Trace("refreshing system information", "model", pending.model.ModelPath)
 					systemInfo := s.getSystemInfoFn()
 					if maxRunners <= 0 {
 						// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
 						if pending.opts.NumGPU == 0 {
 							// Need to get actual GPU list to set the correct default max models
+							logutil.Trace("refreshing GPU list", "model", pending.model.ModelPath)
 							g := s.getGpuFn(ctx, runnersSnapshot)
 							maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
 						} else {
@@ -190,6 +195,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					}
 
 					// Load model for fitting
+					logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
 					ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
 					if err != nil {
 						pending.errCh <- err
@@ -197,6 +203,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					}
 
 					// Update free memory from currently loaded models
+					logutil.Trace("updating free space", "gpu_count", len(gpus), "model", pending.model.ModelPath)
 					s.updateFreeSpace(gpus)
 
 					if loadedCount == 0 {
@@ -208,7 +215,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
 					// More than one loaded model, so we have to see if the
 					// new one fits
-
+					logutil.Trace("loading additional model", "model", pending.model.ModelPath)
 					needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
 					if !needEvict {
 						slog.Debug("new model fits with existing models, loading")