log: trace logging for scheduler (#12961)

This commit is contained in:
Daniel Hiltgen
2025-11-05 08:12:15 -08:00
committed by GitHub
parent 809b9c68fa
commit 408c2f99d0

View File

@@ -143,6 +143,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
logutil.Trace("processing incoming request", "model", pending.model.ModelPath)
for {
var runnerToExpire *runnerRef
@@ -161,6 +162,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
runnerToExpire = runner
} else {
// Runner is usable, return it
logutil.Trace("using existing loaded runner", "model", pending.model.ModelPath)
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
@@ -174,13 +176,16 @@ func (s *Scheduler) processPending(ctx context.Context) {
if pending.opts.NumGPU == 0 {
gpus = []ml.DeviceInfo{}
} else {
logutil.Trace("refreshing GPU list", "model", pending.model.ModelPath)
gpus = s.getGpuFn(ctx, runnersSnapshot)
}
logutil.Trace("refreshing system information", "model", pending.model.ModelPath)
systemInfo := s.getSystemInfoFn()
if maxRunners <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
if pending.opts.NumGPU == 0 {
// Need to get actual GPU list to set the correct default max models
logutil.Trace("refreshing GPU list", "model", pending.model.ModelPath)
g := s.getGpuFn(ctx, runnersSnapshot)
maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
} else {
@@ -190,6 +195,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Load model for fitting
logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
if err != nil {
pending.errCh <- err
@@ -197,6 +203,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Update free memory from currently loaded models
logutil.Trace("updating free space", "gpu_count", len(gpus), "model", pending.model.ModelPath)
s.updateFreeSpace(gpus)
if loadedCount == 0 {
@@ -208,7 +215,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// More than one loaded model, so we have to see if the
// new one fits
logutil.Trace("loading additional model", "model", pending.model.ModelPath)
needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")