test: harden scheduler tests (#12662)

* test: harden scheduler tests

This removes reschedDelay, which was stale code, and adds a
configurable timeout to waitForVRAMRecovery so tests can set
the timeout very short and avoid the scheduler getting stuck
long enough to hit a test timeout.
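
A minimal sketch of the pattern the test changes below follow (Scheduler,
InitScheduler, and waitForRecovery are from this change; the newTestScheduler
helper is hypothetical):

func newTestScheduler(t *testing.T) *Scheduler {
	t.Helper()
	s := InitScheduler(t.Context())
	// The production default is 5 seconds; tests shrink it so a stalled
	// VRAM-recovery wait cannot push a test past its deadline.
	s.waitForRecovery = 10 * time.Millisecond
	return s
}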

* test: tune tests for partial loads

Give stress tests more time when the model is split between CPU/GPU
Daniel Hiltgen
2025-10-17 08:56:44 -07:00
committed by GitHub
parent 270679932f
commit 68e04c7ff8
10 changed files with 195 additions and 143 deletions

View File

@@ -30,15 +30,15 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
@@ -223,15 +223,15 @@ func TestChatDebugRenderOnly(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)

View File

@@ -35,15 +35,15 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -219,15 +219,15 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{

View File

@@ -68,15 +68,15 @@ func TestGenerateChat(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
@@ -679,15 +679,15 @@ func TestGenerate(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
@@ -1104,15 +1104,15 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 	s := &Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{llama: mock}

View File

@@ -268,15 +268,15 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  100 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 100 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
@@ -419,15 +419,15 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  100 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 100 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
@@ -601,15 +601,15 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 	s := Server{
 		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      getGpuFn,
-			getCpuFn:      getCpuFn,
-			reschedDelay:  250 * time.Millisecond,
+			pendingReqCh:    make(chan *LlmRequest, 1),
+			finishedReqCh:   make(chan *LlmRequest, 1),
+			expiredCh:       make(chan *runnerRef, 1),
+			unloadedCh:      make(chan any, 1),
+			loaded:          make(map[string]*runnerRef),
+			newServerFn:     newMockServer(&mock),
+			getGpuFn:        getGpuFn,
+			getCpuFn:        getCpuFn,
+			waitForRecovery: 250 * time.Millisecond,
 			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,

View File

@@ -52,11 +52,13 @@ type Scheduler struct {
 	activeLoading llm.LlamaServer
 	loaded        map[string]*runnerRef
-	loadFn       func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
-	newServerFn  func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
-	getCpuFn     func() discover.GpuInfo
-	reschedDelay time.Duration
+	loadFn          func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
+	newServerFn     func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn        func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
+	getCpuFn        func() discover.GpuInfo
+	// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
+	waitForRecovery time.Duration
 }
 // Default automatic value for number of models we allow per GPU
@@ -69,15 +71,15 @@ var ErrMaxQueue = errors.New("server busy, please try again. maximum pending re
 func InitScheduler(ctx context.Context) *Scheduler {
 	maxQueue := envconfig.MaxQueue()
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, maxQueue),
-		finishedReqCh: make(chan *LlmRequest, maxQueue),
-		expiredCh:     make(chan *runnerRef, maxQueue),
-		unloadedCh:    make(chan any, maxQueue),
-		loaded:        make(map[string]*runnerRef),
-		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      discover.GetGPUInfo,
-		getCpuFn:      discover.GetCPUInfo,
-		reschedDelay:  250 * time.Millisecond,
+		pendingReqCh:    make(chan *LlmRequest, maxQueue),
+		finishedReqCh:   make(chan *LlmRequest, maxQueue),
+		expiredCh:       make(chan *runnerRef, maxQueue),
+		unloadedCh:      make(chan any, maxQueue),
+		loaded:          make(map[string]*runnerRef),
+		newServerFn:     llm.NewLlamaServer,
+		getGpuFn:        discover.GetGPUInfo,
+		getCpuFn:        discover.GetCPUInfo,
+		waitForRecovery: 5 * time.Second,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -650,8 +652,8 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
 	freeMemoryNow := freeMemoryBefore
 	go func() {
-		// typical convergence is 0.5-1.5s - If it takes more than 5 seconds to discover and converge, let the scheduler estimate VRAM usage
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		// typical convergence is 0.5-1.5s - If it takes too long to discover and converge, let the scheduler estimate VRAM usage
+		ctx, cancel := context.WithTimeout(context.Background(), s.waitForRecovery)
 		defer cancel()
 		ticker := time.NewTicker(250 * time.Millisecond)
 		defer ticker.Stop()
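
A self-contained sketch of the bounded wait this hunk implements, assuming a
toy scheduler type: poll on a ticker until memory recovers or the configured
timeout expires, then give up so the caller can fall back to estimated VRAM
usage. toyScheduler and recovered are illustrative names, not the real
implementation:

package main

import (
	"context"
	"fmt"
	"time"
)

// toyScheduler mirrors the shape of the change: the recovery timeout is a
// configurable field instead of a hard-coded constant.
type toyScheduler struct {
	waitForRecovery time.Duration
}

// waitForRecoveryOrTimeout polls until recovered() reports success or the
// configured timeout expires, returning false on timeout.
func (s *toyScheduler) waitForRecoveryOrTimeout(recovered func() bool) bool {
	ctx, cancel := context.WithTimeout(context.Background(), s.waitForRecovery)
	defer cancel()
	ticker := time.NewTicker(time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return false // timed out; caller falls back to estimates
		case <-ticker.C:
			if recovered() {
				return true
			}
		}
	}
}

func main() {
	s := &toyScheduler{waitForRecovery: 10 * time.Millisecond}
	// Memory never "recovers" here, so this returns false after ~10ms
	// instead of blocking for a production-length timeout.
	fmt.Println(s.waitForRecoveryOrTimeout(func() bool { return false }))
}

Making the timeout a field rather than a hard-coded 5*time.Second is what lets
InitScheduler keep a 5-second production default while the tests set it to 10
milliseconds.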

View File

@@ -26,7 +26,7 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }
-func TestInitScheduler(t *testing.T) {
+func TestSchedInit(t *testing.T) {
 	ctx, done := context.WithCancel(t.Context())
 	defer done()
 	s := InitScheduler(ctx)
@@ -35,10 +35,11 @@ func TestInitScheduler(t *testing.T) {
 	s.loadedMu.Unlock()
 }
-func TestLoad(t *testing.T) {
+func TestSchedLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	var f *ggml.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx: ctx,
@@ -167,10 +168,11 @@ func getCpuFn() discover.GpuInfo {
 	return g
 }
-func TestRequestsSameModelSameRequest(t *testing.T) {
+func TestSchedRequestsSameModelSameRequest(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
@@ -210,10 +212,11 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
 	}
 }
-func TestRequestsSimpleReloadSameModel(t *testing.T) {
+func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 5000*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
@@ -267,10 +270,11 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
 	}
 }
-func TestRequestsMultipleLoadedModels(t *testing.T) {
+func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn // 1 metal GPU
 	s.getCpuFn = getCpuFn // 1 CPU
@@ -389,7 +393,7 @@ closeWait:
 	s.loadedMu.Unlock()
 }
-func TestGetRunner(t *testing.T) {
+func TestSchedGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
 	defer done()
@@ -398,6 +402,7 @@ func TestGetRunner(t *testing.T) {
 	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
 	t.Setenv("OLLAMA_MAX_QUEUE", "1")
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	s.newServerFn = a.newServer
@@ -442,10 +447,11 @@ func TestGetRunner(t *testing.T) {
 	b.ctxDone()
 }
-func TestExpireRunner(t *testing.T) {
+func TestSchedExpireRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	req := &LlmRequest{
 		ctx: ctx,
 		model: &Model{ModelPath: "foo"},
@@ -490,13 +496,14 @@ func TestExpireRunner(t *testing.T) {
 }
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
-func TestPrematureExpired(t *testing.T) {
+func TestSchedPrematureExpired(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
 		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
 		g.TotalMemory = 24 * format.GigaByte
@@ -537,7 +544,7 @@ func TestPrematureExpired(t *testing.T) {
 	time.Sleep(5 * time.Millisecond)
 }
-func TestUseLoadedRunner(t *testing.T) {
+func TestSchedUseLoadedRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	req := &LlmRequest{
 		ctx: ctx,
@@ -564,7 +571,7 @@ func TestUseLoadedRunner(t *testing.T) {
 	require.Equal(t, req, fin)
 }
-func TestUpdateFreeSpace(t *testing.T) {
+func TestSchedUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 	gpus := discover.GpuInfoList{
@@ -597,6 +604,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 	r2 := &runnerRef{llama: llm2, gpus: gpuIDs, numParallel: 1}
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
@@ -607,7 +615,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
-func TestFindRunnerToUnload(t *testing.T) {
+func TestSchedFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
@@ -615,6 +623,7 @@ func TestFindRunnerToUnload(t *testing.T) {
 	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
@@ -627,7 +636,7 @@ func TestFindRunnerToUnload(t *testing.T) {
 	require.Equal(t, r1, resp)
 }
-func TestNeedsReload(t *testing.T) {
+func TestSchedNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
@@ -674,13 +683,14 @@ func TestNeedsReload(t *testing.T) {
 	require.False(t, resp)
 }
-func TestUnloadAllRunners(t *testing.T) {
+func TestSchedUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.unloadAllRunners()
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
@@ -696,7 +706,7 @@ func TestUnloadAllRunners(t *testing.T) {
 	require.True(t, llm2.closeCalled)
 }
-func TestUnload(t *testing.T) {
+func TestSchedUnload(t *testing.T) {
 	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
@@ -706,13 +716,14 @@ func TestUnload(t *testing.T) {
 	require.Nil(t, r2.model)
 }
-func TestAlreadyCanceled(t *testing.T) {
+func TestSchedAlreadyCanceled(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
 	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0}, nil)
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
 	require.Len(t, s.pendingReqCh, 1)