test: harden scheduler tests (#12662)
* test: harden scheduler tests

  Removes reschedDelay, which was stale code, and adds a configurable
  timeout for waitForVRAMRecovery so tests can set the timeout very
  short, keeping a stuck scheduler from hitting the test timeout.

* test: tune tests for partial loads

  Give stress tests more time when the model is split between CPU/GPU.
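The pattern behind the first change: the scheduler's VRAM-recovery wait becomes a struct field with a production default, and each test shrinks it to 10ms right after constructing the scheduler. Below is a minimal sketch of that idea; the types, default value, and the freed callback are hypothetical simplifications, not the real Scheduler, InitScheduler, or waitForVRAMRecovery from the ollama codebase, which carry much more state.

package main

import (
	"context"
	"time"
)

// Scheduler sketch: waitForRecovery bounds how long waitForVRAMRecovery
// may poll for VRAM to be released after a model is unloaded.
type Scheduler struct {
	waitForRecovery time.Duration
}

func InitScheduler(ctx context.Context) *Scheduler {
	// Hypothetical production default: give the driver ample time to
	// report freed VRAM. Tests override this to 10 * time.Millisecond.
	return &Scheduler{waitForRecovery: 5 * time.Second}
}

// waitForVRAMRecovery polls until freed() reports the memory is back,
// or s.waitForRecovery elapses. A hard-coded timeout here is what a
// test cannot tune; making it a field lets tests fail fast instead of
// blocking until the test's own context deadline fires.
func (s *Scheduler) waitForVRAMRecovery(ctx context.Context, freed func() bool) bool {
	deadline := time.After(s.waitForRecovery)
	tick := time.NewTicker(time.Millisecond)
	defer tick.Stop()
	for {
		select {
		case <-ctx.Done():
			return false
		case <-deadline:
			return false // give up; proceed without confirmed recovery
		case <-tick.C:
			if freed() {
				return true
			}
		}
	}
}

In the diff below, every `s := InitScheduler(ctx)` gains a following `s.waitForRecovery = 10 * time.Millisecond`, which is exactly this override applied test by test.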
@@ -26,7 +26,7 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }
 
-func TestInitScheduler(t *testing.T) {
+func TestSchedInit(t *testing.T) {
 	ctx, done := context.WithCancel(t.Context())
 	defer done()
 	s := InitScheduler(ctx)
@@ -35,10 +35,11 @@ func TestInitScheduler(t *testing.T) {
 	s.loadedMu.Unlock()
 }
 
-func TestLoad(t *testing.T) {
+func TestSchedLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	var f *ggml.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx: ctx,
@@ -167,10 +168,11 @@ func getCpuFn() discover.GpuInfo {
 	return g
 }
 
-func TestRequestsSameModelSameRequest(t *testing.T) {
+func TestSchedRequestsSameModelSameRequest(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
@@ -210,10 +212,11 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
 	}
 }
 
-func TestRequestsSimpleReloadSameModel(t *testing.T) {
+func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 5000*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
@@ -267,10 +270,11 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
 	}
 }
 
-func TestRequestsMultipleLoadedModels(t *testing.T) {
+func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn // 1 metal GPU
 	s.getCpuFn = getCpuFn // 1 CPU
 
@@ -389,7 +393,7 @@ closeWait:
 	s.loadedMu.Unlock()
 }
 
-func TestGetRunner(t *testing.T) {
+func TestSchedGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
 	defer done()
 
@@ -398,6 +402,7 @@ func TestGetRunner(t *testing.T) {
 	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
 	t.Setenv("OLLAMA_MAX_QUEUE", "1")
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
 	s.newServerFn = a.newServer
@@ -442,10 +447,11 @@ func TestGetRunner(t *testing.T) {
 	b.ctxDone()
 }
 
-func TestExpireRunner(t *testing.T) {
+func TestSchedExpireRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	req := &LlmRequest{
 		ctx:   ctx,
 		model: &Model{ModelPath: "foo"},
@@ -490,13 +496,14 @@ func TestExpireRunner(t *testing.T) {
 }
 
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
-func TestPrematureExpired(t *testing.T) {
+func TestSchedPrematureExpired(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
 		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
 		g.TotalMemory = 24 * format.GigaByte
@@ -537,7 +544,7 @@ func TestPrematureExpired(t *testing.T) {
 	time.Sleep(5 * time.Millisecond)
 }
 
-func TestUseLoadedRunner(t *testing.T) {
+func TestSchedUseLoadedRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	req := &LlmRequest{
 		ctx: ctx,
@@ -564,7 +571,7 @@ func TestUseLoadedRunner(t *testing.T) {
 	require.Equal(t, req, fin)
 }
 
-func TestUpdateFreeSpace(t *testing.T) {
+func TestSchedUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 	gpus := discover.GpuInfoList{
@@ -597,6 +604,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 	r2 := &runnerRef{llama: llm2, gpus: gpuIDs, numParallel: 1}
 
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
@@ -607,7 +615,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
-func TestFindRunnerToUnload(t *testing.T) {
+func TestSchedFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 
@@ -615,6 +623,7 @@ func TestFindRunnerToUnload(t *testing.T) {
 	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
 
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
@@ -627,7 +636,7 @@ func TestFindRunnerToUnload(t *testing.T) {
 	require.Equal(t, r1, resp)
 }
 
-func TestNeedsReload(t *testing.T) {
+func TestSchedNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 
@@ -674,13 +683,14 @@ func TestNeedsReload(t *testing.T) {
 	require.False(t, resp)
 }
 
-func TestUnloadAllRunners(t *testing.T) {
+func TestSchedUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 
 	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	s.unloadAllRunners()
 
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
@@ -696,7 +706,7 @@ func TestUnloadAllRunners(t *testing.T) {
 	require.True(t, llm2.closeCalled)
 }
 
-func TestUnload(t *testing.T) {
+func TestSchedUnload(t *testing.T) {
 	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
@@ -706,13 +716,14 @@ func TestUnload(t *testing.T) {
 	require.Nil(t, r2.model)
 }
 
-func TestAlreadyCanceled(t *testing.T) {
+func TestSchedAlreadyCanceled(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
 	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0}, nil)
 	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
 	require.Len(t, s.pendingReqCh, 1)