Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2024-07-05 21:48:45 +08:00
committed by GitHub
16 changed files with 228 additions and 231 deletions

View File

@@ -159,49 +159,18 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory // Runner options which must be set when the model is loaded into memory
type Runner struct { type Runner struct {
UseNUMA bool `json:"numa,omitempty"` UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"` NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"` NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"` NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"` MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"` LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"` F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"` LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"` VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap TriState `json:"use_mmap,omitempty"` UseMMap *bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"` UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"` NumThread int `json:"num_thread,omitempty"`
}
type TriState int
const (
TriStateUndefined TriState = -1
TriStateFalse TriState = 0
TriStateTrue TriState = 1
)
// UnmarshalJSON decodes a JSON boolean into the tri-state value: true becomes
// TriStateTrue and false becomes TriStateFalse. If json.Unmarshal reports an
// error, that error is returned and *b is left unchanged.
func (b *TriState) UnmarshalJSON(data []byte) error {
	var v bool
	if err := json.Unmarshal(data, &v); err != nil {
		return err
	}
	// Return as soon as the true case is stored; falling through would
	// overwrite it with TriStateFalse.
	if v {
		*b = TriStateTrue
		return nil
	}
	*b = TriStateFalse
	return nil
}
func (b *TriState) MarshalJSON() ([]byte, error) {
if *b == TriStateUndefined {
return nil, nil
}
var v bool
if *b == TriStateTrue {
v = true
}
return json.Marshal(v)
} }
// EmbeddingRequest is the request passed to [Client.Embeddings]. // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -444,19 +413,6 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue continue
} }
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
if val {
field.SetInt(int64(TriStateTrue))
} else {
field.SetInt(int64(TriStateFalse))
}
continue
}
switch field.Kind() { switch field.Kind() {
case reflect.Int: case reflect.Int:
switch t := val.(type) { switch t := val.(type) {
@@ -503,6 +459,17 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
slice[i] = str slice[i] = str
} }
field.Set(reflect.ValueOf(slice)) field.Set(reflect.ValueOf(slice))
case reflect.Pointer:
var b bool
if field.Type() == reflect.TypeOf(&b) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.Set(reflect.ValueOf(&val))
} else {
return fmt.Errorf("unknown type loading config params: %v %v", field.Kind(), field.Type())
}
default: default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind()) return fmt.Errorf("unknown type loading config params: %v", field.Kind())
} }
@@ -545,7 +512,7 @@ func DefaultOptions() Options {
LowVRAM: false, LowVRAM: false,
F16KV: true, F16KV: true,
UseMLock: false, UseMLock: false,
UseMMap: TriStateUndefined, UseMMap: nil,
UseNUMA: false, UseNUMA: false,
}, },
} }
@@ -615,19 +582,6 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else { } else {
field := valueOpts.FieldByName(opt.Name) field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() { if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() { switch field.Kind() {
case reflect.Float32: case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32) floatVal, err := strconv.ParseFloat(vals[0], 32)
@@ -655,6 +609,17 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
case reflect.Slice: case reflect.Slice:
// TODO: only string slices are supported right now // TODO: only string slices are supported right now
out[key] = vals out[key] = vals
case reflect.Pointer:
var b bool
if field.Type() == reflect.TypeOf(&b) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
out[key] = &boolVal
} else {
return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
}
default: default:
return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key) return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
} }

View File

@@ -108,25 +108,27 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
} }
func TestUseMmapParsingFromJSON(t *testing.T) { func TestUseMmapParsingFromJSON(t *testing.T) {
tr := true
fa := false
tests := []struct { tests := []struct {
name string name string
req string req string
exp TriState exp *bool
}{ }{
{ {
name: "Undefined", name: "Undefined",
req: `{ }`, req: `{ }`,
exp: TriStateUndefined, exp: nil,
}, },
{ {
name: "True", name: "True",
req: `{ "use_mmap": true }`, req: `{ "use_mmap": true }`,
exp: TriStateTrue, exp: &tr,
}, },
{ {
name: "False", name: "False",
req: `{ "use_mmap": false }`, req: `{ "use_mmap": false }`,
exp: TriStateFalse, exp: &fa,
}, },
} }
@@ -144,50 +146,52 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
} }
func TestUseMmapFormatParams(t *testing.T) { func TestUseMmapFormatParams(t *testing.T) {
tr := true
fa := false
tests := []struct { tests := []struct {
name string name string
req map[string][]string req map[string][]string
exp TriState exp *bool
err error err error
}{ }{
{ {
name: "True", name: "True",
req: map[string][]string{ req: map[string][]string{
"use_mmap": []string{"true"}, "use_mmap": {"true"},
}, },
exp: TriStateTrue, exp: &tr,
err: nil, err: nil,
}, },
{ {
name: "False", name: "False",
req: map[string][]string{ req: map[string][]string{
"use_mmap": []string{"false"}, "use_mmap": {"false"},
}, },
exp: TriStateFalse, exp: &fa,
err: nil, err: nil,
}, },
{ {
name: "Numeric True", name: "Numeric True",
req: map[string][]string{ req: map[string][]string{
"use_mmap": []string{"1"}, "use_mmap": {"1"},
}, },
exp: TriStateTrue, exp: &tr,
err: nil, err: nil,
}, },
{ {
name: "Numeric False", name: "Numeric False",
req: map[string][]string{ req: map[string][]string{
"use_mmap": []string{"0"}, "use_mmap": {"0"},
}, },
exp: TriStateFalse, exp: &fa,
err: nil, err: nil,
}, },
{ {
name: "invalid string", name: "invalid string",
req: map[string][]string{ req: map[string][]string{
"use_mmap": []string{"foo"}, "use_mmap": {"foo"},
}, },
exp: TriStateUndefined, exp: nil,
err: fmt.Errorf("invalid bool value [foo]"), err: fmt.Errorf("invalid bool value [foo]"),
}, },
} }
@@ -195,11 +199,11 @@ func TestUseMmapFormatParams(t *testing.T) {
for _, test := range tests { for _, test := range tests {
t.Run(test.name, func(t *testing.T) { t.Run(test.name, func(t *testing.T) {
resp, err := FormatParams(test.req) resp, err := FormatParams(test.req)
require.Equal(t, err, test.err) require.Equal(t, test.err, err)
respVal, ok := resp["use_mmap"] respVal, ok := resp["use_mmap"]
if test.exp != TriStateUndefined { if test.exp != nil {
assert.True(t, ok, "resp: %v", resp) assert.True(t, ok, "resp: %v", resp)
assert.Equal(t, test.exp, respVal) assert.Equal(t, *test.exp, *respVal.(*bool))
} }
}) })
} }

View File

@@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/ If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/
## Container fails to run on NVIDIA GPU ## NVIDIA GPU Discovery
Make sure you've set up the container runtime first as described in [docker.md](./docker.md) When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.
Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem ### Linux NVIDIA Troubleshooting
- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU. If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem:
- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm` - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
- Try rebooting - Try rebooting
- Make sure you're running the latest nvidia drivers - Make sure you're running the latest nvidia drivers

View File

@@ -4,12 +4,14 @@ import (
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"math"
"net" "net"
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"time"
) )
type OllamaHost struct { type OllamaHost struct {
@@ -34,17 +36,17 @@ var (
// Set via OLLAMA_HOST in the environment // Set via OLLAMA_HOST in the environment
Host *OllamaHost Host *OllamaHost
// Set via OLLAMA_KEEP_ALIVE in the environment // Set via OLLAMA_KEEP_ALIVE in the environment
KeepAlive string KeepAlive time.Duration
// Set via OLLAMA_LLM_LIBRARY in the environment // Set via OLLAMA_LLM_LIBRARY in the environment
LLMLibrary string LLMLibrary string
// Set via OLLAMA_MAX_LOADED_MODELS in the environment // Set via OLLAMA_MAX_LOADED_MODELS in the environment
MaxRunners int MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment // Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int MaxQueuedRequests int
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_MAX_VRAM in the environment // Set via OLLAMA_MAX_VRAM in the environment
MaxVRAM uint64 MaxVRAM uint64
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_NOHISTORY in the environment // Set via OLLAMA_NOHISTORY in the environment
NoHistory bool NoHistory bool
// Set via OLLAMA_NOPRUNE in the environment // Set via OLLAMA_NOPRUNE in the environment
@@ -132,6 +134,7 @@ func init() {
NumParallel = 0 // Autoselect NumParallel = 0 // Autoselect
MaxRunners = 0 // Autoselect MaxRunners = 0 // Autoselect
MaxQueuedRequests = 512 MaxQueuedRequests = 512
KeepAlive = 5 * time.Minute
LoadConfig() LoadConfig()
} }
@@ -266,7 +269,10 @@ func LoadConfig() {
} }
} }
KeepAlive = clean("OLLAMA_KEEP_ALIVE") ka := clean("OLLAMA_KEEP_ALIVE")
if ka != "" {
loadKeepAlive(ka)
}
var err error var err error
ModelsDir, err = getModelsDir() ModelsDir, err = getModelsDir()
@@ -344,3 +350,24 @@ func getOllamaHost() (*OllamaHost, error) {
Port: port, Port: port,
}, nil }, nil
} }
// loadKeepAlive parses ka and stores the result in the package-level
// KeepAlive. A plain integer is taken as a number of seconds; anything else
// is handed to time.ParseDuration. A negative result is stored as the largest
// representable duration. If ka parses as neither form, KeepAlive keeps its
// current value.
func loadKeepAlive(ka string) {
	var parsed time.Duration
	if secs, convErr := strconv.Atoi(ka); convErr == nil {
		parsed = time.Duration(secs) * time.Second
	} else {
		dur, parseErr := time.ParseDuration(ka)
		if parseErr != nil {
			// Neither form parsed: leave KeepAlive as it was.
			return
		}
		parsed = dur
	}

	if parsed < 0 {
		parsed = time.Duration(math.MaxInt64)
	}
	KeepAlive = parsed
}

View File

@@ -2,8 +2,10 @@ package envconfig
import ( import (
"fmt" "fmt"
"math"
"net" "net"
"testing" "testing"
"time"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@@ -23,6 +25,21 @@ func TestConfig(t *testing.T) {
t.Setenv("OLLAMA_FLASH_ATTENTION", "1") t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
LoadConfig() LoadConfig()
require.True(t, FlashAttention) require.True(t, FlashAttention)
t.Setenv("OLLAMA_KEEP_ALIVE", "")
LoadConfig()
require.Equal(t, 5*time.Minute, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "3")
LoadConfig()
require.Equal(t, 3*time.Second, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
LoadConfig()
require.Equal(t, 1*time.Hour, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
} }
func TestClientFromEnvironment(t *testing.T) { func TestClientFromEnvironment(t *testing.T) {

View File

@@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList {
}() }()
if !bootstrapped { if !bootstrapped {
slog.Debug("Detecting GPUs") slog.Info("looking for compatible GPUs")
needRefresh = false needRefresh = false
cpuCapability = GetCPUCapability() cpuCapability = GetCPUCapability()
var memInfo C.mem_info_t var memInfo C.mem_info_t
@@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList {
rocmGPUs = AMDGetGPUInfo() rocmGPUs = AMDGetGPUInfo()
bootstrapped = true bootstrapped = true
if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
slog.Info("no compatible GPUs were discovered")
}
} }
// For detected GPUs, load library if not loaded // For detected GPUs, load library if not loaded
@@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp) C.nvcuda_init(lib, &resp)
if resp.err != nil { if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err)) // Decide what log level based on the type of error message to help users understand why
msg := C.GoString(resp.err)
switch resp.cudaErr {
case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
case C.CUDA_ERROR_NO_DEVICE:
slog.Info("no nvidia devices detected", "library", libPath)
case C.CUDA_ERROR_UNKNOWN:
slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
default:
if strings.Contains(msg, "wrong ELF class") {
slog.Debug("skipping 32bit library", "library", libPath)
} else {
slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
}
}
C.free(unsafe.Pointer(resp.err)) C.free(unsafe.Pointer(resp.err))
} else { } else {
return int(resp.num_devices), &resp.ch, libPath return int(resp.num_devices), &resp.ch, libPath

View File

@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret; CUresult ret;
resp->err = NULL; resp->err = NULL;
resp->num_devices = 0; resp->num_devices = 0;
resp->cudaErr = CUDA_SUCCESS;
const int buflen = 256; const int buflen = 256;
char buf[buflen + 1]; char buf[buflen + 1];
int i; int i;
@@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
nvcuda_lib_path, msg); nvcuda_lib_path, msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = -1;
return; return;
} }
@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
msg); msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = -1;
return; return;
} }
} }
@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret); LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = ret;
return; return;
} }
@@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->ch.handle = NULL; resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = ret;
return; return;
} }
} }
@@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) { if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized"); resp->err = strdup("cuda driver library handle isn't initialized");
return; return;
} }
ret = (*h.cuDeviceGet)(&device, i); ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize"); snprintf(buf, buflen, "cuda driver library device failed to initialize");
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
@@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context // To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get device context %d", ret); snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total); ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret); snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
// Best effort on failure... // Best effort on failure...
(*h.cuCtxDestroy)(ctx); (*h.cuCtxDestroy)(ctx);
@@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx); ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release device context %d", ret); LOG(1, "cuda driver library failed to release device context %d", ret);
} }
} }
@@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
ret = (*h.cuDeviceGet)(&device, i); ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device failed to initialize"); LOG(1, "cuda driver library device failed to initialize");
return; return;
} }
@@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
// To get memory we have to set (and release) a context // To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to get device context %d", ret); LOG(1, "cuda driver library failed to get device context %d", ret);
return; return;
} }
ret = (*h.cuMemGetInfo_v2)(free, total); ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device memory info lookup failure %d", ret); LOG(1, "cuda driver library device memory info lookup failure %d", ret);
// Best effort on failure... // Best effort on failure...
(*h.cuCtxDestroy)(ctx); (*h.cuCtxDestroy)(ctx);
return; return;
@@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
ret = (*h.cuCtxDestroy)(ctx); ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release device context %d", ret); LOG(1, "cuda driver library failed to release device context %d", ret);
} }
} }
void nvcuda_release(nvcuda_handle_t h) { void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n"); LOG(h.verbose, "releasing cuda driver library\n");
UNLOAD_LIBRARY(h.handle); UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic? // TODO and other context release logic?
h.handle = NULL; h.handle = NULL;

View File

@@ -7,9 +7,12 @@
typedef enum cudaError_enum { typedef enum cudaError_enum {
CUDA_SUCCESS = 0, CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1, CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2, CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3, CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35, CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
CUDA_ERROR_UNKNOWN = 999,
// Other values omitted for now... // Other values omitted for now...
} CUresult; } CUresult;
@@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch; nvcuda_handle_t ch;
int num_devices; int num_devices;
CUresult cudaErr;
} nvcuda_init_resp_t; } nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp); void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);

View File

@@ -1732,7 +1732,7 @@ struct llama_server_context
slot.n_past -= 1; slot.n_past -= 1;
} }
slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past; slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
if (slot.ga_n != 1) if (slot.ga_n != 1)
{ {

View File

@@ -38,7 +38,7 @@ func Init() error {
} }
var variants []string var variants []string
for v := range availableServers() { for v := range getAvailableServers() {
variants = append(variants, v) variants = append(variants, v)
} }
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
@@ -50,7 +50,7 @@ func Init() error {
// binary names may contain an optional variant separated by '_' // binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2" // For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator // Any library without a variant is the lowest common denominator
func availableServers() map[string]string { func getAvailableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir() payloadsDir, err := gpu.PayloadsDir()
if err != nil { if err != nil {
slog.Error("payload lookup error", "error", err) slog.Error("payload lookup error", "error", err)
@@ -80,7 +80,7 @@ func availableServers() map[string]string {
// TODO - switch to metadata based mapping // TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string { func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_ // glob workDir for files that start with ollama_
availableServers := availableServers() availableServers := getAvailableServers()
requested := info.Library requested := info.Library
if info.Variant != gpu.CPUCapabilityNone { if info.Variant != gpu.CPUCapabilityNone {
requested += "_" + info.Variant.String() requested += "_" + info.Variant.String()
@@ -115,27 +115,29 @@ func serversForGpu(info gpu.GpuInfo) []string {
servers = append(servers, alt...) servers = append(servers, alt...)
} }
// Load up the best CPU variant if not primary requested if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
if info.Library != "cpu" { // Load up the best CPU variant if not primary requested
variant := gpu.GetCPUCapability() if info.Library != "cpu" {
// If no variant, then we fall back to default variant := gpu.GetCPUCapability()
// If we have a variant, try that if we find an exact match // If no variant, then we fall back to default
// Attempting to run the wrong CPU instructions will panic the // If we have a variant, try that if we find an exact match
// process // Attempting to run the wrong CPU instructions will panic the
if variant != gpu.CPUCapabilityNone { // process
for cmp := range availableServers { if variant != gpu.CPUCapabilityNone {
if cmp == "cpu_"+variant.String() { for cmp := range availableServers {
servers = append(servers, cmp) if cmp == "cpu_"+variant.String() {
break servers = append(servers, cmp)
break
}
} }
} else {
servers = append(servers, "cpu")
} }
} else {
servers = append(servers, "cpu")
} }
}
if len(servers) == 0 { if len(servers) == 0 {
servers = []string{"cpu"} servers = []string{"cpu"}
}
} }
return servers return servers
@@ -147,7 +149,7 @@ func serverForCpu() string {
return "metal" return "metal"
} }
variant := gpu.GetCPUCapability() variant := gpu.GetCPUCapability()
availableServers := availableServers() availableServers := getAvailableServers()
if variant != gpu.CPUCapabilityNone { if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers { for cmp := range availableServers {
if cmp == "cpu_"+variant.String() { if cmp == "cpu_"+variant.String() {

View File

@@ -131,7 +131,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
} }
availableServers := availableServers() availableServers := getAvailableServers()
if len(availableServers) == 0 {
if runtime.GOOS != "windows" {
slog.Warn("llama server binary disappeared, reinitializing payloads")
err = Init()
if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err
}
availableServers = getAvailableServers()
} else {
return nil, finalErr
}
}
var servers []string var servers []string
if cpuRunner != "" { if cpuRunner != "" {
servers = []string{cpuRunner} servers = []string{cpuRunner}
@@ -208,7 +221,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" && if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 && uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = api.TriStateFalse opts.UseMMap = new(bool)
*opts.UseMMap = false
} }
} }
@@ -219,10 +233,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
// Windows CUDA should not use mmap for best performance // Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing // Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache // For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == nil) ||
(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) || (gpus[0].Library == "cpu" && opts.UseMMap == nil) ||
opts.UseMMap == api.TriStateFalse { (opts.UseMMap != nil && !*opts.UseMMap) {
params = append(params, "--no-mmap") params = append(params, "--no-mmap")
} }

View File

@@ -25,7 +25,7 @@ var errorPrefixes = []string{
"CUDA error", "CUDA error",
"cudaMalloc failed", "cudaMalloc failed",
"\"ERR\"", "\"ERR\"",
"architecture", "error loading model",
} }
func (w *StatusWriter) Write(b []byte) (int, error) { func (w *StatusWriter) Write(b []byte) (int, error) {

View File

@@ -103,18 +103,9 @@ func (mp ModelPath) GetShortTagname() string {
return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag) return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
} }
// modelsDir reports the directory under which Ollama keeps its model files
// and manifests. The value is read from envconfig.ModelsDir; the returned
// error is always nil.
func modelsDir() (string, error) {
	dir := envconfig.ModelsDir
	return dir, nil
}
// GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist. // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
func (mp ModelPath) GetManifestPath() (string, error) { func (mp ModelPath) GetManifestPath() (string, error) {
dir, err := modelsDir() dir := envconfig.ModelsDir
if err != nil {
return "", err
}
return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
} }
@@ -127,10 +118,7 @@ func (mp ModelPath) BaseURL() *url.URL {
} }
func GetManifestPath() (string, error) { func GetManifestPath() (string, error) {
dir, err := modelsDir() dir := envconfig.ModelsDir
if err != nil {
return "", err
}
path := filepath.Join(dir, "manifests") path := filepath.Join(dir, "manifests")
if err := os.MkdirAll(path, 0o755); err != nil { if err := os.MkdirAll(path, 0o755); err != nil {
@@ -141,10 +129,7 @@ func GetManifestPath() (string, error) {
} }
func GetBlobsPath(digest string) (string, error) { func GetBlobsPath(digest string) (string, error) {
dir, err := modelsDir() dir := envconfig.ModelsDir
if err != nil {
return "", err
}
// only accept actual sha256 digests // only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$" pattern := "^sha256[:-][0-9a-fA-F]{64}$"

View File

@@ -9,7 +9,6 @@ import (
"io" "io"
"io/fs" "io/fs"
"log/slog" "log/slog"
"math"
"net" "net"
"net/http" "net/http"
"net/netip" "net/netip"
@@ -17,7 +16,6 @@ import (
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"slices" "slices"
"strconv"
"strings" "strings"
"syscall" "syscall"
"time" "time"
@@ -56,8 +54,6 @@ func init() {
gin.SetMode(mode) gin.SetMode(mode)
} }
var defaultSessionDuration = 5 * time.Minute
func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) { func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
opts := api.DefaultOptions() opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil { if err := opts.FromMap(model.Options); err != nil {
@@ -133,14 +129,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return return
} }
var sessionDuration time.Duration rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
var runner *runnerRef var runner *runnerRef
select { select {
case runner = <-rCh: case runner = <-rCh:
@@ -320,32 +309,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
streamResponse(c, ch) streamResponse(c, ch)
} }
// getDefaultSessionDuration resolves the session duration from
// envconfig.KeepAlive.
//
// The setting is interpreted as follows:
//   - empty: defaultSessionDuration is returned;
//   - a plain integer: treated as a number of seconds;
//   - anything else: parsed with time.ParseDuration (e.g. "10m", "1h30m");
//     if that also fails, defaultSessionDuration is returned.
//
// A negative result is replaced by the largest representable duration
// (presumably meaning "never expire" — confirm against the scheduler).
func getDefaultSessionDuration() time.Duration {
	raw := envconfig.KeepAlive
	if raw == "" {
		return defaultSessionDuration
	}

	var keepAlive time.Duration
	if secs, atoiErr := strconv.Atoi(raw); atoiErr == nil {
		// Bare integers carry no unit and are taken as seconds.
		keepAlive = time.Duration(secs) * time.Second
	} else {
		parsed, parseErr := time.ParseDuration(raw)
		if parseErr != nil {
			// Neither an integer nor a duration string: use the default.
			return defaultSessionDuration
		}
		keepAlive = parsed
	}

	if keepAlive < 0 {
		return time.Duration(math.MaxInt64)
	}
	return keepAlive
}
func (s *Server) EmbeddingsHandler(c *gin.Context) { func (s *Server) EmbeddingsHandler(c *gin.Context) {
var req api.EmbeddingRequest var req api.EmbeddingRequest
err := c.ShouldBindJSON(&req) err := c.ShouldBindJSON(&req)
@@ -380,14 +343,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
return return
} }
var sessionDuration time.Duration rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
var runner *runnerRef var runner *runnerRef
select { select {
case runner = <-rCh: case runner = <-rCh:
@@ -1318,14 +1274,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
return return
} }
var sessionDuration time.Duration rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
var runner *runnerRef var runner *runnerRef
select { select {
case runner = <-rCh: case runner = <-rCh:

View File

@@ -24,7 +24,7 @@ type LlmRequest struct {
model *Model model *Model
opts api.Options opts api.Options
origNumCtx int // Track the initial ctx request origNumCtx int // Track the initial ctx request
sessionDuration time.Duration sessionDuration *api.Duration
successCh chan *runnerRef successCh chan *runnerRef
errCh chan error errCh chan error
schedAttempts uint schedAttempts uint
@@ -75,7 +75,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
} }
// context must be canceled to decrement ref count and release the runner // context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 { if opts.NumCtx < 4 {
opts.NumCtx = 4 opts.NumCtx = 4
} }
@@ -389,7 +389,9 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
runner.expireTimer.Stop() runner.expireTimer.Stop()
runner.expireTimer = nil runner.expireTimer = nil
} }
runner.sessionDuration = pending.sessionDuration if pending.sessionDuration != nil {
runner.sessionDuration = pending.sessionDuration.Duration
}
pending.successCh <- runner pending.successCh <- runner
go func() { go func() {
<-pending.ctx.Done() <-pending.ctx.Done()
@@ -402,6 +404,10 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
if numParallel < 1 { if numParallel < 1 {
numParallel = 1 numParallel = 1
} }
sessionDuration := envconfig.KeepAlive
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil { if err != nil {
// some older models are not compatible with newer versions of llama.cpp // some older models are not compatible with newer versions of llama.cpp
@@ -419,7 +425,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
modelPath: req.model.ModelPath, modelPath: req.model.ModelPath,
llama: llama, llama: llama,
Options: &req.opts, Options: &req.opts,
sessionDuration: req.sessionDuration, sessionDuration: sessionDuration,
gpus: gpus, gpus: gpus,
estimatedVRAM: llama.EstimatedVRAM(), estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(), estimatedTotal: llama.EstimatedTotal(),

View File

@@ -44,7 +44,7 @@ func TestLoad(t *testing.T) {
opts: api.DefaultOptions(), opts: api.DefaultOptions(),
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
sessionDuration: 2, sessionDuration: &api.Duration{Duration: 2 * time.Second},
} }
// Fail to load model first // Fail to load model first
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
@@ -142,7 +142,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
ctx: scenario.ctx, ctx: scenario.ctx,
model: model, model: model,
opts: api.DefaultOptions(), opts: api.DefaultOptions(),
sessionDuration: 5 * time.Millisecond, sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
@@ -156,18 +156,18 @@ func TestRequests(t *testing.T) {
// Same model, same request // Same model, same request
scenario1a := newScenario(t, ctx, "ollama-model-1", 10) scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
scenario1a.req.sessionDuration = 5 * time.Millisecond scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
scenario1b := newScenario(t, ctx, "ollama-model-1", 11) scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
scenario1b.req.model = scenario1a.req.model scenario1b.req.model = scenario1a.req.model
scenario1b.ggml = scenario1a.ggml scenario1b.ggml = scenario1a.ggml
scenario1b.req.sessionDuration = 0 scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
// simple reload of same model // simple reload of same model
scenario2a := newScenario(t, ctx, "ollama-model-1", 20) scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
tmpModel := *scenario1a.req.model tmpModel := *scenario1a.req.model
scenario2a.req.model = &tmpModel scenario2a.req.model = &tmpModel
scenario2a.ggml = scenario1a.ggml scenario2a.ggml = scenario1a.ggml
scenario2a.req.sessionDuration = 5 * time.Millisecond scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
// Multiple loaded models // Multiple loaded models
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte) scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -318,11 +318,11 @@ func TestGetRunner(t *testing.T) {
defer done() defer done()
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10) scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
scenario1a.req.sessionDuration = 0 scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10) scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
scenario1b.req.sessionDuration = 0 scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10) scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
scenario1c.req.sessionDuration = 0 scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
envconfig.MaxQueuedRequests = 1 envconfig.MaxQueuedRequests = 1
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList { s.getGpuFn = func() gpu.GpuInfoList {
@@ -402,7 +402,7 @@ func TestPrematureExpired(t *testing.T) {
case <-ctx.Done(): case <-ctx.Done():
t.Fatal("timeout") t.Fatal("timeout")
} }
time.Sleep(scenario1a.req.sessionDuration) time.Sleep(scenario1a.req.sessionDuration.Duration)
scenario1a.ctxDone() scenario1a.ctxDone()
time.Sleep(20 * time.Millisecond) time.Sleep(20 * time.Millisecond)
require.LessOrEqual(t, len(s.finishedReqCh), 1) require.LessOrEqual(t, len(s.finishedReqCh), 1)
@@ -423,7 +423,7 @@ func TestUseLoadedRunner(t *testing.T) {
ctx: ctx, ctx: ctx,
opts: api.DefaultOptions(), opts: api.DefaultOptions(),
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
sessionDuration: 2, sessionDuration: &api.Duration{Duration: 2},
} }
finished := make(chan *LlmRequest) finished := make(chan *LlmRequest)
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
@@ -614,7 +614,7 @@ func TestAlreadyCanceled(t *testing.T) {
dctx, done2 := context.WithCancel(ctx) dctx, done2 := context.WithCancel(ctx)
done2() done2()
scenario1a := newScenario(t, dctx, "ollama-model-1", 10) scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
scenario1a.req.sessionDuration = 0 scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
s := InitScheduler(ctx) s := InitScheduler(ctx)
slog.Info("scenario1a") slog.Info("scenario1a")
s.pendingReqCh <- scenario1a.req s.pendingReqCh <- scenario1a.req