diff --git a/api/types.go b/api/types.go index 428281ba..87844c67 100644 --- a/api/types.go +++ b/api/types.go @@ -159,49 +159,18 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` - NumCtx int `json:"num_ctx,omitempty"` - NumBatch int `json:"num_batch,omitempty"` - NumGPU int `json:"num_gpu,omitempty"` - MainGPU int `json:"main_gpu,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - F16KV bool `json:"f16_kv,omitempty"` - LogitsAll bool `json:"logits_all,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` - UseMMap TriState `json:"use_mmap,omitempty"` - UseMLock bool `json:"use_mlock,omitempty"` - NumThread int `json:"num_thread,omitempty"` -} - -type TriState int - -const ( - TriStateUndefined TriState = -1 - TriStateFalse TriState = 0 - TriStateTrue TriState = 1 -) - -func (b *TriState) UnmarshalJSON(data []byte) error { - var v bool - if err := json.Unmarshal(data, &v); err != nil { - return err - } - if v { - *b = TriStateTrue - } - *b = TriStateFalse - return nil -} - -func (b *TriState) MarshalJSON() ([]byte, error) { - if *b == TriStateUndefined { - return nil, nil - } - var v bool - if *b == TriStateTrue { - v = true - } - return json.Marshal(v) + UseNUMA bool `json:"numa,omitempty"` + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + LowVRAM bool `json:"low_vram,omitempty"` + F16KV bool `json:"f16_kv,omitempty"` + LogitsAll bool `json:"logits_all,omitempty"` + VocabOnly bool `json:"vocab_only,omitempty"` + UseMMap *bool `json:"use_mmap,omitempty"` + UseMLock bool `json:"use_mlock,omitempty"` + NumThread int `json:"num_thread,omitempty"` } // EmbeddingRequest is the request passed to [Client.Embeddings]. 
@@ -444,19 +413,6 @@ func (opts *Options) FromMap(m map[string]interface{}) error { continue } - if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { - val, ok := val.(bool) - if !ok { - return fmt.Errorf("option %q must be of type boolean", key) - } - if val { - field.SetInt(int64(TriStateTrue)) - } else { - field.SetInt(int64(TriStateFalse)) - } - continue - } - switch field.Kind() { case reflect.Int: switch t := val.(type) { @@ -503,6 +459,17 @@ func (opts *Options) FromMap(m map[string]interface{}) error { slice[i] = str } field.Set(reflect.ValueOf(slice)) + case reflect.Pointer: + var b bool + if field.Type() == reflect.TypeOf(&b) { + val, ok := val.(bool) + if !ok { + return fmt.Errorf("option %q must be of type boolean", key) + } + field.Set(reflect.ValueOf(&val)) + } else { + return fmt.Errorf("unknown type loading config params: %v %v", field.Kind(), field.Type()) + } default: return fmt.Errorf("unknown type loading config params: %v", field.Kind()) } @@ -545,7 +512,7 @@ func DefaultOptions() Options { LowVRAM: false, F16KV: true, UseMLock: false, - UseMMap: TriStateUndefined, + UseMMap: nil, UseNUMA: false, }, } @@ -615,19 +582,6 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) { } else { field := valueOpts.FieldByName(opt.Name) if field.IsValid() && field.CanSet() { - if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { - boolVal, err := strconv.ParseBool(vals[0]) - if err != nil { - return nil, fmt.Errorf("invalid bool value %s", vals) - } - if boolVal { - out[key] = TriStateTrue - } else { - out[key] = TriStateFalse - } - continue - } - switch field.Kind() { case reflect.Float32: floatVal, err := strconv.ParseFloat(vals[0], 32) @@ -655,6 +609,17 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) { case reflect.Slice: // TODO: only string slices are supported right now out[key] = vals + case reflect.Pointer: + var b bool + if field.Type() == 
reflect.TypeOf(&b) { + boolVal, err := strconv.ParseBool(vals[0]) + if err != nil { + return nil, fmt.Errorf("invalid bool value %s", vals) + } + out[key] = &boolVal + } else { + return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key) + } default: return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key) } diff --git a/api/types_test.go b/api/types_test.go index 8b6c60c6..c60ed90e 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -108,25 +108,27 @@ func TestDurationMarshalUnmarshal(t *testing.T) { } func TestUseMmapParsingFromJSON(t *testing.T) { + tr := true + fa := false tests := []struct { name string req string - exp TriState + exp *bool }{ { name: "Undefined", req: `{ }`, - exp: TriStateUndefined, + exp: nil, }, { name: "True", req: `{ "use_mmap": true }`, - exp: TriStateTrue, + exp: &tr, }, { name: "False", req: `{ "use_mmap": false }`, - exp: TriStateFalse, + exp: &fa, }, } @@ -144,50 +146,52 @@ func TestUseMmapParsingFromJSON(t *testing.T) { } func TestUseMmapFormatParams(t *testing.T) { + tr := true + fa := false tests := []struct { name string req map[string][]string - exp TriState + exp *bool err error }{ { name: "True", req: map[string][]string{ - "use_mmap": []string{"true"}, + "use_mmap": {"true"}, }, - exp: TriStateTrue, + exp: &tr, err: nil, }, { name: "False", req: map[string][]string{ - "use_mmap": []string{"false"}, + "use_mmap": {"false"}, }, - exp: TriStateFalse, + exp: &fa, err: nil, }, { name: "Numeric True", req: map[string][]string{ - "use_mmap": []string{"1"}, + "use_mmap": {"1"}, }, - exp: TriStateTrue, + exp: &tr, err: nil, }, { name: "Numeric False", req: map[string][]string{ - "use_mmap": []string{"0"}, + "use_mmap": {"0"}, }, - exp: TriStateFalse, + exp: &fa, err: nil, }, { name: "invalid string", req: map[string][]string{ - "use_mmap": []string{"foo"}, + "use_mmap": {"foo"}, }, - exp: TriStateUndefined, + exp: nil, err: fmt.Errorf("invalid bool value [foo]"), }, } @@ -195,11 +199,11 @@ func 
TestUseMmapFormatParams(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { resp, err := FormatParams(test.req) - require.Equal(t, err, test.err) + require.Equal(t, test.err, err) respVal, ok := resp["use_mmap"] - if test.exp != TriStateUndefined { + if test.exp != nil { assert.True(t, ok, "resp: %v", resp) - assert.Equal(t, test.exp, respVal) + assert.Equal(t, *test.exp, *respVal.(*bool)) } }) } diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index de29b344..bbb77183 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/ -## Container fails to run on NVIDIA GPU +## NVIDIA GPU Discovery -Make sure you've set up the container runtime first as described in [docker.md](./docker.md) +When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results. -Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem +### Linux NVIDIA Troubleshooting -- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU. -- Is the uvm driver not loaded? 
`sudo nvidia-modprobe -u` +If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md) + +Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem: + +- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU. +- Is the uvm driver loaded? `sudo nvidia-modprobe -u` - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm` - Try rebooting - Make sure you're running the latest nvidia drivers diff --git a/envconfig/config.go b/envconfig/config.go index c02c4878..62d661eb 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -4,12 +4,14 @@ import ( "errors" "fmt" "log/slog" + "math" "net" "os" "path/filepath" "runtime" "strconv" "strings" + "time" ) type OllamaHost struct { @@ -34,17 +36,17 @@ var ( // Set via OLLAMA_HOST in the environment Host *OllamaHost // Set via OLLAMA_KEEP_ALIVE in the environment - KeepAlive string + KeepAlive time.Duration // Set via OLLAMA_LLM_LIBRARY in the environment LLMLibrary string // Set via OLLAMA_MAX_LOADED_MODELS in the environment MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MODELS in the environment - ModelsDir string // Set via OLLAMA_MAX_VRAM in the environment MaxVRAM uint64 + // Set via OLLAMA_MODELS in the environment + ModelsDir string // Set via OLLAMA_NOHISTORY in the environment NoHistory bool // Set via OLLAMA_NOPRUNE in the environment @@ -132,6 +134,7 @@ func init() { NumParallel = 0 // Autoselect MaxRunners = 0 // Autoselect MaxQueuedRequests = 512 + KeepAlive = 5 *
time.Minute LoadConfig() } @@ -266,7 +269,10 @@ func LoadConfig() { } } - KeepAlive = clean("OLLAMA_KEEP_ALIVE") + ka := clean("OLLAMA_KEEP_ALIVE") + if ka != "" { + loadKeepAlive(ka) + } var err error ModelsDir, err = getModelsDir() @@ -344,3 +350,24 @@ func getOllamaHost() (*OllamaHost, error) { Port: port, }, nil } + +func loadKeepAlive(ka string) { + v, err := strconv.Atoi(ka) + if err != nil { + d, err := time.ParseDuration(ka) + if err == nil { + if d < 0 { + KeepAlive = time.Duration(math.MaxInt64) + } else { + KeepAlive = d + } + } + } else { + d := time.Duration(v) * time.Second + if d < 0 { + KeepAlive = time.Duration(math.MaxInt64) + } else { + KeepAlive = d + } + } +} diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 7d923d62..a5d73fd7 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -2,8 +2,10 @@ package envconfig import ( "fmt" + "math" "net" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -23,6 +25,21 @@ func TestConfig(t *testing.T) { t.Setenv("OLLAMA_FLASH_ATTENTION", "1") LoadConfig() require.True(t, FlashAttention) + t.Setenv("OLLAMA_KEEP_ALIVE", "") + LoadConfig() + require.Equal(t, 5*time.Minute, KeepAlive) + t.Setenv("OLLAMA_KEEP_ALIVE", "3") + LoadConfig() + require.Equal(t, 3*time.Second, KeepAlive) + t.Setenv("OLLAMA_KEEP_ALIVE", "1h") + LoadConfig() + require.Equal(t, 1*time.Hour, KeepAlive) + t.Setenv("OLLAMA_KEEP_ALIVE", "-1s") + LoadConfig() + require.Equal(t, time.Duration(math.MaxInt64), KeepAlive) + t.Setenv("OLLAMA_KEEP_ALIVE", "-1") + LoadConfig() + require.Equal(t, time.Duration(math.MaxInt64), KeepAlive) } func TestClientFromEnvironment(t *testing.T) { diff --git a/gpu/gpu.go b/gpu/gpu.go index 583bb79c..29a3c103 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList { }() if !bootstrapped { - slog.Debug("Detecting GPUs") + slog.Info("looking for compatible GPUs") needRefresh = false cpuCapability = 
GetCPUCapability() var memInfo C.mem_info_t @@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList { rocmGPUs = AMDGetGPUInfo() bootstrapped = true + if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 { + slog.Info("no compatible GPUs were discovered") + } } // For detected GPUs, load library if not loaded @@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) { defer C.free(unsafe.Pointer(lib)) C.nvcuda_init(lib, &resp) if resp.err != nil { - slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err)) + // Decide what log level based on the type of error message to help users understand why + msg := C.GoString(resp.err) + switch resp.cudaErr { + case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: + slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg) + case C.CUDA_ERROR_NO_DEVICE: + slog.Info("no nvidia devices detected", "library", libPath) + case C.CUDA_ERROR_UNKNOWN: + slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg) + slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information") + default: + if strings.Contains(msg, "wrong ELF class") { + slog.Debug("skipping 32bit library", "library", libPath) + } else { + slog.Info("unable to load cuda driver library", "library", libPath, "error", msg) + } + } C.free(unsafe.Pointer(resp.err)) } else { return int(resp.num_devices), &resp.ch, libPath diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index abe14084..a1a38bfc 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { CUresult ret; resp->err = NULL; resp->num_devices = 0; + resp->cudaErr = CUDA_SUCCESS; const int buflen = 256; char buf[buflen + 1]; int i; @@ -38,6 +39,7 @@ void 
nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { nvcuda_lib_path, msg); free(msg); resp->err = strdup(buf); + resp->cudaErr = -1; return; } @@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { msg); free(msg); resp->err = strdup(buf); + resp->cudaErr = -1; return; } } @@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { LOG(resp->ch.verbose, "cuInit err: %d\n", ret); UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; - if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { - resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"); - return; - } - snprintf(buf, buflen, "nvcuda init failure: %d", ret); + snprintf(buf, buflen, "cuda driver library init failure: %d", ret); resp->err = strdup(buf); + resp->cudaErr = ret; return; } @@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { resp->ch.handle = NULL; snprintf(buf, buflen, "unable to get device count: %d", ret); resp->err = strdup(buf); + resp->cudaErr = ret; return; } } @@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; if (h.handle == NULL) { - resp->err = strdup("nvcuda handle isn't initialized"); + resp->err = strdup("cuda driver library handle isn't initialized"); return; } ret = (*h.cuDeviceGet)(&device, i); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda device failed to initialize"); + snprintf(buf, buflen, "cuda driver library device failed to initialize"); resp->err = strdup(buf); return; } @@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda failed to get device context %d", ret); + snprintf(buf, buflen, "cuda driver library failed to get 
device context %d", ret); resp->err = strdup(buf); return; } ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret); + snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret); resp->err = strdup(buf); // Best effort on failure... (*h.cuCtxDestroy)(ctx); @@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { ret = (*h.cuCtxDestroy)(ctx); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to release device context %d", ret); + LOG(1, "cuda driver library failed to release device context %d", ret); } } @@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) ret = (*h.cuDeviceGet)(&device, i); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda device failed to initialize"); + LOG(1, "cuda driver library device failed to initialize"); return; } @@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to get device context %d", ret); + LOG(1, "cuda driver library failed to get device context %d", ret); return; } ret = (*h.cuMemGetInfo_v2)(free, total); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda device memory info lookup failure %d", ret); + LOG(1, "cuda driver library device memory info lookup failure %d", ret); // Best effort on failure... 
(*h.cuCtxDestroy)(ctx); return; @@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) ret = (*h.cuCtxDestroy)(ctx); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to release device context %d", ret); + LOG(1, "cuda driver library failed to release device context %d", ret); } } void nvcuda_release(nvcuda_handle_t h) { - LOG(h.verbose, "releasing nvcuda library\n"); + LOG(h.verbose, "releasing cuda driver library\n"); UNLOAD_LIBRARY(h.handle); // TODO and other context release logic? h.handle = NULL; diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h index f9654f64..ef2fe8a3 100644 --- a/gpu/gpu_info_nvcuda.h +++ b/gpu/gpu_info_nvcuda.h @@ -7,9 +7,12 @@ typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, - CUDA_ERROR_MEMORY_ALLOCATION = 2, + CUDA_ERROR_OUT_OF_MEMORY = 2, CUDA_ERROR_NOT_INITIALIZED = 3, CUDA_ERROR_INSUFFICIENT_DRIVER = 35, + CUDA_ERROR_NO_DEVICE = 100, + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + CUDA_ERROR_UNKNOWN = 999, // Other values omitted for now... 
} CUresult; @@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp { char *err; // If err is non-null handle is invalid nvcuda_handle_t ch; int num_devices; + CUresult cudaErr; } nvcuda_init_resp_t; void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp); diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 3bc01252..09970599 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1732,7 +1732,7 @@ struct llama_server_context slot.n_past -= 1; } - slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens; if (slot.ga_n != 1) { diff --git a/llm/payload.go b/llm/payload.go index 9296db33..b402e1f2 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -38,7 +38,7 @@ func Init() error { } var variants []string - for v := range availableServers() { + for v := range getAvailableServers() { variants = append(variants, v) } slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) @@ -50,7 +50,7 @@ func Init() error { // binary names may contain an optional variant separated by '_' // For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2" // Any library without a variant is the lowest common denominator -func availableServers() map[string]string { +func getAvailableServers() map[string]string { payloadsDir, err := gpu.PayloadsDir() if err != nil { slog.Error("payload lookup error", "error", err) @@ -80,7 +80,7 @@ func availableServers() map[string]string { // TODO - switch to metadata based mapping func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ - availableServers := availableServers() + availableServers := getAvailableServers() requested := info.Library if info.Variant != gpu.CPUCapabilityNone { requested += "_" + info.Variant.String() @@ -115,27 +115,29 @@ func serversForGpu(info gpu.GpuInfo) []string { servers = append(servers, alt...) 
} - // Load up the best CPU variant if not primary requested - if info.Library != "cpu" { - variant := gpu.GetCPUCapability() - // If no variant, then we fall back to default - // If we have a variant, try that if we find an exact match - // Attempting to run the wrong CPU instructions will panic the - // process - if variant != gpu.CPUCapabilityNone { - for cmp := range availableServers { - if cmp == "cpu_"+variant.String() { - servers = append(servers, cmp) - break + if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") { + // Load up the best CPU variant if not primary requested + if info.Library != "cpu" { + variant := gpu.GetCPUCapability() + // If no variant, then we fall back to default + // If we have a variant, try that if we find an exact match + // Attempting to run the wrong CPU instructions will panic the + // process + if variant != gpu.CPUCapabilityNone { + for cmp := range availableServers { + if cmp == "cpu_"+variant.String() { + servers = append(servers, cmp) + break + } } + } else { + servers = append(servers, "cpu") } - } else { - servers = append(servers, "cpu") } - } - if len(servers) == 0 { - servers = []string{"cpu"} + if len(servers) == 0 { + servers = []string{"cpu"} + } } return servers @@ -147,7 +149,7 @@ func serverForCpu() string { return "metal" } variant := gpu.GetCPUCapability() - availableServers := availableServers() + availableServers := getAvailableServers() if variant != gpu.CPUCapabilityNone { for cmp := range availableServers { if cmp == "cpu_"+variant.String() { diff --git a/llm/server.go b/llm/server.go index 8b63cfbd..206f9e39 100644 --- a/llm/server.go +++ b/llm/server.go @@ -131,7 +131,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") } - availableServers := availableServers() + availableServers := getAvailableServers() + if len(availableServers) == 0 { + if runtime.GOOS != "windows" { + 
slog.Warn("llama server binary disappeared, reinitializing payloads") + err = Init() + if err != nil { + slog.Warn("failed to reinitialize payloads", "error", err) + return nil, err + } + availableServers = getAvailableServers() + } else { + return nil, finalErr + } + } var servers []string if cpuRunner != "" { servers = []string{cpuRunner} @@ -208,7 +221,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if g.Library == "metal" && uint64(opts.NumGPU) > 0 && uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { - opts.UseMMap = api.TriStateFalse + opts.UseMMap = new(bool) + *opts.UseMMap = false } } @@ -219,10 +233,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // Windows CUDA should not use mmap for best performance // Linux with a model larger than free space, mmap leads to thrashing // For CPU loads we want the memory to be allocated, not FS cache - if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || - (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || - (gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) || - opts.UseMMap == api.TriStateFalse { + if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == nil) || + (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == nil) || + (gpus[0].Library == "cpu" && opts.UseMMap == nil) || + (opts.UseMMap != nil && !*opts.UseMMap) { params = append(params, "--no-mmap") } diff --git a/llm/status.go b/llm/status.go index 0f56b7f9..d9f36115 100644 --- a/llm/status.go +++ b/llm/status.go @@ -25,7 +25,7 @@ var errorPrefixes = []string{ "CUDA error", "cudaMalloc failed", "\"ERR\"", - "architecture", + "error loading model", } func (w *StatusWriter) Write(b []byte) (int, error) { diff --git a/server/modelpath.go b/server/modelpath.go index 64f59c29..3fdb4238 100644 --- 
a/server/modelpath.go +++ b/server/modelpath.go @@ -103,18 +103,9 @@ func (mp ModelPath) GetShortTagname() string { return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag) } -// modelsDir returns the value of the OLLAMA_MODELS environment variable or the user's home directory if OLLAMA_MODELS is not set. -// The models directory is where Ollama stores its model files and manifests. -func modelsDir() (string, error) { - return envconfig.ModelsDir, nil -} - // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist. func (mp ModelPath) GetManifestPath() (string, error) { - dir, err := modelsDir() - if err != nil { - return "", err - } + dir := envconfig.ModelsDir return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil } @@ -127,10 +118,7 @@ func (mp ModelPath) BaseURL() *url.URL { } func GetManifestPath() (string, error) { - dir, err := modelsDir() - if err != nil { - return "", err - } + dir := envconfig.ModelsDir path := filepath.Join(dir, "manifests") if err := os.MkdirAll(path, 0o755); err != nil { @@ -141,10 +129,7 @@ func GetManifestPath() (string, error) { } func GetBlobsPath(digest string) (string, error) { - dir, err := modelsDir() - if err != nil { - return "", err - } + dir := envconfig.ModelsDir // only accept actual sha256 digests pattern := "^sha256[:-][0-9a-fA-F]{64}$" diff --git a/server/routes.go b/server/routes.go index b14a146c..ac6b713a 100644 --- a/server/routes.go +++ b/server/routes.go @@ -9,7 +9,6 @@ import ( "io" "io/fs" "log/slog" - "math" "net" "net/http" "net/netip" @@ -17,7 +16,6 @@ import ( "os/signal" "path/filepath" "slices" - "strconv" "strings" "syscall" "time" @@ -56,8 +54,6 @@ func init() { gin.SetMode(mode) } -var defaultSessionDuration = 5 * time.Minute - func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) { opts := api.DefaultOptions() 
if err := opts.FromMap(model.Options); err != nil { @@ -133,14 +129,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - var sessionDuration time.Duration - if req.KeepAlive == nil { - sessionDuration = getDefaultSessionDuration() - } else { - sessionDuration = req.KeepAlive.Duration - } - - rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration) + rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive) var runner *runnerRef select { case runner = <-rCh: @@ -320,32 +309,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { streamResponse(c, ch) } -func getDefaultSessionDuration() time.Duration { - if envconfig.KeepAlive != "" { - v, err := strconv.Atoi(envconfig.KeepAlive) - if err != nil { - d, err := time.ParseDuration(envconfig.KeepAlive) - if err != nil { - return defaultSessionDuration - } - - if d < 0 { - return time.Duration(math.MaxInt64) - } - - return d - } - - d := time.Duration(v) * time.Second - if d < 0 { - return time.Duration(math.MaxInt64) - } - return d - } - - return defaultSessionDuration -} - func (s *Server) EmbeddingsHandler(c *gin.Context) { var req api.EmbeddingRequest err := c.ShouldBindJSON(&req) @@ -380,14 +343,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - var sessionDuration time.Duration - if req.KeepAlive == nil { - sessionDuration = getDefaultSessionDuration() - } else { - sessionDuration = req.KeepAlive.Duration - } - - rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration) + rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive) var runner *runnerRef select { case runner = <-rCh: @@ -1318,14 +1274,7 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - var sessionDuration time.Duration - if req.KeepAlive == nil { - sessionDuration = getDefaultSessionDuration() - } else { - sessionDuration = req.KeepAlive.Duration - } - - rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, 
sessionDuration) + rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive) var runner *runnerRef select { case runner = <-rCh: diff --git a/server/sched.go b/server/sched.go index 71b535ae..dc492cfb 100644 --- a/server/sched.go +++ b/server/sched.go @@ -24,7 +24,7 @@ type LlmRequest struct { model *Model opts api.Options origNumCtx int // Track the initial ctx request - sessionDuration time.Duration + sessionDuration *api.Duration successCh chan *runnerRef errCh chan error schedAttempts uint @@ -75,7 +75,7 @@ func InitScheduler(ctx context.Context) *Scheduler { } // context must be canceled to decrement ref count and release the runner -func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { +func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) { if opts.NumCtx < 4 { opts.NumCtx = 4 } @@ -389,7 +389,9 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm runner.expireTimer.Stop() runner.expireTimer = nil } - runner.sessionDuration = pending.sessionDuration + if pending.sessionDuration != nil { + runner.sessionDuration = pending.sessionDuration.Duration + } pending.successCh <- runner go func() { <-pending.ctx.Done() @@ -402,6 +404,10 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, if numParallel < 1 { numParallel = 1 } + sessionDuration := envconfig.KeepAlive + if req.sessionDuration != nil { + sessionDuration = req.sessionDuration.Duration + } llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp @@ -419,7 +425,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, modelPath: req.model.ModelPath, llama: 
llama, Options: &req.opts, - sessionDuration: req.sessionDuration, + sessionDuration: sessionDuration, gpus: gpus, estimatedVRAM: llama.EstimatedVRAM(), estimatedTotal: llama.EstimatedTotal(), diff --git a/server/sched_test.go b/server/sched_test.go index be0830a3..d957927e 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -44,7 +44,7 @@ func TestLoad(t *testing.T) { opts: api.DefaultOptions(), successCh: make(chan *runnerRef, 1), errCh: make(chan error, 1), - sessionDuration: 2, + sessionDuration: &api.Duration{Duration: 2 * time.Second}, } // Fail to load model first s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { @@ -142,7 +142,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV ctx: scenario.ctx, model: model, opts: api.DefaultOptions(), - sessionDuration: 5 * time.Millisecond, + sessionDuration: &api.Duration{Duration: 5 * time.Millisecond}, successCh: make(chan *runnerRef, 1), errCh: make(chan error, 1), } @@ -156,18 +156,18 @@ func TestRequests(t *testing.T) { // Same model, same request scenario1a := newScenario(t, ctx, "ollama-model-1", 10) - scenario1a.req.sessionDuration = 5 * time.Millisecond + scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} scenario1b := newScenario(t, ctx, "ollama-model-1", 11) scenario1b.req.model = scenario1a.req.model scenario1b.ggml = scenario1a.ggml - scenario1b.req.sessionDuration = 0 + scenario1b.req.sessionDuration = &api.Duration{Duration: 0} // simple reload of same model scenario2a := newScenario(t, ctx, "ollama-model-1", 20) tmpModel := *scenario1a.req.model scenario2a.req.model = &tmpModel scenario2a.ggml = scenario1a.ggml - scenario2a.req.sessionDuration = 5 * time.Millisecond + scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} // Multiple loaded models scenario3a := newScenario(t, ctx, 
"ollama-model-3a", 1*format.GigaByte) @@ -318,11 +318,11 @@ func TestGetRunner(t *testing.T) { defer done() scenario1a := newScenario(t, ctx, "ollama-model-1a", 10) - scenario1a.req.sessionDuration = 0 + scenario1a.req.sessionDuration = &api.Duration{Duration: 0} scenario1b := newScenario(t, ctx, "ollama-model-1b", 10) - scenario1b.req.sessionDuration = 0 + scenario1b.req.sessionDuration = &api.Duration{Duration: 0} scenario1c := newScenario(t, ctx, "ollama-model-1c", 10) - scenario1c.req.sessionDuration = 0 + scenario1c.req.sessionDuration = &api.Duration{Duration: 0} envconfig.MaxQueuedRequests = 1 s := InitScheduler(ctx) s.getGpuFn = func() gpu.GpuInfoList { @@ -402,7 +402,7 @@ func TestPrematureExpired(t *testing.T) { case <-ctx.Done(): t.Fatal("timeout") } - time.Sleep(scenario1a.req.sessionDuration) + time.Sleep(scenario1a.req.sessionDuration.Duration) scenario1a.ctxDone() time.Sleep(20 * time.Millisecond) require.LessOrEqual(t, len(s.finishedReqCh), 1) @@ -423,7 +423,7 @@ func TestUseLoadedRunner(t *testing.T) { ctx: ctx, opts: api.DefaultOptions(), successCh: make(chan *runnerRef, 1), - sessionDuration: 2, + sessionDuration: &api.Duration{Duration: 2}, } finished := make(chan *LlmRequest) llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} @@ -614,7 +614,7 @@ func TestAlreadyCanceled(t *testing.T) { dctx, done2 := context.WithCancel(ctx) done2() scenario1a := newScenario(t, dctx, "ollama-model-1", 10) - scenario1a.req.sessionDuration = 0 + scenario1a.req.sessionDuration = &api.Duration{Duration: 0} s := InitScheduler(ctx) slog.Info("scenario1a") s.pendingReqCh <- scenario1a.req