Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 15:08:27 +00:00)

Commit: Merge branch 'ollama:main' into main

api/types.go (105 lines changed)
@@ -159,49 +159,18 @@ type Options struct {

 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	UseNUMA   bool `json:"numa,omitempty"`
 	NumCtx    int  `json:"num_ctx,omitempty"`
 	NumBatch  int  `json:"num_batch,omitempty"`
 	NumGPU    int  `json:"num_gpu,omitempty"`
 	MainGPU   int  `json:"main_gpu,omitempty"`
 	LowVRAM   bool `json:"low_vram,omitempty"`
 	F16KV     bool `json:"f16_kv,omitempty"`
 	LogitsAll bool `json:"logits_all,omitempty"`
 	VocabOnly bool `json:"vocab_only,omitempty"`
-	UseMMap   TriState `json:"use_mmap,omitempty"`
+	UseMMap   *bool `json:"use_mmap,omitempty"`
 	UseMLock  bool `json:"use_mlock,omitempty"`
 	NumThread int  `json:"num_thread,omitempty"`
-}
-
-type TriState int
-
-const (
-	TriStateUndefined TriState = -1
-	TriStateFalse     TriState = 0
-	TriStateTrue      TriState = 1
-)
-
-func (b *TriState) UnmarshalJSON(data []byte) error {
-	var v bool
-	if err := json.Unmarshal(data, &v); err != nil {
-		return err
-	}
-	if v {
-		*b = TriStateTrue
-	}
-	*b = TriStateFalse
-	return nil
-}
-
-func (b *TriState) MarshalJSON() ([]byte, error) {
-	if *b == TriStateUndefined {
-		return nil, nil
-	}
-	var v bool
-	if *b == TriStateTrue {
-		v = true
-	}
-	return json.Marshal(v)
-}
 }

 // EmbeddingRequest is the request passed to [Client.Embeddings].

@@ -444,19 +413,6 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 			continue
 		}

-		if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
-			val, ok := val.(bool)
-			if !ok {
-				return fmt.Errorf("option %q must be of type boolean", key)
-			}
-			if val {
-				field.SetInt(int64(TriStateTrue))
-			} else {
-				field.SetInt(int64(TriStateFalse))
-			}
-			continue
-		}
-
 		switch field.Kind() {
 		case reflect.Int:
 			switch t := val.(type) {

@@ -503,6 +459,17 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 					slice[i] = str
 				}
 				field.Set(reflect.ValueOf(slice))
+			case reflect.Pointer:
+				var b bool
+				if field.Type() == reflect.TypeOf(&b) {
+					val, ok := val.(bool)
+					if !ok {
+						return fmt.Errorf("option %q must be of type boolean", key)
+					}
+					field.Set(reflect.ValueOf(&val))
+				} else {
+					return fmt.Errorf("unknown type loading config params: %v %v", field.Kind(), field.Type())
+				}
 			default:
 				return fmt.Errorf("unknown type loading config params: %v", field.Kind())
 			}

@@ -545,7 +512,7 @@ func DefaultOptions() Options {
 			LowVRAM:  false,
 			F16KV:    true,
 			UseMLock: false,
-			UseMMap:  TriStateUndefined,
+			UseMMap:  nil,
 			UseNUMA:  false,
 		},
 	}

@@ -615,19 +582,6 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 		} else {
 			field := valueOpts.FieldByName(opt.Name)
 			if field.IsValid() && field.CanSet() {
-				if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
-					boolVal, err := strconv.ParseBool(vals[0])
-					if err != nil {
-						return nil, fmt.Errorf("invalid bool value %s", vals)
-					}
-					if boolVal {
-						out[key] = TriStateTrue
-					} else {
-						out[key] = TriStateFalse
-					}
-					continue
-				}
-
 				switch field.Kind() {
 				case reflect.Float32:
 					floatVal, err := strconv.ParseFloat(vals[0], 32)

@@ -655,6 +609,17 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 				case reflect.Slice:
 					// TODO: only string slices are supported right now
 					out[key] = vals
+				case reflect.Pointer:
+					var b bool
+					if field.Type() == reflect.TypeOf(&b) {
+						boolVal, err := strconv.ParseBool(vals[0])
+						if err != nil {
+							return nil, fmt.Errorf("invalid bool value %s", vals)
+						}
+						out[key] = &boolVal
+					} else {
+						return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
+					}
 				default:
 					return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
 				}
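The net effect of the api/types.go change: the custom TriState enum and its hand-written JSON methods are replaced by an ordinary *bool, where a nil pointer means "unset" and encoding/json handles all three states natively. A minimal self-contained sketch of those semantics (the Runner type here is reduced to the one field under discussion):

package main

import (
	"encoding/json"
	"fmt"
)

// Runner is cut down to the single field this commit changes.
type Runner struct {
	UseMMap *bool `json:"use_mmap,omitempty"`
}

func main() {
	// nil pointer: the field is omitted entirely, i.e. "let the server
	// decide" (the old TriStateUndefined).
	b, _ := json.Marshal(Runner{})
	fmt.Println(string(b)) // {}

	// Explicit false survives omitempty because the pointer is non-nil,
	// which is exactly what TriState needed custom marshalling to achieve.
	f := false
	b, _ = json.Marshal(Runner{UseMMap: &f})
	fmt.Println(string(b)) // {"use_mmap":false}

	var r Runner
	_ = json.Unmarshal([]byte(`{"use_mmap":true}`), &r)
	fmt.Println(r.UseMMap != nil && *r.UseMMap) // true
}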
api/types_test.go
@@ -108,25 +108,27 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
 }

 func TestUseMmapParsingFromJSON(t *testing.T) {
+	tr := true
+	fa := false
 	tests := []struct {
 		name string
 		req  string
-		exp  TriState
+		exp  *bool
 	}{
 		{
 			name: "Undefined",
 			req:  `{ }`,
-			exp:  TriStateUndefined,
+			exp:  nil,
 		},
 		{
 			name: "True",
 			req:  `{ "use_mmap": true }`,
-			exp:  TriStateTrue,
+			exp:  &tr,
 		},
 		{
 			name: "False",
 			req:  `{ "use_mmap": false }`,
-			exp:  TriStateFalse,
+			exp:  &fa,
 		},
 	}

@@ -144,50 +146,52 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
 }

 func TestUseMmapFormatParams(t *testing.T) {
+	tr := true
+	fa := false
 	tests := []struct {
 		name string
 		req  map[string][]string
-		exp  TriState
+		exp  *bool
 		err  error
 	}{
 		{
 			name: "True",
 			req: map[string][]string{
-				"use_mmap": []string{"true"},
+				"use_mmap": {"true"},
 			},
-			exp: TriStateTrue,
+			exp: &tr,
 			err: nil,
 		},
 		{
 			name: "False",
 			req: map[string][]string{
-				"use_mmap": []string{"false"},
+				"use_mmap": {"false"},
 			},
-			exp: TriStateFalse,
+			exp: &fa,
 			err: nil,
 		},
 		{
 			name: "Numeric True",
 			req: map[string][]string{
-				"use_mmap": []string{"1"},
+				"use_mmap": {"1"},
 			},
-			exp: TriStateTrue,
+			exp: &tr,
 			err: nil,
 		},
 		{
 			name: "Numeric False",
 			req: map[string][]string{
-				"use_mmap": []string{"0"},
+				"use_mmap": {"0"},
 			},
-			exp: TriStateFalse,
+			exp: &fa,
 			err: nil,
 		},
 		{
 			name: "invalid string",
 			req: map[string][]string{
-				"use_mmap": []string{"foo"},
+				"use_mmap": {"foo"},
 			},
-			exp: TriStateUndefined,
+			exp: nil,
 			err: fmt.Errorf("invalid bool value [foo]"),
 		},
 	}

@@ -195,11 +199,11 @@ func TestUseMmapFormatParams(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			resp, err := FormatParams(test.req)
-			require.Equal(t, err, test.err)
+			require.Equal(t, test.err, err)
 			respVal, ok := resp["use_mmap"]
-			if test.exp != TriStateUndefined {
+			if test.exp != nil {
 				assert.True(t, ok, "resp: %v", resp)
-				assert.Equal(t, test.exp, respVal)
+				assert.Equal(t, *test.exp, *respVal.(*bool))
 			}
 		})
 	}
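One subtle fix in the last hunk beyond the type change: testify's require.Equal takes the expected value before the actual one, so the old require.Equal(t, err, test.err) had the arguments reversed and would have produced backwards "expected/actual" failure messages. The corrected call reads:

resp, err := FormatParams(test.req)
require.Equal(t, test.err, err) // expected first, then the value under test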
docs/troubleshooting.md
@@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh

 If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/

-## Container fails to run on NVIDIA GPU
+## NVIDIA GPU Discovery

-Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.

-Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
+### Linux NVIDIA Troubleshooting

-- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
-- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+
+Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem:
+
+- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
+- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
 - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
 - Try rebooting
 - Make sure you're running the latest nvidia drivers
envconfig/config.go
@@ -4,12 +4,14 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"math"
 	"net"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
+	"time"
 )

 type OllamaHost struct {

@@ -34,17 +36,17 @@ var (
 	// Set via OLLAMA_HOST in the environment
 	Host *OllamaHost
 	// Set via OLLAMA_KEEP_ALIVE in the environment
-	KeepAlive string
+	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
 	LLMLibrary string
 	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
 	// Set via OLLAMA_MAX_VRAM in the environment
 	MaxVRAM uint64
+	// Set via OLLAMA_MODELS in the environment
+	ModelsDir string
 	// Set via OLLAMA_NOHISTORY in the environment
 	NoHistory bool
 	// Set via OLLAMA_NOPRUNE in the environment

@@ -132,6 +134,7 @@ func init() {
 	NumParallel = 0 // Autoselect
 	MaxRunners = 0  // Autoselect
 	MaxQueuedRequests = 512
+	KeepAlive = 5 * time.Minute

 	LoadConfig()
 }

@@ -266,7 +269,10 @@ func LoadConfig() {
 		}
 	}

-	KeepAlive = clean("OLLAMA_KEEP_ALIVE")
+	ka := clean("OLLAMA_KEEP_ALIVE")
+	if ka != "" {
+		loadKeepAlive(ka)
+	}

 	var err error
 	ModelsDir, err = getModelsDir()

@@ -344,3 +350,24 @@ func getOllamaHost() (*OllamaHost, error) {
 		Port: port,
 	}, nil
 }
+
+func loadKeepAlive(ka string) {
+	v, err := strconv.Atoi(ka)
+	if err != nil {
+		d, err := time.ParseDuration(ka)
+		if err == nil {
+			if d < 0 {
+				KeepAlive = time.Duration(math.MaxInt64)
+			} else {
+				KeepAlive = d
+			}
+		}
+	} else {
+		d := time.Duration(v) * time.Second
+		if d < 0 {
+			KeepAlive = time.Duration(math.MaxInt64)
+		} else {
+			KeepAlive = d
+		}
+	}
+}
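Taken together, OLLAMA_KEEP_ALIVE now accepts either a bare integer (interpreted as seconds) or a Go duration string, with any negative value meaning "keep the model loaded forever". A standalone sketch of those parsing rules (parseKeepAlive is an illustrative name, not part of the envconfig package):

package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

func parseKeepAlive(ka string) time.Duration {
	keep := 5 * time.Minute // the default set in init()
	if v, err := strconv.Atoi(ka); err == nil {
		// bare integers are seconds; negative means "forever"
		if d := time.Duration(v) * time.Second; d < 0 {
			keep = time.Duration(math.MaxInt64)
		} else {
			keep = d
		}
	} else if d, err := time.ParseDuration(ka); err == nil {
		if d < 0 {
			keep = time.Duration(math.MaxInt64)
		} else {
			keep = d
		}
	}
	// unparseable values leave the default in place
	return keep
}

func main() {
	for _, s := range []string{"3", "1h", "-1s", "-1", "bogus"} {
		fmt.Println(s, "=>", parseKeepAlive(s))
	}
}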
envconfig/config_test.go
@@ -2,8 +2,10 @@ package envconfig

 import (
 	"fmt"
+	"math"
 	"net"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"

@@ -23,6 +25,21 @@ func TestConfig(t *testing.T) {
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
 	LoadConfig()
 	require.True(t, FlashAttention)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "")
+	LoadConfig()
+	require.Equal(t, 5*time.Minute, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "3")
+	LoadConfig()
+	require.Equal(t, 3*time.Second, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
+	LoadConfig()
+	require.Equal(t, 1*time.Hour, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
 }

 func TestClientFromEnvironment(t *testing.T) {
gpu/gpu.go (23 lines changed)
@@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList {
 	}()

 	if !bootstrapped {
-		slog.Debug("Detecting GPUs")
+		slog.Info("looking for compatible GPUs")
 		needRefresh = false
 		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t

@@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList {

 		rocmGPUs = AMDGetGPUInfo()
 		bootstrapped = true
+		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
+			slog.Info("no compatible GPUs were discovered")
+		}
 	}

 	// For detected GPUs, load library if not loaded
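With these two slog calls, GPU discovery is visible at the default log level rather than only under debug. On a host with no supported GPU, the server log should then contain lines roughly like the following (exact formatting depends on the slog handler configured; this is an assumption, not captured output):

level=INFO msg="looking for compatible GPUs"
level=INFO msg="no compatible GPUs were discovered"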
@@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
+			// Decide what log level based on the type of error message to help users understand why
+			msg := C.GoString(resp.err)
+			switch resp.cudaErr {
+			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
+				slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
+			case C.CUDA_ERROR_NO_DEVICE:
+				slog.Info("no nvidia devices detected", "library", libPath)
+			case C.CUDA_ERROR_UNKNOWN:
+				slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
+				slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
+			default:
+				if strings.Contains(msg, "wrong ELF class") {
+					slog.Debug("skipping 32bit library", "library", libPath)
+				} else {
+					slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
+				}
+			}
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return int(resp.num_devices), &resp.ch, libPath
gpu/gpu_info_nvcuda.c
@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
+  resp->cudaErr = CUDA_SUCCESS;
   const int buflen = 256;
   char buf[buflen + 1];
   int i;

@@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
             nvcuda_lib_path, msg);
     free(msg);
     resp->err = strdup(buf);
+    resp->cudaErr = -1;
     return;
   }

@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
               msg);
       free(msg);
       resp->err = strdup(buf);
+      resp->cudaErr = -1;
       return;
     }
   }

@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
-    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
-      return;
-    }
-    snprintf(buf, buflen, "nvcuda init failure: %d", ret);
+    snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
     resp->err = strdup(buf);
+    resp->cudaErr = ret;
     return;
   }

@@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->ch.handle = NULL;
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
+    resp->cudaErr = ret;
     return;
   }
 }

@@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

   if (h.handle == NULL) {
-    resp->err = strdup("nvcuda handle isn't initialized");
+    resp->err = strdup("cuda driver library handle isn't initialized");
     return;
   }

   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda device failed to initialize");
+    snprintf(buf, buflen, "cuda driver library device failed to initialize");
     resp->err = strdup(buf);
     return;
   }

@@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
+    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
     resp->err = strdup(buf);
     return;
   }

   ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
+    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
     resp->err = strdup(buf);
     // Best effort on failure...
     (*h.cuCtxDestroy)(ctx);

@@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {

   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
+    LOG(1, "cuda driver library failed to release device context %d", ret);
   }
 }

@@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)

   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device failed to initialize");
+    LOG(1, "cuda driver library device failed to initialize");
     return;
   }

@@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to get device context %d", ret);
+    LOG(1, "cuda driver library failed to get device context %d", ret);
     return;
   }

   ret = (*h.cuMemGetInfo_v2)(free, total);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    LOG(1, "cuda driver library device memory info lookup failure %d", ret);
     // Best effort on failure...
     (*h.cuCtxDestroy)(ctx);
     return;

@@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)

   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
+    LOG(1, "cuda driver library failed to release device context %d", ret);
   }
 }

 void nvcuda_release(nvcuda_handle_t h) {
-  LOG(h.verbose, "releasing nvcuda library\n");
+  LOG(h.verbose, "releasing cuda driver library\n");
   UNLOAD_LIBRARY(h.handle);
   // TODO and other context release logic?
   h.handle = NULL;
gpu/gpu_info_nvcuda.h
@@ -7,9 +7,12 @@
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDA_ERROR_OUT_OF_MEMORY = 2,
   CUDA_ERROR_NOT_INITIALIZED = 3,
   CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  CUDA_ERROR_NO_DEVICE = 100,
+  CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
+  CUDA_ERROR_UNKNOWN = 999,
   // Other values omitted for now...
 } CUresult;

@@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp {
   char *err;  // If err is non-null handle is invalid
   nvcuda_handle_t ch;
   int num_devices;
+  CUresult cudaErr;
 } nvcuda_init_resp_t;

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
llm/ext_server/server.cpp (vendored, 2 lines changed)
@@ -1732,7 +1732,7 @@ struct llama_server_context
                     slot.n_past -= 1;
                 }

-                slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
+                slot.n_prompt_tokens_processed = slot.n_prompt_tokens;

                 if (slot.ga_n != 1)
                 {
llm/payload.go
@@ -38,7 +38,7 @@ func Init() error {
 	}

 	var variants []string
-	for v := range availableServers() {
+	for v := range getAvailableServers() {
 		variants = append(variants, v)
 	}
 	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))

@@ -50,7 +50,7 @@ func Init() error {
 // binary names may contain an optional variant separated by '_'
 // For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
 // Any library without a variant is the lowest common denominator
-func availableServers() map[string]string {
+func getAvailableServers() map[string]string {
 	payloadsDir, err := gpu.PayloadsDir()
 	if err != nil {
 		slog.Error("payload lookup error", "error", err)

@@ -80,7 +80,7 @@ func availableServers() map[string]string {
 // TODO - switch to metadata based mapping
 func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
-	availableServers := availableServers()
+	availableServers := getAvailableServers()
 	requested := info.Library
 	if info.Variant != gpu.CPUCapabilityNone {
 		requested += "_" + info.Variant.String()

@@ -115,27 +115,29 @@ func serversForGpu(info gpu.GpuInfo) []string {
 		servers = append(servers, alt...)
 	}

-	// Load up the best CPU variant if not primary requested
-	if info.Library != "cpu" {
-		variant := gpu.GetCPUCapability()
-		// If no variant, then we fall back to default
-		// If we have a variant, try that if we find an exact match
-		// Attempting to run the wrong CPU instructions will panic the
-		// process
-		if variant != gpu.CPUCapabilityNone {
-			for cmp := range availableServers {
-				if cmp == "cpu_"+variant.String() {
-					servers = append(servers, cmp)
-					break
+	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
+		// Load up the best CPU variant if not primary requested
+		if info.Library != "cpu" {
+			variant := gpu.GetCPUCapability()
+			// If no variant, then we fall back to default
+			// If we have a variant, try that if we find an exact match
+			// Attempting to run the wrong CPU instructions will panic the
+			// process
+			if variant != gpu.CPUCapabilityNone {
+				for cmp := range availableServers {
+					if cmp == "cpu_"+variant.String() {
+						servers = append(servers, cmp)
+						break
+					}
 				}
+			} else {
+				servers = append(servers, "cpu")
 			}
-		} else {
-			servers = append(servers, "cpu")
 		}
-	}

-	if len(servers) == 0 {
-		servers = []string{"cpu"}
+		if len(servers) == 0 {
+			servers = []string{"cpu"}
+		}
 	}

 	return servers

@@ -147,7 +149,7 @@ func serverForCpu() string {
 		return "metal"
 	}
 	variant := gpu.GetCPUCapability()
-	availableServers := availableServers()
+	availableServers := getAvailableServers()
 	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
llm/server.go
@@ -131,7 +131,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}

-	availableServers := availableServers()
+	availableServers := getAvailableServers()
+	if len(availableServers) == 0 {
+		if runtime.GOOS != "windows" {
+			slog.Warn("llama server binary disappeared, reinitializing payloads")
+			err = Init()
+			if err != nil {
+				slog.Warn("failed to reinitialize payloads", "error", err)
+				return nil, err
+			}
+			availableServers = getAvailableServers()
+		} else {
+			return nil, finalErr
+		}
+	}
 	var servers []string
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
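The rename from availableServers to getAvailableServers is not cosmetic: in this function the result is stored in a local variable named availableServers, and in Go that local shadows a same-named function for the rest of the scope, so the retry call above could not have been written without the rename. A small self-contained illustration (the stub map is hypothetical):

package main

import "fmt"

func getAvailableServers() map[string]string {
	return map[string]string{"cpu": "/tmp/ollama/cpu"} // stand-in payload map
}

func main() {
	// Had the function kept its old name, this line would read
	//     availableServers := availableServers()
	// and from here on the identifier names the map, not the function, so
	// the refresh call below would fail to compile.
	availableServers := getAvailableServers()
	availableServers = getAvailableServers() // e.g. re-list after Init()
	fmt.Println(availableServers)
}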
@@ -208,7 +221,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-			opts.UseMMap = api.TriStateFalse
+			opts.UseMMap = new(bool)
+			*opts.UseMMap = false
 		}
 	}

@@ -219,10 +233,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
 	// For CPU loads we want the memory to be allocated, not FS cache
-	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
-		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
-		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
-		opts.UseMMap == api.TriStateFalse {
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == nil) ||
+		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == nil) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == nil) ||
+		(opts.UseMMap != nil && !*opts.UseMMap) {
 		params = append(params, "--no-mmap")
 	}

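Condensed, the --no-mmap rule is: disable mmap when the user explicitly asked for that, or when the option was left unset on a platform or load shape where mmap is known to hurt. A hypothetical distillation of the condition above (the parameters stand in for runtime.GOOS, gpus[0].Library, systemFreeMemory, estimate.TotalSize, and opts.UseMMap; this is a sketch, not the package's API):

func disableMMap(goos, library string, freeMem, totalSize uint64, useMMap *bool) bool {
	unset := useMMap == nil
	explicitlyOff := useMMap != nil && !*useMMap
	return (goos == "windows" && library == "cuda" && unset) ||
		(goos == "linux" && freeMem < totalSize && unset) ||
		(library == "cpu" && unset) ||
		explicitlyOff
}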
llm/status.go
@@ -25,7 +25,7 @@ var errorPrefixes = []string{
 	"CUDA error",
 	"cudaMalloc failed",
 	"\"ERR\"",
-	"architecture",
+	"error loading model",
 }

 func (w *StatusWriter) Write(b []byte) (int, error) {
server/modelpath.go
@@ -103,18 +103,9 @@ func (mp ModelPath) GetShortTagname() string {
 	return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
 }

-// modelsDir returns the value of the OLLAMA_MODELS environment variable or the user's home directory if OLLAMA_MODELS is not set.
-// The models directory is where Ollama stores its model files and manifests.
-func modelsDir() (string, error) {
-	return envconfig.ModelsDir, nil
-}
-
 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
 }

@@ -127,10 +118,7 @@ func (mp ModelPath) BaseURL() *url.URL {
 }

 func GetManifestPath() (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	path := filepath.Join(dir, "manifests")
 	if err := os.MkdirAll(path, 0o755); err != nil {

@@ -141,10 +129,7 @@ func GetManifestPath() (string, error) {
 }

 func GetBlobsPath(digest string) (string, error) {
-	dir, err := modelsDir()
-	if err != nil {
-		return "", err
-	}
+	dir := envconfig.ModelsDir

 	// only accept actual sha256 digests
 	pattern := "^sha256[:-][0-9a-fA-F]{64}$"
server/routes.go
@@ -9,7 +9,6 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
-	"math"
 	"net"
 	"net/http"
 	"net/netip"

@@ -17,7 +16,6 @@ import (
 	"os/signal"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"
 	"syscall"
 	"time"

@@ -56,8 +54,6 @@ func init() {
 	gin.SetMode(mode)
 }

-var defaultSessionDuration = 5 * time.Minute
-
 func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {

@@ -133,14 +129,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:

@@ -320,32 +309,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func getDefaultSessionDuration() time.Duration {
-	if envconfig.KeepAlive != "" {
-		v, err := strconv.Atoi(envconfig.KeepAlive)
-		if err != nil {
-			d, err := time.ParseDuration(envconfig.KeepAlive)
-			if err != nil {
-				return defaultSessionDuration
-			}
-
-			if d < 0 {
-				return time.Duration(math.MaxInt64)
-			}
-
-			return d
-		}
-
-		d := time.Duration(v) * time.Second
-		if d < 0 {
-			return time.Duration(math.MaxInt64)
-		}
-		return d
-	}
-
-	return defaultSessionDuration
-}
-
 func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	var req api.EmbeddingRequest
 	err := c.ShouldBindJSON(&req)

@@ -380,14 +343,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:

@@ -1318,14 +1274,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	var sessionDuration time.Duration
-	if req.KeepAlive == nil {
-		sessionDuration = getDefaultSessionDuration()
-	} else {
-		sessionDuration = req.KeepAlive.Duration
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
 	var runner *runnerRef
 	select {
 	case runner = <-rCh:
server/sched.go
@@ -24,7 +24,7 @@ type LlmRequest struct {
 	model           *Model
 	opts            api.Options
 	origNumCtx      int // Track the initial ctx request
-	sessionDuration time.Duration
+	sessionDuration *api.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
 	schedAttempts   uint
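Carrying a *api.Duration instead of a bare time.Duration lets the scheduler distinguish "client set keep_alive" from "client said nothing". Assuming the api package types of this commit, a request-side sketch (model name and prompt are illustrative):

keep := api.Duration{Duration: 10 * time.Minute}
req := api.GenerateRequest{
	Model:     "llama3",
	Prompt:    "why is the sky blue?",
	KeepAlive: &keep, // nil here would mean "use OLLAMA_KEEP_ALIVE"
}
_ = req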
@@ -75,7 +75,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 }

 // context must be canceled to decrement ref count and release the runner
-func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
 	if opts.NumCtx < 4 {
 		opts.NumCtx = 4
 	}

@@ -389,7 +389,9 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 		runner.expireTimer.Stop()
 		runner.expireTimer = nil
 	}
-	runner.sessionDuration = pending.sessionDuration
+	if pending.sessionDuration != nil {
+		runner.sessionDuration = pending.sessionDuration.Duration
+	}
 	pending.successCh <- runner
 	go func() {
 		<-pending.ctx.Done()

@@ -402,6 +404,10 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	if numParallel < 1 {
 		numParallel = 1
 	}
+	sessionDuration := envconfig.KeepAlive
+	if req.sessionDuration != nil {
+		sessionDuration = req.sessionDuration.Duration
+	}
 	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp

@@ -419,7 +425,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 		modelPath:       req.model.ModelPath,
 		llama:           llama,
 		Options:         &req.opts,
-		sessionDuration: req.sessionDuration,
+		sessionDuration: sessionDuration,
 		gpus:            gpus,
 		estimatedVRAM:   llama.EstimatedVRAM(),
 		estimatedTotal:  llama.EstimatedTotal(),
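The resulting division of labor: the HTTP handlers pass the client's keep_alive through untouched, and the scheduler alone resolves the default. A one-function sketch of that resolution (an illustrative helper, not the package API):

func effectiveSessionDuration(req *api.Duration, serverDefault time.Duration) time.Duration {
	if req != nil {
		return req.Duration // a client-specified keep_alive wins, even zero
	}
	return serverDefault // otherwise envconfig.KeepAlive (OLLAMA_KEEP_ALIVE)
}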
server/sched_test.go
@@ -44,7 +44,7 @@ func TestLoad(t *testing.T) {
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
-		sessionDuration: 2,
+		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {

@@ -142,7 +142,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		ctx:             scenario.ctx,
 		model:           model,
 		opts:            api.DefaultOptions(),
-		sessionDuration: 5 * time.Millisecond,
+		sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}

@@ -156,18 +156,18 @@ func TestRequests(t *testing.T) {

 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 5 * time.Millisecond
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
-	scenario1b.req.sessionDuration = 0
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}

 	// simple reload of same model
 	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
-	scenario2a.req.sessionDuration = 5 * time.Millisecond
+	scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}

 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)

@@ -318,11 +318,11 @@ func TestGetRunner(t *testing.T) {
 	defer done()

 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
-	scenario1b.req.sessionDuration = 0
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
 	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
-	scenario1c.req.sessionDuration = 0
+	scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
 	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {

@@ -402,7 +402,7 @@ func TestPrematureExpired(t *testing.T) {
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
-	time.Sleep(scenario1a.req.sessionDuration)
+	time.Sleep(scenario1a.req.sessionDuration.Duration)
 	scenario1a.ctxDone()
 	time.Sleep(20 * time.Millisecond)
 	require.LessOrEqual(t, len(s.finishedReqCh), 1)

@@ -423,7 +423,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		ctx:             ctx,
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
-		sessionDuration: 2,
+		sessionDuration: &api.Duration{Duration: 2},
 	}
 	finished := make(chan *LlmRequest)
 	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}

@@ -614,7 +614,7 @@ func TestAlreadyCanceled(t *testing.T) {
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
 	scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
 	s := InitScheduler(ctx)
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req