Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-21 22:33:56 +00:00)

Merge branch 'ollama:main' into main

@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
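
The new GibiByte branch means multi-gigabyte values no longer print as oversized MiB figures. A standalone sketch of the resulting behavior; the constant values are the usual binary prefixes, and the KiB/byte branches are assumed from the surrounding code rather than shown in this hunk:

package main

import "fmt"

// Assumed binary-prefix constants, mirroring the package's definitions.
const (
	KibiByte = 1 << 10
	MebiByte = 1 << 20
	GibiByte = 1 << 30
)

func HumanBytes2(b uint64) string {
	switch {
	case b >= GibiByte:
		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	case b >= KibiByte:
		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
	default: // assumed fallback, not part of this hunk
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(HumanBytes2(5 * GibiByte / 2)) // 2.5 GiB — previously rendered as 2560.0 MiB
	fmt.Println(HumanBytes2(512 * MebiByte))   // 512.0 MiB
}
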
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
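
EstimateGPULayers now reports three values — the layer count, the VRAM needed for that partial load, and the model's total size — and the early-exit checks were moved to the end so the total is populated even on the CPU and insufficient-VRAM paths. A toy stand-in showing how callers consume the widened signature; the sizing math is invented for illustration:

package main

import "fmt"

// estimateGPULayers stands in for the real EstimateGPULayers; note the third
// result (total model size) is returned on every path, including the
// "no layers fit" cases.
func estimateGPULayers(freeVRAM, modelSize, perLayer uint64, nLayers int) (int, uint64, uint64) {
	if freeVRAM < perLayer {
		return 0, 0, modelSize
	}
	n := int(freeVRAM / perLayer)
	if n > nLayers {
		n = nLayers
	}
	return n, uint64(n) * perLayer, modelSize
}

func main() {
	layers, vram, total := estimateGPULayers(8<<30, 13<<30, 400<<20, 33)
	fmt.Printf("offload %d layers, ~%d MiB VRAM, model total %d MiB\n",
		layers, vram>>20, total>>20)

	// A fit check in the style of PredictServerFit discards the total
	// with the blank identifier:
	if layers, _, _ := estimateGPULayers(48<<30, 13<<30, 400<<20, 33); layers >= 33 {
		fmt.Println("model fits entirely on GPU")
	}
}
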
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
 	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers uint64
+	gpuCount int
 
 	sem *semaphore.Weighted
 }
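
The three new fields record load-time facts about the model alongside the existing VRAM estimate. One hypothetical use is deriving a human-readable status line; the type below is a pared-down stand-in and the formatting is invented, not something this commit defines:

package main

import "fmt"

type llmServer struct {
	estimatedVRAM  uint64 // estimated VRAM used by the loaded model
	estimatedTotal uint64 // total size of the model
	totalLayers    uint64
	gpuCount       int
}

// statusLine is a hypothetical helper showing how the fields compose.
func (s *llmServer) statusLine() string {
	pct := 100 * float64(s.estimatedVRAM) / float64(s.estimatedTotal)
	return fmt.Sprintf("%d GPU(s), %d layers, %.0f%% of %d MiB on GPU",
		s.gpuCount, s.totalLayers, pct, s.estimatedTotal>>20)
}

func main() {
	s := &llmServer{estimatedVRAM: 8 << 30, estimatedTotal: 13 << 30, totalLayers: 33, gpuCount: 1}
	fmt.Println(s.statusLine())
}
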
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
 
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
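
For context, the port-probing idiom the retry loop builds on: resolving localhost:0 and binding lets the OS assign a free port, which is then read back and released. A minimal sketch; the helper name is invented:

package main

import (
	"fmt"
	"net"
)

func freePort() (int, error) {
	a, err := net.ResolveTCPAddr("tcp", "localhost:0")
	if err != nil {
		return 0, err
	}
	l, err := net.ListenTCP("tcp", a)
	if err != nil {
		return 0, err
	}
	defer l.Close()
	// Another process can still grab the port between Close and the
	// server's own bind — which is why the caller retries each iteration.
	return l.Addr().(*net.TCPAddr).Port, nil
}

func main() {
	p, err := freePort()
	if err != nil {
		panic(err)
	}
	fmt.Println("picked port", p)
}
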
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
 			port:          port,
 			cmd:           exec.Command(server, finalParams...),
 			status:        NewStatusWriter(os.Stderr),
 			options:       opts,
 			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
 		}
 
 		s.cmd.Env = os.Environ()
@@ -307,6 +325,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		slog.Debug("subprocess", "environment", s.cmd.Env)
 
 		if err = s.cmd.Start(); err != nil {
+			// Detect permission denied and augment the message about noexec
+			if errors.Is(err, os.ErrPermission) {
+				finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				continue
+			}
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
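
The new branch singles out "permission denied" launches, which on Linux typically mean the runner binary was unpacked onto a noexec mount. A small sketch of the detection; the path and message below are illustrative:

package main

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("/tmp/ollama-runner") // imagine /tmp mounted noexec
	if err := cmd.Start(); err != nil {
		// EACCES from the kernel maps to os.ErrPermission via errors.Is.
		if errors.Is(err, os.ErrPermission) {
			fmt.Println("runner dir may have noexec set; point OLLAMA_TMPDIR at an executable location")
			return
		}
		fmt.Println("start failed:", err)
	}
}
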
@@ -382,6 +405,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
+		if s.cmd.ProcessState.ExitCode() == -1 {
+			// Most likely a signal killed it, log some more details to try to help troubleshoot
+			slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+		}
 		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 	}
 
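
os/exec reports ExitCode() == -1 when a process was terminated by a signal rather than exiting normally, which is what the added logging keys on. A Unix-flavored demo (sleep is assumed to exist on the host):

package main

import (
	"fmt"
	"os/exec"
	"time"
)

func main() {
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	time.Sleep(100 * time.Millisecond)
	cmd.Process.Kill() // simulate the runner being signal-killed
	cmd.Wait()         // the error is expected; we want the final state

	st := cmd.ProcessState
	fmt.Println("exit code:", st.ExitCode()) // -1 for a signal death
	fmt.Println("details:", st.String(), st.Sys())
}
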
@@ -68,6 +68,20 @@ func (m *Model) String() string {
 		Args: m.ModelPath,
 	})
+
+	for _, adapter := range m.AdapterPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "adapter",
+			Args: adapter,
+		})
+	}
+
+	for _, projector := range m.ProjectorPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "model",
+			Args: projector,
+		})
+	}
 
 	if m.Template != "" {
 		modelfile.Commands = append(modelfile.Commands, model.Command{
 			Name: "template",
@@ -82,20 +96,6 @@ func (m *Model) String() string {
 		})
 	}
-
-	for _, adapter := range m.AdapterPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "adapter",
-			Args: adapter,
-		})
-	}
-
-	for _, projector := range m.ProjectorPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "projector",
-			Args: projector,
-		})
-	}
 
 	for k, v := range m.Options {
 		switch v := v.(type) {
 		case []any:
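
Net effect of this pair of hunks: adapter and projector commands move ahead of the template, and a projector is now emitted with the "model" command name instead of "projector". A stand-in sketch of the resulting command order; the Command type and paths are illustrative, not the package's own:

package main

import "fmt"

type Command struct {
	Name, Args string
}

func main() {
	cmds := []Command{
		{Name: "model", Args: "/path/to/weights.gguf"},
		{Name: "adapter", Args: "/path/to/lora.bin"},
		{Name: "model", Args: "/path/to/projector.bin"}, // was "projector" before this change
		{Name: "template", Args: "{{ .Prompt }}"},       // template now comes last
	}
	for _, c := range cmds {
		fmt.Printf("%s %s\n", c.Name, c.Args)
	}
}
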
@@ -935,6 +935,11 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
 		}
 
 		if allowedHost(host) {
+			if c.Request.Method == "OPTIONS" {
+				c.AbortWithStatus(http.StatusNoContent)
+				return
+			}
+
 			c.Next()
 			return
 		}
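
Allowed hosts now get an immediate 204 for CORS preflight (OPTIONS) requests instead of falling through to the route handlers. A pared-down sketch of the middleware shape; allowedHost is stubbed here, and the real code derives the host from the request differently:

package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

func allowedHost(host string) bool { return host == "localhost" } // stub

func hostMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		if allowedHost(c.Request.Host) {
			if c.Request.Method == http.MethodOptions {
				c.AbortWithStatus(http.StatusNoContent) // answer preflight, skip handlers
				return
			}
			c.Next()
			return
		}
		c.AbortWithStatus(http.StatusForbidden)
	}
}

func main() {
	r := gin.New()
	r.Use(hostMiddleware())
	r.GET("/", func(c *gin.Context) { c.String(http.StatusOK, "ok") })
	r.Run("localhost:8080")
}
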
@@ -947,6 +952,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
+	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
 	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()
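
A quick way to see the explicit AllowHeaders list (including X-Requested-With) take effect is to drive a preflight request through the configured engine. The route and origin below are assumptions for the example:

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"

	"github.com/gin-contrib/cors"
	"github.com/gin-gonic/gin"
)

func main() {
	config := cors.DefaultConfig()
	config.AllowWildcard = true
	config.AllowBrowserExtensions = true
	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
	config.AllowOrigins = []string{"http://example.com"} // assumed origin

	r := gin.New()
	r.Use(cors.New(config))
	r.POST("/api/generate", func(c *gin.Context) { c.Status(http.StatusOK) })

	// Simulate a browser preflight for a request that sets X-Requested-With.
	req := httptest.NewRequest(http.MethodOptions, "/api/generate", nil)
	req.Header.Set("Origin", "http://example.com")
	req.Header.Set("Access-Control-Request-Method", "POST")
	req.Header.Set("Access-Control-Request-Headers", "x-requested-with")
	w := httptest.NewRecorder()
	r.ServeHTTP(w, req)

	fmt.Println(w.Code, w.Header().Get("Access-Control-Allow-Headers"))
}
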
@@ -290,12 +290,14 @@ func (n Name) Filepath() string {
 	if !n.IsFullyQualified() {
 		panic("illegal attempt to get filepath of invalid name")
 	}
-	return strings.ToLower(filepath.Join(
-		n.Host,
-		n.Namespace,
-		n.Model,
+	return filepath.Join(
+		strings.ToLower(filepath.Join(
+			n.Host,
+			n.Namespace,
+			n.Model,
+		)),
 		n.Tag,
-	))
+	)
 }
 
 // LogValue returns a slog.Value that represents the name as a string.
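
The restructured Filepath lower-cases only host, namespace, and model, leaving the tag's case intact. A standalone before/after reimplementation using the new test fixture; this is a sketch, not the package's own code:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

type Name struct{ Host, Namespace, Model, Tag string }

func filepathOld(n Name) string {
	return strings.ToLower(filepath.Join(n.Host, n.Namespace, n.Model, n.Tag))
}

func filepathNew(n Name) string {
	return filepath.Join(
		strings.ToLower(filepath.Join(n.Host, n.Namespace, n.Model)),
		n.Tag,
	)
}

func main() {
	n := Name{"registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"}
	fmt.Println(filepathOld(n)) // ...7b-v2.6-dpo-laser-q6_k — quantization suffix flattened
	fmt.Println(filepathNew(n)) // ...7b-v2.6-dpo-laser-q6_K — tag case preserved
}
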
@@ -19,6 +19,16 @@ func TestParseNameParts(t *testing.T) {
 		wantFilepath string
 		wantValidDigest bool
 	}{
+		{
+			in: "registry.ollama.ai/library/dolphin-mistral:7b-v2.6-dpo-laser-q6_K",
+			want: Name{
+				Host: "registry.ollama.ai",
+				Namespace: "library",
+				Model: "dolphin-mistral",
+				Tag: "7b-v2.6-dpo-laser-q6_K",
+			},
+			wantFilepath: filepath.Join("registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"),
+		},
 		{
 			in: "scheme://host:port/namespace/model:tag",
 			want: Name{
@@ -266,9 +276,9 @@ func TestFilepathAllocs(t *testing.T) {
 	allocs := testing.AllocsPerRun(1000, func() {
 		n.Filepath()
 	})
-	allowedAllocs := 2.0
+	var allowedAllocs float64 = 3
 	if runtime.GOOS == "windows" {
-		allowedAllocs = 4
+		allowedAllocs = 5
 	}
 	if allocs > allowedAllocs {
 		t.Errorf("allocs = %v; allowed %v", allocs, allowedAllocs)
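
The allocation budget rises from 2 to 3 (4 to 5 on Windows), presumably because the nested filepath.Join adds one allocation per call. testing.AllocsPerRun also works outside a test, so the measurement can be sketched directly; the closure body is a stand-in for n.Filepath():

package main

import (
	"fmt"
	"path/filepath"
	"strings"
	"testing"
)

func main() {
	allocs := testing.AllocsPerRun(1000, func() {
		_ = filepath.Join(strings.ToLower(filepath.Join("Host", "NS", "Model")), "Tag")
	})
	fmt.Println("allocs per call:", allocs)
}
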