Merge branch 'ollama:main' into main

Author: likelovewant
Committed by: GitHub
Date: 2024-05-09 09:44:31 +08:00

7 changed files with 88 additions and 41 deletions

View File

@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 func HumanBytes2(b uint64) string {
     switch {
+    case b >= GibiByte:
+        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
     case b >= MebiByte:
         return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
     case b >= KibiByte:
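Note: the hunk above only adds a GiB tier to an existing helper. A minimal, self-contained sketch of the same tiered binary-unit formatting, with the KibiByte/MebiByte/GibiByte constants and the fallback branch defined locally for illustration (the real constants and remaining cases live elsewhere in the package):

package main

import "fmt"

const (
    KibiByte = 1 << 10
    MebiByte = 1 << 20
    GibiByte = 1 << 30
)

// humanBytes2 mirrors the shape of HumanBytes2 after this change:
// pick the largest binary unit the value reaches and print one decimal.
func humanBytes2(b uint64) string {
    switch {
    case b >= GibiByte:
        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
    case b >= MebiByte:
        return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
    case b >= KibiByte:
        return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
    default:
        return fmt.Sprintf("%d B", b)
    }
}

func main() {
    fmt.Println(humanBytes2(3 << 30)) // "3.0 GiB"
}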

View File

@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     // Split up the GPUs by type and try them
     for _, gpus := range allGpus.ByLibrary() {
         var layerCount int
-        layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+        layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
         if opts.NumGPU < 0 {
             if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                 return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     return false, estimatedVRAM
 }

-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-    if gpus[0].Library == "cpu" {
-        return 0, 0
-    }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
     var memoryAvailable uint64
     for _, info := range gpus {
         memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
     memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()

-    if memoryRequiredPartial > memoryAvailable {
-        slog.Debug("insufficient VRAM to load any model layers")
-        return 0, 0
-    }
-
     var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
             ),
         ),
     )
-    return layerCount, uint64(memoryRequiredPartial)
+    if gpus[0].Library == "cpu" {
+        return 0, 0, memoryRequiredTotal
+    }
+    if memoryRequiredPartial > memoryAvailable {
+        slog.Debug("insufficient VRAM to load any model layers")
+        return 0, 0, memoryRequiredTotal
+    }
+
+    return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
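The net effect of the hunks above is that EstimateGPULayers now returns a third value, the total memory the model needs, and that the CPU-only and insufficient-VRAM early returns are deferred until after that total has been computed, so callers still learn the model's full size. A rough, self-contained sketch of that contract, using simplified stand-in types (gpuInfo, layerSizes, and overhead are illustrative, not the real GGML-derived values):

package main

import (
    "fmt"
    "log/slog"
)

// gpuInfo is a simplified stand-in for the real GPU descriptor.
type gpuInfo struct {
    Library    string
    FreeMemory uint64
}

// estimateLayers mimics the new EstimateGPULayers contract:
// (layers that fit, VRAM needed for that partial load, total model size).
func estimateLayers(gpus []gpuInfo, layerSizes []uint64, overhead uint64) (int, uint64, uint64) {
    var available uint64
    for _, g := range gpus {
        available += g.FreeMemory
    }

    // The total is computed up front so every return path can report it.
    total := overhead
    for _, s := range layerSizes {
        total += s
    }

    layerCount := 0
    required := overhead
    for _, s := range layerSizes {
        if required+s > available {
            break
        }
        required += s
        layerCount++
    }

    if gpus[0].Library == "cpu" {
        return 0, 0, total
    }
    if layerCount == 0 {
        slog.Debug("insufficient VRAM to load any model layers")
        return 0, 0, total
    }
    return layerCount, required, total
}

func main() {
    gpus := []gpuInfo{{Library: "cuda", FreeMemory: 6 << 30}}
    layers, vram, total := estimateLayers(gpus, []uint64{1 << 30, 1 << 30, 1 << 30}, 512<<20)
    fmt.Println(layers, vram, total)
}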

View File

@@ -49,7 +49,10 @@ type llmServer struct {
     options api.Options

     // TODO - this should be broken down by GPU
     estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+    estimatedTotal uint64 // Total size of model
+    totalLayers uint64
+    gpuCount int

     sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     cpuRunner := ""
     var estimatedVRAM uint64
+    var estimatedTotal uint64
     var systemMemory uint64
+    gpuCount := len(gpus)
     if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
         // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
         cpuRunner = serverForCpu()
+        gpuCount = 0
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
     }

     var layers int
-    layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+    layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

     if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
         // disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     } else {
         slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
         servers = []string{demandLib}
+        if strings.HasPrefix(demandLib, "cpu") {
+            // Omit the GPU flag to silence the warning
+            opts.NumGPU = -1
+        }
     }
 }
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
             continue
         }

+        if strings.HasPrefix(servers[i], "cpu") {
+            // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+            gpuCount = 0
+        }
+
         // Find an availableServers port, retry on each iterration in case the failure was a port conflict race
         port := 0
         if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }

         s := &llmServer{
-            port:          port,
-            cmd:           exec.Command(server, finalParams...),
-            status:        NewStatusWriter(os.Stderr),
-            options:       opts,
-            estimatedVRAM: estimatedVRAM,
-            sem:           semaphore.NewWeighted(int64(numParallel)),
+            port:           port,
+            cmd:            exec.Command(server, finalParams...),
+            status:         NewStatusWriter(os.Stderr),
+            options:        opts,
+            estimatedVRAM:  estimatedVRAM,
+            estimatedTotal: estimatedTotal,
+            sem:            semaphore.NewWeighted(int64(numParallel)),
+            totalLayers:    ggml.KV().BlockCount() + 1,
+            gpuCount:       gpuCount,
         }

         s.cmd.Env = os.Environ()
@@ -307,6 +325,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         slog.Debug("subprocess", "environment", s.cmd.Env)

         if err = s.cmd.Start(); err != nil {
+            // Detect permission denied and augment the message about noexec
+            if errors.Is(err, os.ErrPermission) {
+                finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+                continue
+            }
             msg := ""
             if s.status != nil && s.status.LastErrMsg != "" {
                 msg = s.status.LastErrMsg
@@ -382,6 +405,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
         if s.status != nil && s.status.LastErrMsg != "" {
             msg = s.status.LastErrMsg
         }
+        if s.cmd.ProcessState.ExitCode() == -1 {
+            // Most likely a signal killed it, log some more details to try to help troubleshoot
+            slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+        }
         return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
     }
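Two defensive checks are added above: a friendlier error when the runner binary cannot be started because of permissions (commonly a noexec mount), and extra logging when the runner exits with code -1, which usually means a signal killed it. A standalone sketch of both checks using only the standard library (the binary path and messages are illustrative, not ollama's):

package main

import (
    "errors"
    "fmt"
    "os"
    "os/exec"
)

func runServer(binary string) error {
    cmd := exec.Command(binary)

    if err := cmd.Start(); err != nil {
        // Permission errors on start often mean the binary sits on a
        // filesystem mounted noexec; surface a more actionable message.
        if errors.Is(err, os.ErrPermission) {
            return fmt.Errorf("unable to start %s: %w (is the directory mounted noexec?)", binary, err)
        }
        return err
    }

    err := cmd.Wait()

    // ExitCode() == -1 after Wait means the process did not exit normally,
    // typically because a signal killed it; log the raw state for debugging.
    if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() == -1 {
        fmt.Fprintln(os.Stderr, "process killed by signal:", cmd.ProcessState.String())
    }
    return err
}

func main() {
    if err := runServer("/bin/true"); err != nil {
        fmt.Fprintln(os.Stderr, "error:", err)
    }
}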

View File

@@ -68,6 +68,20 @@ func (m *Model) String() string {
         Args: m.ModelPath,
     })

+    for _, adapter := range m.AdapterPaths {
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "adapter",
+            Args: adapter,
+        })
+    }
+
+    for _, projector := range m.ProjectorPaths {
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "model",
+            Args: projector,
+        })
+    }
+
     if m.Template != "" {
         modelfile.Commands = append(modelfile.Commands, model.Command{
             Name: "template",
@@ -82,20 +96,6 @@ func (m *Model) String() string {
         })
     }

-    for _, adapter := range m.AdapterPaths {
-        modelfile.Commands = append(modelfile.Commands, model.Command{
-            Name: "adapter",
-            Args: adapter,
-        })
-    }
-
-    for _, projector := range m.ProjectorPaths {
-        modelfile.Commands = append(modelfile.Commands, model.Command{
-            Name: "projector",
-            Args: projector,
-        })
-    }
-
     for k, v := range m.Options {
         switch v := v.(type) {
         case []any:
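These two hunks move the adapter and projector commands so they are emitted right after the base model command, and write projectors out under the "model" command name instead of "projector". A simplified sketch of the resulting ordering, with a local Command type standing in for the server's model.Command:

package main

import "fmt"

// Command is a simplified stand-in for a Modelfile command entry.
type Command struct {
    Name string
    Args string
}

// buildCommands mirrors the new ordering from the hunks above: the base
// model comes first, then adapters and projectors (projectors are written
// back out as "model" lines), and only then template and other settings.
func buildCommands(modelPath string, adapters, projectors []string, template string) []Command {
    cmds := []Command{{Name: "model", Args: modelPath}}

    for _, adapter := range adapters {
        cmds = append(cmds, Command{Name: "adapter", Args: adapter})
    }
    for _, projector := range projectors {
        cmds = append(cmds, Command{Name: "model", Args: projector})
    }

    if template != "" {
        cmds = append(cmds, Command{Name: "template", Args: template})
    }
    return cmds
}

func main() {
    for _, c := range buildCommands("weights.gguf", []string{"lora.bin"}, nil, "{{ .Prompt }}") {
        fmt.Printf("%s %q\n", c.Name, c.Args)
    }
}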

View File

@@ -935,6 +935,11 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
         }

         if allowedHost(host) {
+            if c.Request.Method == "OPTIONS" {
+                c.AbortWithStatus(http.StatusNoContent)
+                return
+            }
+
             c.Next()
             return
         }
@@ -947,6 +952,7 @@ func (s *Server) GenerateRoutes() http.Handler {
     config := cors.DefaultConfig()
     config.AllowWildcard = true
     config.AllowBrowserExtensions = true
+    config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
     config.AllowOrigins = envconfig.AllowOrigins

     r := gin.Default()
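The router changes above answer CORS preflight (OPTIONS) requests for allowed hosts with 204 No Content and declare which request headers the API accepts. A standard-library sketch of the same behaviour, not the gin/cors middleware from the diff (the origin and route below are placeholders):

package main

import (
    "log"
    "net/http"
)

// corsPreflight answers OPTIONS preflight requests with 204 No Content and
// advertises the request headers the API is willing to accept.
func corsPreflight(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Access-Control-Allow-Origin", "http://localhost")
        w.Header().Set("Access-Control-Allow-Headers",
            "Authorization, Content-Type, User-Agent, Accept, X-Requested-With")

        if r.Method == http.MethodOptions {
            // Preflight: no body is needed, 204 ends the exchange.
            w.WriteHeader(http.StatusNoContent)
            return
        }
        next.ServeHTTP(w, r)
    })
}

func main() {
    mux := http.NewServeMux()
    mux.HandleFunc("/api/tags", func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte("ok"))
    })
    log.Fatal(http.ListenAndServe("127.0.0.1:8080", corsPreflight(mux)))
}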

View File

@@ -290,12 +290,14 @@ func (n Name) Filepath() string {
     if !n.IsFullyQualified() {
         panic("illegal attempt to get filepath of invalid name")
     }
-    return strings.ToLower(filepath.Join(
-        n.Host,
-        n.Namespace,
-        n.Model,
-        n.Tag,
-    ))
+    return filepath.Join(
+        strings.ToLower(filepath.Join(
+            n.Host,
+            n.Namespace,
+            n.Model,
+        )),
+        n.Tag,
+    )
 }

 // LogValue returns a slog.Value that represents the name as a string.
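Filepath previously lowercased the entire joined path; the rewrite lowercases only host, namespace, and model, leaving the tag's case intact (for example, quantization suffixes such as "q6_K"). A small sketch of that intent with plain strings in place of the Name type:

package main

import (
    "fmt"
    "path/filepath"
    "strings"
)

// nameFilepath mirrors the intent of the Filepath change above: host,
// namespace and model are case-insensitive and stored lowercased, while
// the tag keeps its original case.
func nameFilepath(host, namespace, model, tag string) string {
    return filepath.Join(
        strings.ToLower(filepath.Join(host, namespace, model)),
        tag,
    )
}

func main() {
    fmt.Println(nameFilepath("registry.ollama.ai", "Library", "Dolphin-Mistral", "7b-v2.6-dpo-laser-q6_K"))
    // registry.ollama.ai/library/dolphin-mistral/7b-v2.6-dpo-laser-q6_K (on Unix)
}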

View File

@@ -19,6 +19,16 @@ func TestParseNameParts(t *testing.T) {
         wantFilepath    string
         wantValidDigest bool
     }{
+        {
+            in: "registry.ollama.ai/library/dolphin-mistral:7b-v2.6-dpo-laser-q6_K",
+            want: Name{
+                Host:      "registry.ollama.ai",
+                Namespace: "library",
+                Model:     "dolphin-mistral",
+                Tag:       "7b-v2.6-dpo-laser-q6_K",
+            },
+            wantFilepath: filepath.Join("registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"),
+        },
         {
             in: "scheme://host:port/namespace/model:tag",
             want: Name{
@@ -266,9 +276,9 @@ func TestFilepathAllocs(t *testing.T) {
     allocs := testing.AllocsPerRun(1000, func() {
         n.Filepath()
     })
-    allowedAllocs := 2.0
+    var allowedAllocs float64 = 3
     if runtime.GOOS == "windows" {
-        allowedAllocs = 4
+        allowedAllocs = 5
     }
     if allocs > allowedAllocs {
         t.Errorf("allocs = %v; allowed %v", allocs, allowedAllocs)