Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2024-05-16 22:24:44 +08:00
committed by GitHub
18 changed files with 285 additions and 98 deletions

View File

@@ -377,7 +377,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Testcontainers](https://testcontainers.com/modules/ollama/) - [Testcontainers](https://testcontainers.com/modules/ollama/)
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama) - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript) - [LlamaScript](https://github.com/Project-Llama/llamascript)
### Mobile ### Mobile
- [Enchanted](https://github.com/AugustDev/enchanted) - [Enchanted](https://github.com/AugustDev/enchanted)

View File

@@ -354,6 +354,15 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
return &lr, nil return &lr, nil
} }
// List running models.
func (c *Client) ListRunning(ctx context.Context) (*ListResponse, error) {
var lr ListResponse
if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
return nil, err
}
return &lr, nil
}
// Copy copies a model - creating a model with another name from an existing // Copy copies a model - creating a model with another name from an existing
// model. // model.
func (c *Client) Copy(ctx context.Context, req *CopyRequest) error { func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
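The new `ListRunning` method is a thin wrapper over `GET /api/ps`. A minimal usage sketch, assuming only what this commit adds (`ListRunning` plus the `SizeVRAM` and `ExpiresAt` fields added to `ModelResponse` below); the program itself is illustrative:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// GET /api/ps: models currently loaded into memory.
	running, err := client.ListRunning(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	for _, m := range running.Models {
		fmt.Printf("%s\t%d bytes in VRAM, expires %s\n", m.Name, m.SizeVRAM, m.ExpiresAt)
	}
}
```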

View File

@@ -289,10 +289,12 @@ type ListResponse struct {
type ModelResponse struct { type ModelResponse struct {
Name string `json:"name"` Name string `json:"name"`
Model string `json:"model"` Model string `json:"model"`
ModifiedAt time.Time `json:"modified_at"` ModifiedAt time.Time `json:"modified_at,omitempty"`
Size int64 `json:"size"` Size int64 `json:"size"`
Digest string `json:"digest"` Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"` Details ModelDetails `json:"details,omitempty"`
ExpiresAt time.Time `json:"expires_at,omitempty"`
SizeVRAM int64 `json:"size_vram,omitempty"`
} }
type TokenResponse struct { type TokenResponse struct {
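`ModelResponse` gains `expires_at` and `size_vram`, which back the new running-model listing. A quick sketch of how the extended shape serializes; the struct below is a local mirror for illustration, not the `api` package's type, and all values are hypothetical:

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// modelResponse mirrors the extended api.ModelResponse fields for illustration only.
type modelResponse struct {
	Name      string    `json:"name"`
	Model     string    `json:"model"`
	Size      int64     `json:"size"`
	SizeVRAM  int64     `json:"size_vram,omitempty"`
	Digest    string    `json:"digest"`
	ExpiresAt time.Time `json:"expires_at,omitempty"`
}

func main() {
	b, _ := json.MarshalIndent(modelResponse{
		Name:      "llama3:latest", // hypothetical model
		Model:     "llama3:latest",
		Size:      4_700_000_000,
		SizeVRAM:  4_700_000_000,
		Digest:    "365c0bd3c000", // hypothetical digest
		ExpiresAt: time.Now().Add(5 * time.Minute),
	}, "", "  ")
	fmt.Println(string(b))
}
```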

View File

@@ -12,6 +12,7 @@ import (
"fmt" "fmt"
"io" "io"
"log" "log"
"math"
"net" "net"
"net/http" "net/http"
"os" "os"
@@ -324,6 +325,18 @@ func RunHandler(cmd *cobra.Command, args []string) error {
} }
opts.Format = format opts.Format = format
keepAlive, err := cmd.Flags().GetString("keepalive")
if err != nil {
return err
}
if keepAlive != "" {
d, err := time.ParseDuration(keepAlive)
if err != nil {
return err
}
opts.KeepAlive = &api.Duration{Duration: d}
}
prompts := args[1:] prompts := args[1:]
// prepend stdin to the prompt if provided // prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) { if !term.IsTerminal(int(os.Stdin.Fd())) {
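`ollama run` gains a `--keepalive` flag whose value is parsed with `time.ParseDuration` and forwarded as an `api.Duration` on chat and generate requests. A standalone sketch of the same parsing pattern (the standard `flag` package stands in for cobra here):

```go
package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Illustrative stand-in; the CLI reads the value via cobra's cmd.Flags().GetString("keepalive").
	keepAlive := flag.String("keepalive", "", "duration to keep a model loaded (e.g. 5m)")
	flag.Parse()

	if *keepAlive != "" {
		d, err := time.ParseDuration(*keepAlive)
		if err != nil {
			fmt.Println("invalid --keepalive value:", err)
			return
		}
		fmt.Println("model will stay loaded for", d)
	}
}
```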
@@ -496,6 +509,52 @@ func ListHandler(cmd *cobra.Command, args []string) error {
return nil return nil
} }
func ListRunningHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
models, err := client.ListRunning(cmd.Context())
if err != nil {
return err
}
var data [][]string
for _, m := range models.Models {
if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
var procStr string
switch {
case m.SizeVRAM == 0:
procStr = "100% CPU"
case m.SizeVRAM == m.Size:
procStr = "100% GPU"
case m.SizeVRAM > m.Size || m.Size == 0:
procStr = "Unknown"
default:
sizeCPU := m.Size - m.SizeVRAM
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
}
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
}
}
table := tablewriter.NewWriter(os.Stdout)
table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
table.SetAlignment(tablewriter.ALIGN_LEFT)
table.SetHeaderLine(false)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding("\t")
table.AppendBulk(data)
table.Render()
return nil
}
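The `PROCESSOR` column is derived from how much of the model is resident in VRAM. A distilled sketch of that calculation, with hypothetical sizes:

```go
package main

import (
	"fmt"
	"math"
)

// processorSplit mirrors the switch in ListRunningHandler: given the total model
// size and the portion resident in VRAM, report how the load is split.
func processorSplit(size, sizeVRAM int64) string {
	switch {
	case sizeVRAM == 0:
		return "100% CPU"
	case sizeVRAM == size:
		return "100% GPU"
	case sizeVRAM > size || size == 0:
		return "Unknown"
	default:
		cpuPercent := math.Round(float64(size-sizeVRAM) / float64(size) * 100)
		return fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
	}
}

func main() {
	fmt.Println(processorSplit(8_000_000_000, 6_000_000_000)) // 25%/75% CPU/GPU
}
```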
func DeleteHandler(cmd *cobra.Command, args []string) error { func DeleteHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment() client, err := api.ClientFromEnvironment()
if err != nil { if err != nil {
@@ -672,6 +731,7 @@ type runOptions struct {
Images []api.ImageData Images []api.ImageData
Options map[string]interface{} Options map[string]interface{}
MultiModal bool MultiModal bool
KeepAlive *api.Duration
} }
type displayResponseState struct { type displayResponseState struct {
@@ -766,6 +826,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
Options: opts.Options, Options: opts.Options,
} }
if opts.KeepAlive != nil {
req.KeepAlive = opts.KeepAlive
}
if err := client.Chat(cancelCtx, req, fn); err != nil { if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) { if errors.Is(err, context.Canceled) {
return nil, nil return nil, nil
@@ -841,14 +905,15 @@ func generate(cmd *cobra.Command, opts runOptions) error {
} }
request := api.GenerateRequest{ request := api.GenerateRequest{
Model: opts.Model, Model: opts.Model,
Prompt: opts.Prompt, Prompt: opts.Prompt,
Context: generateContext, Context: generateContext,
Images: opts.Images, Images: opts.Images,
Format: opts.Format, Format: opts.Format,
System: opts.System, System: opts.System,
Template: opts.Template, Template: opts.Template,
Options: opts.Options, Options: opts.Options,
KeepAlive: opts.KeepAlive,
} }
if err := client.Generate(ctx, &request, fn); err != nil { if err := client.Generate(ctx, &request, fn); err != nil {
@@ -1075,6 +1140,7 @@ func NewCLI() *cobra.Command {
RunE: RunHandler, RunE: RunHandler,
} }
runCmd.Flags().String("keepalive", "", "Duration to keep a model loaded (e.g. 5m)")
runCmd.Flags().Bool("verbose", false, "Show timings for response") runCmd.Flags().Bool("verbose", false, "Show timings for response")
runCmd.Flags().Bool("insecure", false, "Use an insecure registry") runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically") runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
@@ -1090,9 +1156,9 @@ func NewCLI() *cobra.Command {
Environment Variables: Environment Variables:
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434") OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins. OLLAMA_ORIGINS A comma separated list of allowed origins
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models") OLLAMA_MODELS The path to the models directory (default "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m") OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default "5m")
OLLAMA_DEBUG Set to 1 to enable additional debug logging OLLAMA_DEBUG Set to 1 to enable additional debug logging
`) `)
@@ -1123,6 +1189,14 @@ Environment Variables:
PreRunE: checkServerHeartbeat, PreRunE: checkServerHeartbeat,
RunE: ListHandler, RunE: ListHandler,
} }
psCmd := &cobra.Command{
Use: "ps",
Short: "List running models",
PreRunE: checkServerHeartbeat,
RunE: ListRunningHandler,
}
copyCmd := &cobra.Command{ copyCmd := &cobra.Command{
Use: "cp SOURCE DESTINATION", Use: "cp SOURCE DESTINATION",
Short: "Copy a model", Short: "Copy a model",
@@ -1146,6 +1220,7 @@ Environment Variables:
pullCmd, pullCmd,
pushCmd, pushCmd,
listCmd, listCmd,
psCmd,
copyCmd, copyCmd,
deleteCmd, deleteCmd,
} { } {
@@ -1160,6 +1235,7 @@ Environment Variables:
pullCmd, pullCmd,
pushCmd, pushCmd,
listCmd, listCmd,
psCmd,
copyCmd, copyCmd,
deleteCmd, deleteCmd,
) )

View File

@@ -17,6 +17,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/progress" "github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline" "github.com/ollama/ollama/readline"
"github.com/ollama/ollama/types/errtypes"
) )
type MultilineState int type MultilineState int
@@ -56,6 +57,11 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
Model: opts.Model, Model: opts.Model,
Messages: []api.Message{}, Messages: []api.Message{},
} }
if opts.KeepAlive != nil {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear() p.StopAndClear()
if len(opts.Messages) > 0 { if len(opts.Messages) > 0 {
@@ -276,13 +282,20 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fn := func(resp api.ProgressResponse) error { return nil } fn := func(resp api.ProgressResponse) error { return nil }
err = client.Create(cmd.Context(), req, fn) err = client.Create(cmd.Context(), req, fn)
if err != nil { if err != nil {
fmt.Println("error: couldn't save model") if strings.Contains(err.Error(), errtypes.InvalidModelNameErrMsg) {
fmt.Printf("error: The model name '%s' is invalid\n", args[1])
continue
}
return err return err
} }
fmt.Printf("Created new model '%s'\n", args[1]) fmt.Printf("Created new model '%s'\n", args[1])
continue continue
case strings.HasPrefix(line, "/clear"): case strings.HasPrefix(line, "/clear"):
opts.Messages = []api.Message{} opts.Messages = []api.Message{}
if opts.System != "" {
newMessage := api.Message{Role: "system", Content: opts.System}
opts.Messages = append(opts.Messages, newMessage)
}
fmt.Println("Cleared session context") fmt.Println("Cleared session context")
continue continue
case strings.HasPrefix(line, "/set"): case strings.HasPrefix(line, "/set"):

View File

@@ -797,9 +797,9 @@ curl http://localhost:11434/api/show -d '{
```json ```json
{ {
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"", "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSSISTANT:", "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ", "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
"details": { "details": {
"format": "gguf", "format": "gguf",
"family": "llama", "family": "llama",

View File

@@ -80,17 +80,19 @@ If Ollama is run as a systemd service, environment variables should be set using
### Setting environment variables on Windows ### Setting environment variables on Windows
On windows, Ollama inherits your user and system environment variables. On Windows, Ollama inherits your user and system environment variables.
1. First Quit Ollama by clicking on it in the task bar 1. First Quit Ollama by clicking on it in the task bar.
2. Edit system environment variables from the control panel 2. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc. 3. Click on _Edit environment variables for your account_.
4. Click OK/Apply to save 4. Edit or create a new variable for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
5. Run `ollama` from a new terminal window 5. Click OK/Apply to save.
6. Start the Ollama application from the Windows Start menu.
## How can I expose Ollama on my network? ## How can I expose Ollama on my network?

View File

@@ -60,7 +60,9 @@ func humanTime(t time.Time, zeroValue string) string {
} }
delta := time.Since(t) delta := time.Since(t)
if delta < 0 { if int(delta.Hours())/24/365 < -20 {
return "Forever"
} else if delta < 0 {
return humanDuration(-delta) + " from now" return humanDuration(-delta) + " from now"
} }
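`humanTime` now reports anything more than roughly twenty years in the future as "Forever" instead of a duration. A standalone sketch of the threshold; the real helper formats durations with `humanDuration`, so a rounded `time.Duration` stands in here:

```go
package main

import (
	"fmt"
	"time"
)

// humanish mirrors the new branch: far-future times collapse to "Forever".
func humanish(t time.Time) string {
	delta := time.Since(t)
	if int(delta.Hours())/24/365 < -20 {
		return "Forever"
	} else if delta < 0 {
		return fmt.Sprintf("%s from now", (-delta).Round(time.Second))
	}
	return fmt.Sprintf("%s ago", delta.Round(time.Second))
}

func main() {
	fmt.Println(humanish(time.Now().Add(24 * time.Hour * 365 * 200))) // Forever
	fmt.Println(humanish(time.Now().Add(30 * time.Minute)))           // 30m0s from now
}
```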

View File

@@ -32,4 +32,14 @@ func TestHumanTime(t *testing.T) {
v := now.Add(800 * time.Millisecond) v := now.Add(800 * time.Millisecond)
assertEqual(t, HumanTime(v, ""), "Less than a second from now") assertEqual(t, HumanTime(v, ""), "Less than a second from now")
}) })
t.Run("time way in the future", func(t *testing.T) {
v := now.Add(24 * time.Hour * 365 * 200)
assertEqual(t, HumanTime(v, ""), "Forever")
})
t.Run("time way in the future lowercase", func(t *testing.T) {
v := now.Add(24 * time.Hour * 365 * 200)
assertEqual(t, HumanTimeLower(v, ""), "forever")
})
} }

View File

@@ -165,7 +165,7 @@ if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}" CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi fi
if [ -d "${CUDA_LIB_DIR}" ]; then if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library" echo "CUDA libraries detected - building dynamic CUDA library"
init_vars init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true) CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
@@ -227,7 +227,7 @@ if [ -z "${CLBlast_DIR}" ]; then
fi fi
fi fi
if [ -d "${ROCM_PATH}" ]; then if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library" echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true) ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)

View File

@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
opts.NumCtx = max(opts.NumCtx, 2048) opts.NumCtx = max(opts.NumCtx, 2048)
} }
layers := ggml.Tensors().Layers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
memoryMinimum += blk0.size()
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
graphPartialOffload = graphFullOffload graphPartialOffload = graphFullOffload
} }
layers := ggml.Tensors().Layers()
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers) // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size() memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers) // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size() memoryRequiredPartial := memoryMinimum + graphPartialOffload
var memoryLayerOutput uint64 var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok { if layer, ok := layers["output_norm"]; ok {
@@ -100,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
var layerCount int var layerCount int
for i := 0; i < int(ggml.KV().BlockCount()); i++ { for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size() if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
memoryLayer := blk.size()
// KV is proportional to the number of layers // KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount() memoryLayer += kv / ggml.KV().BlockCount()
memoryRequiredTotal += memoryLayer memoryRequiredTotal += memoryLayer
if memoryAvailable > memoryRequiredPartial+memoryLayer { if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
memoryRequiredPartial += memoryLayer memoryRequiredPartial += memoryLayer
layerCount++ layerCount++
}
} }
} }
@@ -117,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
memoryRequiredTotal += memoryLayerOutput memoryRequiredTotal += memoryLayerOutput
} }
if memoryAvailable > memoryRequiredTotal { if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
layerCount = int(ggml.KV().BlockCount()) + 1 layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal memoryRequiredPartial = memoryRequiredTotal
} }
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
"offload to gpu", "offload to gpu",
slog.Group( slog.Group(
"layers", "layers",
// actual number of layers offloaded // requested number of layers to offload
"real", opts.NumGPU, "requested", opts.NumGPU,
// estimated number of layers that can be offloaded // estimated number of layers that can be offloaded
"estimate", layerCount, "real", layerCount,
), ),
slog.Group( slog.Group(
"memory", "memory",

View File

@@ -38,6 +38,7 @@ type LlamaServer interface {
Detokenize(ctx context.Context, tokens []int) (string, error) Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error Close() error
EstimatedVRAM() uint64 EstimatedVRAM() uint64
EstimatedTotal() uint64
} }
// llmServer is an instance of the llama.cpp server // llmServer is an instance of the llama.cpp server
@@ -88,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
gpuCount = 0 gpuCount = 0
_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
if gpus[0].Library == "metal" { if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem() memInfo, err := gpu.GetCPUMem()
@@ -316,8 +318,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} }
slog.Info("starting llama server", "cmd", s.cmd.String()) slog.Info("starting llama server", "cmd", s.cmd.String())
// Log at debug as the environment is inherited and might contain sensitive information if envconfig.Debug {
slog.Debug("subprocess", "environment", s.cmd.Env) filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
if err = s.cmd.Start(); err != nil { if err = s.cmd.Start(); err != nil {
// Detect permission denied and augment the message about noexec
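Debug logging of the subprocess environment is now gated on `envconfig.Debug` and restricted to a whitelist of prefixes. A standalone sketch of that filtering; the prefix list is copied from the hunk above, everything else is illustrative:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// filterEnv keeps only variables relevant to GPU runtimes and library loading,
// the same prefixes the server whitelists before logging its subprocess environment.
func filterEnv(env []string) []string {
	prefixes := []string{"CUDA_", "ROCM_", "HIP_", "HSA_", "GGML_", "PATH=", "LD_LIBRARY_PATH="}
	var out []string
	for _, ev := range env {
		for _, p := range prefixes {
			if strings.HasPrefix(ev, p) {
				out = append(out, ev)
				break
			}
		}
	}
	return out
}

func main() {
	fmt.Println(filterEnv(os.Environ()))
}
```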
@@ -955,6 +971,10 @@ func (s *llmServer) EstimatedVRAM() uint64 {
return s.estimatedVRAM return s.estimatedVRAM
} }
func (s *llmServer) EstimatedTotal() uint64 {
return s.estimatedTotal
}
func parseDurationMs(ms float64) time.Duration { func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil { if err != nil {

View File

@@ -221,7 +221,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
} }
defer resp.Body.Close() defer resp.Body.Close()
n, err := io.Copy(w, io.TeeReader(resp.Body, part)) n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size)
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) { if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
// rollback progress // rollback progress
b.Completed.Add(-n) b.Completed.Add(-n)
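Switching the chunk download from `io.Copy` to `io.CopyN` caps each read at the part's declared size, presumably so a response body that delivers more than the expected range cannot skew the part accounting. A minimal sketch of the property `CopyN` provides (reader contents and sizes are illustrative):

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

func main() {
	src := strings.NewReader("0123456789-and-some-trailing-bytes")

	var dst strings.Builder
	// io.CopyN stops after exactly n bytes (or returns io.EOF if the source is
	// shorter), which is the bound the chunked download now relies on.
	n, err := io.CopyN(&dst, src, 10)
	fmt.Println(n, err, dst.String()) // 10 <nil> 0123456789
}
```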

View File

@@ -30,6 +30,7 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai" "github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/envconfig" "github.com/ollama/ollama/server/envconfig"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version" "github.com/ollama/ollama/version"
) )
@@ -517,7 +518,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
name := model.ParseName(cmp.Or(req.Model, req.Name)) name := model.ParseName(cmp.Or(req.Model, req.Name))
if !name.IsValid() { if !name.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"}) c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": errtypes.InvalidModelNameErrMsg})
return return
} }
@@ -708,7 +709,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
} }
var sb strings.Builder var sb strings.Builder
fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"") fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:") fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName) fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
fmt.Fprint(&sb, model.String()) fmt.Fprint(&sb, model.String())
@@ -724,7 +725,7 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
return return
} }
var models []api.ModelResponse models := []api.ModelResponse{}
if err := filepath.Walk(manifests, func(path string, info os.FileInfo, _ error) error { if err := filepath.Walk(manifests, func(path string, info os.FileInfo, _ error) error {
if !info.IsDir() { if !info.IsDir() {
rel, err := filepath.Rel(manifests, path) rel, err := filepath.Rel(manifests, path)
@@ -979,6 +980,7 @@ func (s *Server) GenerateRoutes() http.Handler {
r.POST("/api/show", s.ShowModelHandler) r.POST("/api/show", s.ShowModelHandler)
r.POST("/api/blobs/:digest", s.CreateBlobHandler) r.POST("/api/blobs/:digest", s.CreateBlobHandler)
r.HEAD("/api/blobs/:digest", s.HeadBlobHandler) r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
r.GET("/api/ps", s.ProcessHandler)
// Compatibility endpoints // Compatibility endpoints
r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler) r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler)
@@ -1137,6 +1139,42 @@ func streamResponse(c *gin.Context, ch chan any) {
}) })
} }
func (s *Server) ProcessHandler(c *gin.Context) {
models := []api.ModelResponse{}
for _, v := range s.sched.loaded {
model := v.model
modelDetails := api.ModelDetails{
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
ParameterSize: model.Config.ModelType,
QuantizationLevel: model.Config.FileType,
}
mr := api.ModelResponse{
Model: model.ShortName,
Name: model.ShortName,
Size: int64(v.estimatedTotal),
SizeVRAM: int64(v.estimatedVRAM),
Digest: model.Digest,
Details: modelDetails,
ExpiresAt: v.expiresAt,
}
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just
// calculate the time w/ the sessionDuration instead.
var epoch time.Time
if v.expiresAt == epoch {
mr.ExpiresAt = time.Now().Add(v.sessionDuration)
}
models = append(models, mr)
}
c.JSON(http.StatusOK, api.ListResponse{Models: models})
}
// ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) { func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) {
encode := func(s string) ([]int, error) { encode := func(s string) ([]int, error) {
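The new `ProcessHandler` reports each loaded runner; when the scheduler has not yet stamped `expiresAt`, the handler derives an expiry from the session duration instead. A standalone sketch of that fallback (names mirror the handler, the rest is illustrative):

```go
package main

import (
	"fmt"
	"time"
)

// effectiveExpiry mirrors the handler's fallback: a zero time.Time means the
// runner is still loading, so the expiry is derived from the session duration.
func effectiveExpiry(expiresAt time.Time, sessionDuration time.Duration) time.Time {
	if expiresAt.IsZero() {
		return time.Now().Add(sessionDuration)
	}
	return expiresAt
}

func main() {
	fmt.Println(effectiveExpiry(time.Time{}, 5*time.Minute)) // roughly five minutes from now
}
```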

View File

@@ -95,6 +95,7 @@ func Test_Routes(t *testing.T) {
err = json.Unmarshal(body, &modelList) err = json.Unmarshal(body, &modelList)
assert.Nil(t, err) assert.Nil(t, err)
assert.NotNil(t, modelList.Models)
assert.Equal(t, 0, len(modelList.Models)) assert.Equal(t, 0, len(modelList.Models))
}, },
}, },

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"log/slog" "log/slog"
"reflect" "reflect"
"runtime"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@@ -177,7 +178,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
// Trigger an expiration to unload once it's done // Trigger an expiration to unload once it's done
runnerToExpire.refMu.Lock() runnerToExpire.refMu.Lock()
slog.Debug("resetting model to expire immediately to make room", "model", runnerToExpire.model, "refCount", runnerToExpire.refCount) slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount)
if runnerToExpire.expireTimer != nil { if runnerToExpire.expireTimer != nil {
runnerToExpire.expireTimer.Stop() runnerToExpire.expireTimer.Stop()
runnerToExpire.expireTimer = nil runnerToExpire.expireTimer = nil
@@ -190,13 +191,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
// Wait for the unload to happen // Wait for the unload to happen
// Note: at this point we're queueing up all incoming requests, even if they were for // Note: at this point we're queueing up all incoming requests, even if they were for
// a different model that's loaded and not scheduled to be removed. // a different model that's loaded and not scheduled to be removed.
slog.Debug("waiting for pending requests to complete and unload to occur", "model", runnerToExpire.model) slog.Debug("waiting for pending requests to complete and unload to occur", "modelPath", runnerToExpire.modelPath)
select { select {
case <-ctx.Done(): case <-ctx.Done():
slog.Debug("shutting down scheduler pending loop") slog.Debug("shutting down scheduler pending loop")
return return
case <-s.unloadedCh: case <-s.unloadedCh:
slog.Debug("unload completed", "model", runnerToExpire.model) slog.Debug("unload completed", "modelPath", runnerToExpire.modelPath)
continue continue
} }
} }
@@ -219,23 +220,23 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner := s.loaded[finished.model.ModelPath] runner := s.loaded[finished.model.ModelPath]
s.loadedMu.Unlock() s.loadedMu.Unlock()
if runner == nil { if runner == nil {
slog.Error("finished requeset signal received after model unloaded", "model", finished.model.ModelPath) slog.Error("finished requeset signal received after model unloaded", "modelPath", finished.model.ModelPath)
continue continue
} }
runner.refMu.Lock() runner.refMu.Lock()
runner.refCount-- runner.refCount--
if runner.refCount <= 0 { if runner.refCount <= 0 {
if runner.sessionDuration <= 0 { if runner.sessionDuration <= 0 {
slog.Debug("runner with zero duration has gone idle, expiring to unload", "model", runner.model) slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath)
if runner.expireTimer != nil { if runner.expireTimer != nil {
runner.expireTimer.Stop() runner.expireTimer.Stop()
runner.expireTimer = nil runner.expireTimer = nil
} }
s.expiredCh <- runner s.expiredCh <- runner
} else if runner.expireTimer == nil { } else if runner.expireTimer == nil {
slog.Debug("runner with non-zero duration has gone idle, adding timer", "model", runner.model, "duration", runner.sessionDuration) slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() { runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() {
slog.Debug("timer expired, expiring to unload", "model", runner.model) slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath)
runner.refMu.Lock() runner.refMu.Lock()
defer runner.refMu.Unlock() defer runner.refMu.Unlock()
if runner.expireTimer != nil { if runner.expireTimer != nil {
@@ -244,19 +245,21 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
} }
s.expiredCh <- runner s.expiredCh <- runner
}) })
runner.expiresAt = time.Now().Add(runner.sessionDuration)
} else { } else {
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "model", runner.model, "duration", runner.sessionDuration) slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
runner.expireTimer.Reset(runner.sessionDuration) runner.expireTimer.Reset(runner.sessionDuration)
runner.expiresAt = time.Now().Add(runner.sessionDuration)
} }
} }
slog.Debug("after processing request finished event", "model", runner.model, "refCount", runner.refCount) slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount)
runner.refMu.Unlock() runner.refMu.Unlock()
case runner := <-s.expiredCh: case runner := <-s.expiredCh:
slog.Debug("runner expired event received", "model", runner.model) slog.Debug("runner expired event received", "modelPath", runner.modelPath)
runner.refMu.Lock() runner.refMu.Lock()
if runner.refCount > 0 { if runner.refCount > 0 {
// Shouldn't happen, but safeguard to ensure no leaked runners // Shouldn't happen, but safeguard to ensure no leaked runners
slog.Debug("expired event with positive ref count, retrying", "model", runner.model, "refCount", runner.refCount) slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
go func(runner *runnerRef) { go func(runner *runnerRef) {
// We can't unload yet, but want to as soon as the current request completes // We can't unload yet, but want to as soon as the current request completes
// So queue up another expired event // So queue up another expired event
@@ -268,16 +271,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
} }
s.loadedMu.Lock() s.loadedMu.Lock()
slog.Debug("got lock to unload", "model", runner.model) slog.Debug("got lock to unload", "modelPath", runner.modelPath)
finished := runner.waitForVRAMRecovery() finished := runner.waitForVRAMRecovery()
runner.unload() runner.unload()
delete(s.loaded, runner.model) delete(s.loaded, runner.modelPath)
s.loadedMu.Unlock() s.loadedMu.Unlock()
slog.Debug("runner released", "model", runner.model) slog.Debug("runner released", "modelPath", runner.modelPath)
runner.refMu.Unlock() runner.refMu.Unlock()
<-finished <-finished
slog.Debug("sending an unloaded event", "model", runner.model) slog.Debug("sending an unloaded event", "modelPath", runner.modelPath)
s.unloadedCh <- struct{}{} s.unloadedCh <- struct{}{}
} }
} }
@@ -316,18 +319,20 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
req.errCh <- err req.errCh <- err
return return
} }
runner := &runnerRef{} runner := &runnerRef{
runner.model = req.model.ModelPath model: req.model,
runner.adapters = req.model.AdapterPaths modelPath: req.model.ModelPath,
runner.projectors = req.model.ProjectorPaths llama: llama,
runner.llama = llama Options: &req.opts,
runner.Options = &req.opts sessionDuration: req.sessionDuration,
runner.sessionDuration = req.sessionDuration gpus: gpus,
runner.gpus = gpus estimatedVRAM: llama.EstimatedVRAM(),
runner.estimatedVRAM = llama.EstimatedVRAM() estimatedTotal: llama.EstimatedTotal(),
runner.loading = true loading: true,
runner.refCount = 1 refCount: 1,
}
runner.refMu.Lock() runner.refMu.Lock()
s.loadedMu.Lock() s.loadedMu.Lock()
s.loaded[req.model.ModelPath] = runner s.loaded[req.model.ModelPath] = runner
slog.Info("loaded runners", "count", len(s.loaded)) slog.Info("loaded runners", "count", len(s.loaded))
@@ -339,7 +344,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
slog.Error("error loading llama server", "error", err) slog.Error("error loading llama server", "error", err)
runner.refCount-- runner.refCount--
req.errCh <- err req.errCh <- err
slog.Debug("triggering expiration for failed load", "model", runner.model) slog.Debug("triggering expiration for failed load", "model", runner.modelPath)
s.expiredCh <- runner s.expiredCh <- runner
return return
} }
@@ -408,17 +413,18 @@ type runnerRef struct {
refCount uint // prevent unloading if > 0 refCount uint // prevent unloading if > 0
// unloading bool // set to true when we are trying to unload the runner // unloading bool // set to true when we are trying to unload the runner
llama llm.LlamaServer llama llm.LlamaServer
loading bool // True only during initial load, then false forever loading bool // True only during initial load, then false forever
gpus gpu.GpuInfoList // Recorded at time of provisioning gpus gpu.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64 estimatedVRAM uint64
estimatedTotal uint64
sessionDuration time.Duration sessionDuration time.Duration
expireTimer *time.Timer expireTimer *time.Timer
expiresAt time.Time
model string model *Model
adapters []string modelPath string
projectors []string
*api.Options *api.Options
} }
@@ -431,9 +437,8 @@ func (runner *runnerRef) unload() {
if runner.llama != nil { if runner.llama != nil {
runner.llama.Close() runner.llama.Close()
} }
runner.model = nil
runner.llama = nil runner.llama = nil
runner.adapters = nil
runner.projectors = nil
runner.Options = nil runner.Options = nil
runner.gpus = nil runner.gpus = nil
} }
@@ -462,8 +467,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
ctx, cancel := context.WithTimeout(ctx, timeout) ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel() defer cancel()
if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed? if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed? !reflect.DeepEqual(runner.model.ProjectorPaths, req.model.ProjectorPaths) || // have the projectors changed?
!reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed? !reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed?
runner.llama.Ping(ctx) != nil { runner.llama.Ping(ctx) != nil {
return true return true
@@ -483,8 +488,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
func (runner *runnerRef) waitForVRAMRecovery() chan interface{} { func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
finished := make(chan interface{}, 1) finished := make(chan interface{}, 1)
// CPU or Metal don't need checking, so no waiting required // CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") { if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
finished <- struct{}{} finished <- struct{}{}
return finished return finished
} }
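`waitForVRAMRecovery` is now skipped on Windows as well as for CPU and Metal runners. A condensed sketch of the guard (`gpuInfo` below is a stand-in for `gpu.GpuInfo`):

```go
package main

import (
	"fmt"
	"runtime"
)

type gpuInfo struct{ Library string } // stand-in for gpu.GpuInfo

// skipVRAMWait mirrors the new condition: CPU and Metal report no dedicated
// VRAM, and Windows can page VRAM while its free-memory APIs stay optimistic,
// so polling for recovery there is not useful.
func skipVRAMWait(gpus []gpuInfo) bool {
	return (len(gpus) == 1 && (gpus[0].Library == "cpu" || gpus[0].Library == "metal")) ||
		runtime.GOOS == "windows"
}

func main() {
	fmt.Println(skipVRAMWait([]gpuInfo{{Library: "metal"}})) // true
}
```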

View File

@@ -164,7 +164,8 @@ func TestRequests(t *testing.T) {
// simple reload of same model // simple reload of same model
scenario2a := newScenario(t, ctx, "ollama-model-1", 20) scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
scenario2a.req.model = scenario1a.req.model tmpModel := *scenario1a.req.model
scenario2a.req.model = &tmpModel
scenario2a.ggml = scenario1a.ggml scenario2a.ggml = scenario1a.ggml
// Multiple loaded models // Multiple loaded models
@@ -496,10 +497,9 @@ func TestNeedsReload(t *testing.T) {
llm := &mockLlm{} llm := &mockLlm{}
do := api.DefaultOptions() do := api.DefaultOptions()
runner := &runnerRef{ runner := &runnerRef{
adapters: []string{"adapter1"}, model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
projectors: []string{"projector1"}, Options: &do,
Options: &do, llama: llm,
llama: llm,
} }
req := &LlmRequest{ req := &LlmRequest{
model: &Model{ model: &Model{
@@ -510,10 +510,10 @@ func TestNeedsReload(t *testing.T) {
} }
resp := runner.needsReload(ctx, req) resp := runner.needsReload(ctx, req)
require.True(t, resp) require.True(t, resp)
req.model.AdapterPaths = runner.adapters req.model.AdapterPaths = runner.model.AdapterPaths
resp = runner.needsReload(ctx, req) resp = runner.needsReload(ctx, req)
require.True(t, resp) require.True(t, resp)
req.model.ProjectorPaths = runner.projectors req.model.ProjectorPaths = runner.model.ProjectorPaths
runner.loading = true runner.loading = true
req.opts.NumBatch = 1234 req.opts.NumBatch = 1234
resp = runner.needsReload(ctx, req) resp = runner.needsReload(ctx, req)
@@ -558,11 +558,11 @@ func TestUnloadAllRunners(t *testing.T) {
func TestUnload(t *testing.T) { func TestUnload(t *testing.T) {
llm1 := &mockLlm{} llm1 := &mockLlm{}
r1 := &runnerRef{llama: llm1} r1 := &runnerRef{llama: llm1}
r2 := &runnerRef{adapters: []string{"A"}} r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
r1.unload() r1.unload()
require.True(t, llm1.closeCalled) require.True(t, llm1.closeCalled)
r2.unload() r2.unload()
require.Nil(t, r2.adapters) require.Nil(t, r2.model)
} }
type mockLlm struct { type mockLlm struct {
@@ -578,6 +578,7 @@ type mockLlm struct {
closeResp error closeResp error
closeCalled bool closeCalled bool
estimatedVRAM uint64 estimatedVRAM uint64
estimatedTotal uint64
} }
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp } func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -598,4 +599,5 @@ func (s *mockLlm) Close() error {
s.closeCalled = true s.closeCalled = true
return s.closeResp return s.closeResp
} }
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }

View File

@@ -7,6 +7,7 @@ import (
) )
const UnknownOllamaKeyErrMsg = "unknown ollama key" const UnknownOllamaKeyErrMsg = "unknown ollama key"
const InvalidModelNameErrMsg = "invalid model name"
// TODO: This should have a structured response from the API // TODO: This should have a structured response from the API
type UnknownOllamaKey struct { type UnknownOllamaKey struct {