From a210ec74d29ee718bca9b3c192e0a93cf86cbf21 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Mon, 25 Nov 2024 09:40:16 -0800 Subject: [PATCH 01/12] cmd: print location of model after pushing (#7695) After a user pushes their model it is not clear what to do next. Add a link to the output of `ollama push` that tells the user where their model can now be found. --- cmd/cmd.go | 12 +++++ cmd/cmd_test.go | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fad06ffd..01eb66f9 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -39,6 +39,7 @@ import ( "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/server" + "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" ) @@ -558,6 +559,8 @@ func PushHandler(cmd *cobra.Command, args []string) error { } request := api.PushRequest{Name: args[0], Insecure: insecure} + + n := model.ParseName(args[0]) if err := client.Push(cmd.Context(), &request, fn); err != nil { if spinner != nil { spinner.Stop() @@ -568,7 +571,16 @@ func PushHandler(cmd *cobra.Command, args []string) error { return err } + p.Stop() spinner.Stop() + + destination := n.String() + if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") { + destination = "https://ollama.com/" + strings.TrimSuffix(n.DisplayShortest(), ":latest") + } + fmt.Printf("\nYou can find your model at:\n\n") + fmt.Printf("\t%s\n", destination) + return nil } diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go index fd8289cf..2e6428cf 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "io" "net/http" "net/http/httptest" "os" @@ -369,3 +370,127 @@ func TestGetModelfileName(t *testing.T) { }) } } + +func TestPushHandler(t *testing.T) { + tests := []struct { + name string + modelName string + serverResponse map[string]func(w http.ResponseWriter, r *http.Request) + expectedError string + expectedOutput string + }{ + { + name: "successful push", + modelName: "test-model", + serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){ + "/api/push": func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST request, got %s", r.Method) + } + + var req api.PushRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + if req.Name != "test-model" { + t.Errorf("expected model name 'test-model', got %s", req.Name) + } + + // Simulate progress updates + responses := []api.ProgressResponse{ + {Status: "preparing manifest"}, + {Digest: "sha256:abc123456789", Total: 100, Completed: 50}, + {Digest: "sha256:abc123456789", Total: 100, Completed: 100}, + } + + for _, resp := range responses { + if err := json.NewEncoder(w).Encode(resp); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + w.(http.Flusher).Flush() + } + }, + }, + expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n", + }, + { + name: "unauthorized push", + modelName: "unauthorized-model", + serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){ + "/api/push": func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusUnauthorized) + err := json.NewEncoder(w).Encode(map[string]string{ + "error": "access denied", + }) + if err != nil { + t.Fatal(err) + } + 
}, + }, + expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if handler, ok := tt.serverResponse[r.URL.Path]; ok { + handler(w, r) + return + } + http.Error(w, "not found", http.StatusNotFound) + })) + defer mockServer.Close() + + t.Setenv("OLLAMA_HOST", mockServer.URL) + + cmd := &cobra.Command{} + cmd.Flags().Bool("insecure", false, "") + cmd.SetContext(context.TODO()) + + // Redirect stderr to capture progress output + oldStderr := os.Stderr + r, w, _ := os.Pipe() + os.Stderr = w + + // Capture stdout for the "Model pushed" message + oldStdout := os.Stdout + outR, outW, _ := os.Pipe() + os.Stdout = outW + + err := PushHandler(cmd, []string{tt.modelName}) + + // Restore stderr + w.Close() + os.Stderr = oldStderr + // drain the pipe + if _, err := io.ReadAll(r); err != nil { + t.Fatal(err) + } + + // Restore stdout and get output + outW.Close() + os.Stdout = oldStdout + stdout, _ := io.ReadAll(outR) + + if tt.expectedError == "" { + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if tt.expectedOutput != "" { + if got := string(stdout); got != tt.expectedOutput { + t.Errorf("expected output %q, got %q", tt.expectedOutput, got) + } + } + } else { + if err == nil || !strings.Contains(err.Error(), tt.expectedError) { + t.Errorf("expected error containing %q, got %v", tt.expectedError, err) + } + } + }) + } +} From 647513a7d48920f897f536fe9df45c6ca38fe83e Mon Sep 17 00:00:00 2001 From: Shikhar Bakhda Date: Mon, 25 Nov 2024 09:55:33 -0800 Subject: [PATCH 02/12] readme: add HoneyHive to community integrations (#7831) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 186fedb6..1e1d02bc 100644 --- a/README.md +++ b/README.md @@ -518,3 +518,4 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Observability - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics. +- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production. From 2b7ed61ca22743598db2b407a94b8865042f1078 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Mon, 25 Nov 2024 15:08:34 -0800 Subject: [PATCH 03/12] server: fix Transport override (#7834) This changes makeRequest to update the http client Transport if and only if testMakeRequestDialContext is set. This is to avoid overriding the default Transport when testMakeRequestDialContext is nil, which broke existing behavior, included proxies, timeouts, and other behaviors. 
Fixes #7829 Fixes #7788 --- server/images.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/server/images.go b/server/images.go index 1f6a9712..29877db3 100644 --- a/server/images.go +++ b/server/images.go @@ -1076,17 +1076,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header req.ContentLength = contentLength } - resp, err := (&http.Client{ - Transport: &http.Transport{ - DialContext: testMakeRequestDialContext, - }, + c := &http.Client{ CheckRedirect: regOpts.CheckRedirect, - }).Do(req) - if err != nil { - return nil, err } - - return resp, nil + if testMakeRequestDialContext != nil { + tr := http.DefaultTransport.(*http.Transport).Clone() + tr.DialContext = testMakeRequestDialContext + c.Transport = tr + } + return c.Do(req) } func getValue(header, key string) string { From 30e88d7f31cd3af582346b995a8bb10b3ff37125 Mon Sep 17 00:00:00 2001 From: frob Date: Tue, 26 Nov 2024 01:43:29 +0100 Subject: [PATCH 04/12] cmd: don't submit svg files as images for now (#7830) --- cmd/interactive.go | 2 +- cmd/interactive_test.go | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index b495a109..9035b4c5 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -514,7 +514,7 @@ func extractFileNames(input string) []string { // Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20) // and followed by more characters and a file extension // This will capture non filename strings, but we'll check for file existence to remove mismatches - regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b` + regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b` re := regexp.MustCompile(regexPattern) return re.FindAllString(input, -1) diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index bb7e0aba..118f4264 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -12,44 +12,45 @@ import ( func TestExtractFilenames(t *testing.T) { // Unix style paths input := ` some preamble - ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 -/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg` + ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg +/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG` res := extractFileNames(input) assert.Len(t, res, 5) assert.Contains(t, res[0], "one.png") assert.Contains(t, res[1], "two.jpg") assert.Contains(t, res[2], "three.jpeg") assert.Contains(t, res[3], "four.png") - assert.Contains(t, res[4], "five.svg") + assert.Contains(t, res[4], "five.JPG") assert.NotContains(t, res[4], '"') - assert.NotContains(t, res, "inbtween") + assert.NotContains(t, res, "inbetween1") + assert.NotContains(t, res, "./1.svg") // Windows style paths input = ` some preamble c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4 -./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6 -d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 - d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending +./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6 +d:\path with\spaces\seven.JPEG 
inbetween7 c:\users\jdoe\eight.png inbetween8 + d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending ` res = extractFileNames(input) assert.Len(t, res, 10) - assert.NotContains(t, res, "inbtween") + assert.NotContains(t, res, "inbetween2") assert.Contains(t, res[0], "one.png") assert.Contains(t, res[0], "c:") assert.Contains(t, res[1], "two.jpg") assert.Contains(t, res[1], "c:") assert.Contains(t, res[2], "three.jpeg") assert.Contains(t, res[3], "four.png") - assert.Contains(t, res[4], "five.svg") + assert.Contains(t, res[4], "five.JPG") assert.Contains(t, res[5], "six.png") - assert.Contains(t, res[6], "seven.svg") + assert.Contains(t, res[6], "seven.JPEG") assert.Contains(t, res[6], "d:") assert.Contains(t, res[7], "eight.png") assert.Contains(t, res[7], "c:") assert.Contains(t, res[8], "nine.png") assert.Contains(t, res[8], "d:") - assert.Contains(t, res[9], "ten.svg") + assert.Contains(t, res[9], "ten.PNG") assert.Contains(t, res[9], "E:") } From 52bbad12f96e84f7d62c5dfdd7dbba2b10b37344 Mon Sep 17 00:00:00 2001 From: jake83741 <125723241+jake83741@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:56:30 -0500 Subject: [PATCH 05/12] readme: update description for vnc-lm community integration (#7832) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1e1d02bc..52f6fa55 100644 --- a/README.md +++ b/README.md @@ -504,7 +504,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.) - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama) - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) -- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) +- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator) From 2cd11ae365a9423578069457312dce6b9e1e5a37 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 25 Nov 2024 14:49:38 -0800 Subject: [PATCH 06/12] runner.go: Add unit tests for context shifting This also makes it easier to truncate long inputs the same way that shifting does, although truncation itself is not implemented here. This type of truncation has a trade-off between quality and time to first token.
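To sanity-check the expected values in the new test table, the discard amount can be computed by hand. The following standalone sketch mirrors the ShiftDiscard logic added in this patch (the free-function form is illustrative, and the builtin max requires Go 1.21 or later):

```go
package main

import "fmt"

// shiftDiscard mirrors InputCache.ShiftDiscard: keep numKeep inputs, then
// discard enough of the rest to leave about half of the context free.
func shiftDiscard(numCtx, inputLen, numKeep int) int {
	targetFree := max((numCtx-numKeep)/2, 1)
	currentFree := numCtx - inputLen
	return max(targetFree-currentFree, 0)
}

func main() {
	fmt.Println(shiftDiscard(2048, 2048, 5)) // 1021, matching the "Shift" case
	fmt.Println(shiftDiscard(2048, 5000, 5)) // 3973, matching the "Truncate" case
}
```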
--- llama/runner/cache.go | 20 +++++++++--- llama/runner/cache_test.go | 63 ++++++++++++++++++++++++++++++++++++++ llama/runner/runner.go | 6 ++-- 3 files changed, 82 insertions(+), 7 deletions(-) diff --git a/llama/runner/cache.go b/llama/runner/cache.go index b487fe25..0f5f0a09 100644 --- a/llama/runner/cache.go +++ b/llama/runner/cache.go @@ -199,6 +199,20 @@ func countCommonPrefix(a []input, b []input) int { return count } +func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int { + targetFree := (c.numCtx - numKeep) / 2 + targetFree = max(targetFree, 1) + + currentFree := c.numCtx - inputLen + discard := targetFree - currentFree + + if discard < 0 { + discard = 0 + } + + return discard +} + // Frees up space in the KV cache by deleting the oldest half of history and shifting // the newest half into that space (saving numKeep inputs at the beginning). // @@ -208,11 +222,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error { return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx) } - targetFree := (c.numCtx - numKeep) / 2 - targetFree = max(targetFree, 1) - - currentFree := c.numCtx - len(slot.Inputs) - discard := targetFree - currentFree + discard := c.ShiftDiscard(len(slot.Inputs), numKeep) if discard <= 0 { return nil diff --git a/llama/runner/cache_test.go b/llama/runner/cache_test.go index 0e38c67d..79cd93cb 100644 --- a/llama/runner/cache_test.go +++ b/llama/runner/cache_test.go @@ -227,3 +227,66 @@ func TestFindCacheSlot(t *testing.T) { }) } } + +func TestShiftDiscard(t *testing.T) { + tests := []struct { + name string + numCtx int + numKeep int + inputLen int + expected int + }{ + { + name: "Shift", + numCtx: 2048, + numKeep: 5, + inputLen: 2048, + expected: 1021, + }, + { + name: "Max Keep", + numCtx: 2048, + numKeep: 2047, + inputLen: 2048, + expected: 1, + }, + { + name: "No Keep", + numCtx: 2048, + numKeep: 0, + inputLen: 2048, + expected: 1024, + }, + { + name: "Truncate", + numCtx: 2048, + numKeep: 5, + inputLen: 5000, + expected: 3973, + }, + { + name: "Truncate Keep", + numCtx: 2048, + numKeep: 2047, + inputLen: 5000, + expected: 2953, + }, + { + name: "No Op", + numCtx: 2048, + numKeep: 5, + inputLen: 512, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := InputCache{numCtx: tt.numCtx} + result := c.ShiftDiscard(tt.inputLen, tt.numKeep) + if result != tt.expected { + t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected) + } + }) + } +} diff --git a/llama/runner/runner.go b/llama/runner/runner.go index db8092f3..8762b3da 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -122,9 +122,11 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen params.numKeep = min(params.numKeep, s.cache.numCtx-1) if len(inputs) > s.cache.numCtx { - slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep) + discard := len(inputs) - s.cache.numCtx newInputs := inputs[:params.numKeep] - newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...) + newInputs = append(newInputs, inputs[params.numKeep+discard:]...) 
+ + slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs)) inputs = newInputs } From 71e6a0d0d181e3be45f3e47a677d088479d73c76 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Nov 2024 15:08:24 -0800 Subject: [PATCH 07/12] runner.go: Don't try to extract image tags for text models When processing a prompt, we look for image tags of the form [img-0], which are inserted by the Ollama server process. However, this can cause errors if the original prompt already contains these tags - typically an "image not found" error is returned. This changes the tag searching behavior to be similar to the 0.3.x series, which largely avoids these problems. However, they can still happen when input text with these tags is used with image models. The correct solution is to escape the tags, but that is a larger issue with special sequences in general, so this is an incremental fix that should avoid the problem for the majority of cases. --- llama/runner/runner.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 8762b3da..0255ed55 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -164,10 +164,16 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen // generating image embeddings for each image func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) { var inputs []input + var parts []string + var matches [][]string - re := regexp.MustCompile(`\[img-(\d+)\]`) - parts := re.Split(prompt, -1) - matches := re.FindAllStringSubmatch(prompt, -1) + if s.image != nil { + re := regexp.MustCompile(`\[img-(\d+)\]`) + parts = re.Split(prompt, -1) + matches = re.FindAllStringSubmatch(prompt, -1) + } else { + parts = []string{prompt} + } for i, part := range parts { // text - tokenize From 940e62772e68c99cd4cb0b037acf5c16c23e0854 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 26 Nov 2024 16:08:09 -0800 Subject: [PATCH 08/12] openai: remove unused error code (#7850) The writeError function takes a code argument that is no longer used. Remove it for clarity.
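All of the Write methods in the diff below reduce to the same shape after this change; a minimal sketch (ChatWriter shown, the other writers are identical):

```go
func (w *ChatWriter) Write(data []byte) (int, error) {
	if w.ResponseWriter.Status() != http.StatusOK {
		// The status code is no longer threaded through: writeError never
		// used it and works from the encoded error payload alone.
		return w.writeError(data)
	}
	return w.writeResponse(data)
}
```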
--- openai/openai.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 2bf9b9f9..10e5b09e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -571,7 +571,7 @@ type EmbedWriter struct { model string } -func (w *BaseWriter) writeError(code int, data []byte) (int, error) { +func (w *BaseWriter) writeError(data []byte) (int, error) { var serr api.StatusError err := json.Unmarshal(data, &serr) if err != nil { @@ -630,7 +630,7 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) { func (w *ChatWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -679,7 +679,7 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) { func (w *CompleteWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -704,7 +704,7 @@ func (w *ListWriter) writeResponse(data []byte) (int, error) { func (w *ListWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -730,7 +730,7 @@ func (w *RetrieveWriter) writeResponse(data []byte) (int, error) { func (w *RetrieveWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -755,7 +755,7 @@ func (w *EmbedWriter) writeResponse(data []byte) (int, error) { func (w *EmbedWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) From e3936d4fb37cc0cd3a7cd9ffb58f357c5f417fff Mon Sep 17 00:00:00 2001 From: ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com> Date: Thu, 28 Nov 2024 00:30:04 +0530 Subject: [PATCH 09/12] Support Multiple LoRa Adapters (#7667) Closes #7627 --- llama/runner/runner.go | 31 +++++++++++++++++++++++-------- llm/server.go | 9 +++------ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 0255ed55..9b1534e4 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { } } +type multiLPath []string + +func (m *multiLPath) Set(value string) error { + *m = append(*m, value) + return nil +} + +func (m *multiLPath) String() string { + return strings.Join(*m, ", ") +} + func (s *Server) loadModel( params llama.ModelParams, mpath string, - lpath string, + lpath multiLPath, ppath string, kvSize int, flashAttention bool, @@ -857,10 +868,12 @@ func (s *Server) loadModel( panic(err) } - if lpath != "" { - err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads) - if err != nil { - panic(err) + if lpath.String() != "" { + for _, path := range lpath { + err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads) + if err != nil { + panic(err) + } } } @@ -890,7 +903,6 @@ func main() { mainGpu := flag.Int("main-gpu", 0, "Main GPU") flashAttention := flag.Bool("flash-attn", false, "Enable flash attention") kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size") - lpath := flag.String("lora", "", "Path to lora 
layer file") port := flag.Int("port", 8080, "Port to expose the server on") threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") verbose := flag.Bool("verbose", false, "verbose output (default: disabled)") @@ -900,6 +912,9 @@ func main() { multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") requirements := flag.Bool("requirements", false, "print json requirement information") + var lpaths multiLPath + flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)") + flag.Parse() if *requirements { printRequirements(os.Stdout) @@ -946,7 +961,7 @@ func main() { params := llama.ModelParams{ NumGpuLayers: *nGpuLayers, MainGpu: *mainGpu, - UseMmap: !*noMmap && *lpath == "", + UseMmap: !*noMmap && lpaths.String() == "", UseMlock: *mlock, TensorSplit: tensorSplitFloats, Progress: func(progress float32) { @@ -955,7 +970,7 @@ func main() { } server.ready.Add(1) - go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache) + go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache) server.cond = sync.NewCond(&server.mu) diff --git a/llm/server.go b/llm/server.go index b2405905..2afc5562 100644 --- a/llm/server.go +++ b/llm/server.go @@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter // Loop through potential servers finalErr := errors.New("no suitable llama servers found") - if len(adapters) > 1 { - return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") - } - rDir, err := runners.Refresh(build.EmbedFS) if err != nil { return nil, err @@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter } if len(adapters) > 0 { - // TODO: applying multiple adapters is not supported by the llama.cpp server yet - params = append(params, "--lora", adapters[0]) + for _, adapter := range adapters { + params = append(params, "--lora", adapter) + } } if len(projectors) > 0 { From ce7455a8e1045ae12c5eaa9dc5bb5bdc84a098dc Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Wed, 27 Nov 2024 13:40:57 -0800 Subject: [PATCH 10/12] api: enable tool streaming (#7836) --- openai/openai.go | 13 +- server/model_test.go | 1 + server/routes.go | 32 ++++- server/routes_generate_test.go | 256 +++++++++++++++++++++++++++++++-- 4 files changed, 289 insertions(+), 13 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 10e5b09e..6b469da7 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -200,9 +200,9 @@ func toolCallId() string { return "call_" + strings.ToLower(string(b)) } -func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { - toolCalls := make([]ToolCall, len(r.Message.ToolCalls)) - for i, tc := range r.Message.ToolCalls { +func toToolCalls(tc []api.ToolCall) []ToolCall { + toolCalls := make([]ToolCall, len(tc)) + for i, tc := range tc { toolCalls[i].ID = toolCallId() toolCalls[i].Type = "function" toolCalls[i].Function.Name = tc.Function.Name @@ -215,7 +215,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { toolCalls[i].Function.Arguments = string(args) } + return toolCalls +} +func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { + toolCalls := toToolCalls(r.Message.ToolCalls) return ChatCompletion{ Id: id, Object: "chat.completion", @@ -244,6 +248,7 @@ func toChatCompletion(id string, r api.ChatResponse) 
ChatCompletion { } func toChunk(id string, r api.ChatResponse) ChatCompletionChunk { + toolCalls := toToolCalls(r.Message.ToolCalls) return ChatCompletionChunk{ Id: id, Object: "chat.completion.chunk", @@ -252,7 +257,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk { SystemFingerprint: "fp_ollama", Choices: []ChunkChoice{{ Index: 0, - Delta: Message{Role: "assistant", Content: r.Message.Content}, + Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls}, FinishReason: func(reason string) *string { if len(reason) > 0 { return &reason diff --git a/server/model_test.go b/server/model_test.go index 304d4655..47c4728e 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -39,6 +39,7 @@ func TestExecuteWithTools(t *testing.T) { {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true}, + {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false}, {"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, diff --git a/server/routes.go b/server/routes.go index c13cd023..d9e4fb66 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1458,6 +1458,7 @@ func (s *Server) ChatHandler(c *gin.Context) { prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools) if err != nil { + slog.Error("chat prompt error", "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -1467,6 +1468,8 @@ func (s *Server) ChatHandler(c *gin.Context) { ch := make(chan any) go func() { defer close(ch) + var sb strings.Builder + var hasToolCalls bool if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, @@ -1492,7 +1495,34 @@ func (s *Server) ChatHandler(c *gin.Context) { res.LoadDuration = checkpointLoaded.Sub(checkpointStart) } - ch <- res + // TODO: tool call checking and filtering should be moved outside of this callback once streaming + // however this was a simple change for now without reworking streaming logic of this (and other) + // handlers + if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 { + ch <- res + return + } + + // Streaming tool calls: + // If tools are recognized, use a flag to track the sending of a tool downstream + // This ensures that content is cleared from the message on the last chunk sent + sb.WriteString(r.Content) + if toolCalls, ok := m.parseToolCalls(sb.String()); ok { + res.Message.ToolCalls = toolCalls + res.Message.Content = "" + sb.Reset() + hasToolCalls = true + ch <- res + return + } + + if r.Done { + // Send any remaining content if no tool calls were detected + if !hasToolCalls { + res.Message.Content = sb.String() + } + ch <- res + } }); err != nil { ch <- gin.H{"error": err.Error()} } diff --git 
a/server/routes_generate_test.go b/server/routes_generate_test.go index 53501cc6..4bde55bb 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "strings" + "sync" "testing" "time" @@ -25,10 +26,14 @@ type mockRunner struct { // CompletionRequest is only valid until the next call to Completion llm.CompletionRequest llm.CompletionResponse + CompletionFn func(context.Context, llm.CompletionRequest, func(llm.CompletionResponse)) error } -func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { +func (m *mockRunner) Completion(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { m.CompletionRequest = r + if m.CompletionFn != nil { + return m.CompletionFn(ctx, r, fn) + } fn(m.CompletionResponse) return nil } @@ -88,9 +93,14 @@ func TestGenerateChat(t *testing.T) { Model: "test", Modelfile: fmt.Sprintf(`FROM %s TEMPLATE """ -{{- if .System }}System: {{ .System }} {{ end }} -{{- if .Prompt }}User: {{ .Prompt }} {{ end }} -{{- if .Response }}Assistant: {{ .Response }} {{ end }}""" +{{- if .Tools }} +{{ .Tools }} +{{ end }} +{{- range .Messages }} +{{- .Role }}: {{ .Content }} +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{- end }} +{{ end }}""" `, createBinFile(t, llm.KV{ "general.architecture": "llama", "llama.block_count": uint32(1), @@ -263,7 +273,7 @@ func TestGenerateChat(t *testing.T) { t.Errorf("expected status 200, got %d", w.Code) } - if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" { + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "user: Hello!\n"); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } @@ -292,7 +302,7 @@ func TestGenerateChat(t *testing.T) { t.Errorf("expected status 200, got %d", w.Code) } - if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" { + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\n"); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } @@ -314,7 +324,7 @@ func TestGenerateChat(t *testing.T) { t.Errorf("expected status 200, got %d", w.Code) } - if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" { + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You can perform magic tricks.\nuser: Hello!\n"); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } @@ -337,12 +347,242 @@ func TestGenerateChat(t *testing.T) { t.Errorf("expected status 200, got %d", w.Code) } - if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. 
"); diff != "" { + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\nassistant: I can help you with that.\nsystem: You can perform magic tricks.\nuser: Help me write tests.\n"); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } checkChatResponse(t, w.Body, "test-system", "Abra kadabra!") }) + + t.Run("messages with tools (non-streaming)", func(t *testing.T) { + if w.Code != http.StatusOK { + t.Fatalf("failed to create test-system model: %d", w.Code) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather", + Parameters: struct { + Type string `json:"type"` + Required []string `json:"required"` + Properties map[string]struct { + Type string `json:"type"` + Description string `json:"description"` + Enum []string `json:"enum,omitempty"` + } `json:"properties"` + }{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]struct { + Type string `json:"type"` + Description string `json:"description"` + Enum []string `json:"enum,omitempty"` + }{ + "location": { + Type: "string", + Description: "The city and state", + }, + "unit": { + Type: "string", + Enum: []string{"celsius", "fahrenheit"}, + }, + }, + }, + }, + }, + } + + mock.CompletionResponse = llm.CompletionResponse{ + Content: `{"name":"get_weather","arguments":{"location":"Seattle, WA","unit":"celsius"}}`, + Done: true, + DoneReason: "done", + PromptEvalCount: 1, + PromptEvalDuration: 1, + EvalCount: 1, + EvalDuration: 1, + } + + streamRequest := true + + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-system", + Messages: []api.Message{ + {Role: "user", Content: "What's the weather in Seattle?"}, + }, + Tools: tools, + Stream: &streamRequest, + }) + + if w.Code != http.StatusOK { + var errResp struct { + Error string `json:"error"` + } + if err := json.NewDecoder(w.Body).Decode(&errResp); err != nil { + t.Logf("Failed to decode error response: %v", err) + } else { + t.Logf("Error response: %s", errResp.Error) + } + } + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + var resp api.ChatResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatal(err) + } + + if resp.Message.ToolCalls == nil { + t.Error("expected tool calls, got nil") + } + + expectedToolCall := api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get_weather", + Arguments: api.ToolCallFunctionArguments{ + "location": "Seattle, WA", + "unit": "celsius", + }, + }, + } + + if diff := cmp.Diff(resp.Message.ToolCalls[0], expectedToolCall); diff != "" { + t.Errorf("tool call mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("messages with tools (streaming)", func(t *testing.T) { + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather", + Parameters: struct { + Type string `json:"type"` + Required []string `json:"required"` + Properties map[string]struct { + Type string `json:"type"` + Description string `json:"description"` + Enum []string `json:"enum,omitempty"` + } `json:"properties"` + }{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]struct { + Type string `json:"type"` + Description string `json:"description"` + Enum []string `json:"enum,omitempty"` + }{ + "location": { + Type: "string", + Description: "The city and state", + }, + "unit": { + Type: "string", + Enum: []string{"celsius", "fahrenheit"}, 
+ }, + }, + }, + }, + }, + } + + // Simulate streaming response with multiple chunks + var wg sync.WaitGroup + wg.Add(1) + + mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { + defer wg.Done() + + // Send chunks with small delays to simulate streaming + responses := []llm.CompletionResponse{ + { + Content: `{"name":"get_`, + Done: false, + PromptEvalCount: 1, + PromptEvalDuration: 1, + }, + { + Content: `weather","arguments":{"location":"Seattle`, + Done: false, + PromptEvalCount: 2, + PromptEvalDuration: 1, + }, + { + Content: `, WA","unit":"celsius"}}`, + Done: true, + DoneReason: "tool_call", + PromptEvalCount: 3, + PromptEvalDuration: 1, + }, + } + + for _, resp := range responses { + select { + case <-ctx.Done(): + return ctx.Err() + default: + fn(resp) + time.Sleep(10 * time.Millisecond) // Small delay between chunks + } + } + return nil + } + + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-system", + Messages: []api.Message{ + {Role: "user", Content: "What's the weather in Seattle?"}, + }, + Tools: tools, + Stream: &stream, + }) + + wg.Wait() + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + // Read and validate the streamed responses + decoder := json.NewDecoder(w.Body) + var finalToolCall api.ToolCall + + for { + var resp api.ChatResponse + if err := decoder.Decode(&resp); err == io.EOF { + break + } else if err != nil { + t.Fatal(err) + } + + if resp.Done { + if len(resp.Message.ToolCalls) != 1 { + t.Errorf("expected 1 tool call in final response, got %d", len(resp.Message.ToolCalls)) + } + finalToolCall = resp.Message.ToolCalls[0] + } + } + + expectedToolCall := api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get_weather", + Arguments: api.ToolCallFunctionArguments{ + "location": "Seattle, WA", + "unit": "celsius", + }, + }, + } + + if diff := cmp.Diff(finalToolCall, expectedToolCall); diff != "" { + t.Errorf("final tool call mismatch (-got +want):\n%s", diff) + } + }) } func TestGenerate(t *testing.T) { From 30a9f063c94ec171b62e0b12948e1efe6c112e9f Mon Sep 17 00:00:00 2001 From: TheCookingSenpai <153772003+tcsenpai@users.noreply.github.com> Date: Fri, 29 Nov 2024 00:16:27 +0100 Subject: [PATCH 11/12] readme: add SpaceLlama, YouLama, and DualMind to community integrations (#7216) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 52f6fa55..41ad8d8c 100644 --- a/README.md +++ b/README.md @@ -346,6 +346,9 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page) - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.) 
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) +- [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar) +- [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well) +- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface) - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol) - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) From 39e29ae5ddb9ff710c0e28652b61850f458e1205 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 28 Nov 2024 17:27:11 -0800 Subject: [PATCH 12/12] llama: fix typo and formatting in readme (#7876) --- llama/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llama/README.md b/llama/README.md index 79bf4fde..3b6b2067 100644 --- a/llama/README.md +++ b/llama/README.md @@ -93,7 +93,7 @@ make -j ## Vendoring -Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes. +Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes. If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory. ``` make apply-patches ``` **Pin to new base commit** -To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env` +To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring` #### Applying patches When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution. -Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure. +Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure. ``` make apply-patches ``` -If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated. ``` make create-patches sync ``` -Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo. +Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo. ### Generating Patches -When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied: +When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied: ``` make apply-patches ``` -Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama: +Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama: ``` make sync @@ -142,9 +142,9 @@ go build . ``` > [!IMPORTANT] -> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s). +> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s). -Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with +Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with ``` make create-patches @@ -157,4 +157,4 @@ In your `./vendor/` directory, create a branch, and cherry-pick the new commit t Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches. -After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already. \ No newline at end of file +After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.