diff --git a/README.md b/README.md index b5813d20..f42731bf 100644 --- a/README.md +++ b/README.md @@ -53,10 +53,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla ## Quickstart -To run and chat with [Llama 3](https://ollama.com/library/llama3): +To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1): ``` -ollama run llama3 +ollama run llama3.1 ``` ## Model library @@ -67,8 +67,9 @@ Here are some example models that can be downloaded: | Model | Parameters | Size | Download | | ------------------ | ---------- | ----- | ------------------------------ | -| Llama 3 | 8B | 4.7GB | `ollama run llama3` | -| Llama 3 | 70B | 40GB | `ollama run llama3:70b` | +| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | +| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` | +| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | @@ -115,16 +116,16 @@ See the [guide](docs/import.md) on importing models for more information. ### Customize a prompt -Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model: +Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model: ``` -ollama pull llama3 +ollama pull llama3.1 ``` Create a `Modelfile`: ``` -FROM llama3 +FROM llama3.1 # set the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 @@ -159,7 +160,7 @@ ollama create mymodel -f ./Modelfile ### Pull a model ``` -ollama pull llama3 +ollama pull llama3.1 ``` > This command can also be used to update a local model. Only the diff will be pulled. @@ -167,13 +168,13 @@ ollama pull llama3 ### Remove a model ``` -ollama rm llama3 +ollama rm llama3.1 ``` ### Copy a model ``` -ollama cp llama3 my-model +ollama cp llama3.1 my-model ``` ### Multiline input @@ -197,14 +198,14 @@ The image features a yellow smiley face, which is likely the central focus of th ### Pass the prompt as an argument ``` -$ ollama run llama3 "Summarize this file: $(cat README.md)" +$ ollama run llama3.1 "Summarize this file: $(cat README.md)" Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. ``` ### Show model information ``` -ollama show llama3 +ollama show llama3.1 ``` ### List models on your computer @@ -232,7 +233,7 @@ Next, start the server: Finally, in a separate shell, run a model: ``` -./ollama run llama3 +./ollama run llama3.1 ``` ## REST API @@ -243,7 +244,7 @@ Ollama has a REST API for running and managing models. ``` curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt":"Why is the sky blue?" }' ``` @@ -252,7 +253,7 @@ curl http://localhost:11434/api/generate -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", "content": "why is the sky blue?" } ] @@ -407,7 +408,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama) - [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot) - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) -- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace) +- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) diff --git a/api/types.go b/api/types.go index 65a99c76..ea5161ff 100644 --- a/api/types.go +++ b/api/types.go @@ -114,6 +114,11 @@ func (t Tools) String() string { return string(bts) } +func (t Tool) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + // Message is a single message in a chat sequence. The message contains the // role ("system", "user", or "assistant"), the content and an optional list // of images. @@ -209,6 +214,7 @@ type Options struct { NumPredict int `json:"num_predict,omitempty"` TopK int `json:"top_k,omitempty"` TopP float32 `json:"top_p,omitempty"` + MinP float32 `json:"min_p,omitempty"` TFSZ float32 `json:"tfs_z,omitempty"` TypicalP float32 `json:"typical_p,omitempty"` RepeatLastN int `json:"repeat_last_n,omitempty"` diff --git a/app/ollama.iss b/app/ollama.iss index 6bedb9ff..dc6178f7 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi ;FinishedHeadingLabel=Run your first model -;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3 +;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1 ;ClickFinish=%n [Registry] diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1 index 9af37a46..46777a3a 100644 --- a/app/ollama_welcome.ps1 +++ b/app/ollama_welcome.ps1 @@ -4,5 +4,5 @@ write-host "Welcome to Ollama!" 
write-host "" write-host "Run your first model:" write-host "" -write-host "`tollama run llama3" +write-host "`tollama run llama3.1" write-host "" \ No newline at end of file diff --git a/cmd/cmd.go b/cmd/cmd.go index b761d018..610fddcb 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1341,6 +1341,7 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_NUM_PARALLEL"], envVars["OLLAMA_NOPRUNE"], envVars["OLLAMA_ORIGINS"], + envVars["OLLAMA_SCHED_SPREAD"], envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], diff --git a/cmd/interactive.go b/cmd/interactive.go index adbc3e9f..70afc6ea 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -1,6 +1,7 @@ package cmd import ( + "cmp" "errors" "fmt" "io" @@ -9,13 +10,14 @@ import ( "path/filepath" "regexp" "slices" - "sort" "strings" "github.com/spf13/cobra" + "golang.org/x/exp/maps" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/readline" "github.com/ollama/ollama/types/errtypes" @@ -138,6 +140,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, " /set parameter num_predict Max number of tokens to predict") fmt.Fprintln(os.Stderr, " /set parameter top_k Pick from top k num of tokens") fmt.Fprintln(os.Stderr, " /set parameter top_p Pick token based on sum of probabilities") + fmt.Fprintln(os.Stderr, " /set parameter min_p Pick token based on top token probability * min_p") fmt.Fprintln(os.Stderr, " /set parameter num_ctx Set the context size") fmt.Fprintln(os.Stderr, " /set parameter temperature Set creativity level") fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty How strongly to penalize repetitions") @@ -375,9 +378,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { return err } req := &api.ShowRequest{ - Name: opts.Model, - System: opts.System, - Options: opts.Options, + Name: opts.Model, + System: opts.System, + Options: opts.Options, } resp, err := client.Show(cmd.Context(), req) if err != nil { @@ -506,31 +509,35 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { } func buildModelfile(opts runOptions) string { - var mf strings.Builder - model := opts.ParentModel - if model == "" { - model = opts.Model - } - fmt.Fprintf(&mf, "FROM %s\n", model) + var f parser.File + f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)}) + if opts.System != "" { - fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System) + f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System}) } - keys := make([]string, 0) - for k := range opts.Options { - keys = append(keys, k) - } - sort.Strings(keys) + keys := maps.Keys(opts.Options) + slices.Sort(keys) for _, k := range keys { - fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k]) + v := opts.Options[k] + var cmds []parser.Command + switch t := v.(type) { + case []string: + for _, s := range t { + cmds = append(cmds, parser.Command{Name: k, Args: s}) + } + default: + cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)}) + } + + f.Commands = append(f.Commands, cmds...) 
} - fmt.Fprintln(&mf) for _, msg := range opts.Messages { - fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content) + f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)}) } - return mf.String() + return f.String() } func normalizeFilePath(fp string) string { diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index 711f3860..bb7e0aba 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -1,12 +1,10 @@ package cmd import ( - "bytes" "testing" - "text/template" + "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" ) @@ -57,58 +55,53 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 func TestModelfileBuilder(t *testing.T) { opts := runOptions{ - Model: "hork", - System: "You are part horse and part shark, but all hork. Do horklike things", + Model: "hork", + System: "You are part horse and part shark, but all hork. Do horklike things", Messages: []api.Message{ {Role: "user", Content: "Hey there hork!"}, {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, }, - Options: map[string]interface{}{}, + Options: map[string]any{ + "temperature": 0.9, + "seed": 42, + "penalize_newline": false, + "stop": []string{"hi", "there"}, + }, } - opts.Options["temperature"] = 0.9 - opts.Options["seed"] = 42 - opts.Options["penalize_newline"] = false - opts.Options["stop"] = []string{"hi", "there"} - - mf := buildModelfile(opts) - expectedModelfile := `FROM {{.Model}} -SYSTEM """{{.System}}""" + t.Run("model", func(t *testing.T) { + expect := `FROM hork +SYSTEM You are part horse and part shark, but all hork. Do horklike things PARAMETER penalize_newline false PARAMETER seed 42 -PARAMETER stop [hi there] +PARAMETER stop hi +PARAMETER stop there PARAMETER temperature 0.9 - -MESSAGE user """Hey there hork!""" -MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +MESSAGE user Hey there hork! +MESSAGE assistant Yes it is true, I am half horse, half shark. ` - tmpl, err := template.New("").Parse(expectedModelfile) - require.NoError(t, err) + actual := buildModelfile(opts) + if diff := cmp.Diff(expect, actual); diff != "" { + t.Errorf("mismatch (-want +got):\n%s", diff) + } + }) - var buf bytes.Buffer - err = tmpl.Execute(&buf, opts) - require.NoError(t, err) - assert.Equal(t, buf.String(), mf) - - opts.ParentModel = "horseshark" - mf = buildModelfile(opts) - expectedModelfile = `FROM {{.ParentModel}} -SYSTEM """{{.System}}""" + t.Run("parent model", func(t *testing.T) { + opts.ParentModel = "horseshark" + expect := `FROM horseshark +SYSTEM You are part horse and part shark, but all hork. Do horklike things PARAMETER penalize_newline false PARAMETER seed 42 -PARAMETER stop [hi there] +PARAMETER stop hi +PARAMETER stop there PARAMETER temperature 0.9 - -MESSAGE user """Hey there hork!""" -MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +MESSAGE user Hey there hork! +MESSAGE assistant Yes it is true, I am half horse, half shark. 
` - - tmpl, err = template.New("").Parse(expectedModelfile) - require.NoError(t, err) - - var parentBuf bytes.Buffer - err = tmpl.Execute(&parentBuf, opts) - require.NoError(t, err) - assert.Equal(t, parentBuf.String(), mf) + actual := buildModelfile(opts) + if diff := cmp.Diff(expect, actual); diff != "" { + t.Errorf("mismatch (-want +got):\n%s", diff) + } + }) } diff --git a/docs/api.md b/docs/api.md index 2d4fe28f..c0202ef3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -336,6 +336,7 @@ curl http://localhost:11434/api/generate -d '{ "num_predict": 100, "top_k": 20, "top_p": 0.9, + "min_p": 0.0, "tfs_z": 0.5, "typical_p": 0.7, "repeat_last_n": 33, @@ -586,7 +587,7 @@ Final response: ##### Request -Send a chat message with a conversation history. +Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64. ```shell curl http://localhost:11434/api/chat -d '{ diff --git a/docs/docker.md b/docs/docker.md index 0b58562b..a34c3291 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114 Now you can run a model: ``` -docker exec -it ollama ollama run llama3 +docker exec -it ollama ollama run llama3.1 ``` ### Try different models diff --git a/docs/faq.md b/docs/faq.md index da1848f7..324116d1 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -227,7 +227,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}' To preload a model using the CLI, use the command: ```shell -ollama run llama3 "" +ollama run llama3.1 "" ``` ## How do I keep a model loaded in memory or make it unload immediately? @@ -272,4 +272,8 @@ The following server settings may be used to adjust how Ollama handles concurren - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory. - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512 -Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. \ No newline at end of file +Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. + +## How does Ollama load models on multiple GPUs? + +Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. 
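The new `min_p` sampling option introduced in `api/types.go` and documented in the `docs/api.md` hunk above is set per request through the `options` object, alongside `top_k` and `top_p`. Below is a minimal Go sketch of doing that with the repo's `api` client package; the model name and the 0.05 cutoff are illustrative, and the `ClientFromEnvironment`/`Generate` call shapes are assumed from the existing `examples/go-generate` code rather than shown in this diff.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false
	req := &api.GenerateRequest{
		Model:  "llama3.1",
		Prompt: "Why is the sky blue?",
		Stream: &stream,
		// min_p keeps only tokens whose probability is at least
		// min_p * P(most likely token); 0.0 (the default) disables it.
		Options: map[string]interface{}{
			"min_p": 0.05,
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Println(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```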
\ No newline at end of file diff --git a/docs/modelfile.md b/docs/modelfile.md index c3645b06..852bf96c 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -141,6 +141,7 @@ PARAMETER | num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 | | top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | | top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | +| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 | ### TEMPLATE diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md index 4d60afb6..f925869b 100644 --- a/docs/tutorials/langchainjs.md +++ b/docs/tutorials/langchainjs.md @@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama"; const ollama = new Ollama({ baseUrl: "http://localhost:11434", - model: "llama3", + model: "llama3.1", }); const answer = await ollama.invoke(`why is the sky blue?`); @@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`); console.log(answer); ``` -That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. +That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. ```bash npm install cheerio diff --git a/docs/windows.md b/docs/windows.md index 69c2aa6d..dbfc1440 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -23,6 +23,8 @@ Logs will often be helpful in diagnosing the problem (see * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card +Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings. 
+ ## API Access Here's a quick example showing API access from `powershell` diff --git a/examples/go-chat/main.go b/examples/go-chat/main.go index 5266f03e..7663fb8f 100644 --- a/examples/go-chat/main.go +++ b/examples/go-chat/main.go @@ -35,7 +35,7 @@ func main() { ctx := context.Background() req := &api.ChatRequest{ - Model: "llama3", + Model: "llama3.1", Messages: messages, } diff --git a/examples/go-generate-streaming/main.go b/examples/go-generate-streaming/main.go index 49403351..3acfb22a 100644 --- a/examples/go-generate-streaming/main.go +++ b/examples/go-generate-streaming/main.go @@ -16,7 +16,7 @@ func main() { // By default, GenerateRequest is streaming. req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", } diff --git a/examples/go-generate/main.go b/examples/go-generate/main.go index 50fbf64b..2fe28742 100644 --- a/examples/go-generate/main.go +++ b/examples/go-generate/main.go @@ -15,7 +15,7 @@ func main() { } req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", // set streaming to false diff --git a/examples/go-http-generate/README.md b/examples/go-http-generate/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/langchain-python-rag-document/README.md b/examples/langchain-python-rag-document/README.md index 20a73a88..e2f3bc02 100644 --- a/examples/langchain-python-rag-document/README.md +++ b/examples/langchain-python-rag-document/README.md @@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document. ## Setup +1. Ensure you have the `llama3.1` model installed: + +``` +ollama pull llama3.1 +``` + +2. Install the Python Requirements. + ``` pip install -r requirements.txt ``` diff --git a/examples/langchain-python-rag-document/main.py b/examples/langchain-python-rag-document/main.py index 3ed9499f..6f7cec9b 100644 --- a/examples/langchain-python-rag-document/main.py +++ b/examples/langchain-python-rag-document/main.py @@ -51,7 +51,7 @@ while True: template=template, ) - llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) + llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) qa_chain = RetrievalQA.from_chain_type( llm, retriever=vectorstore.as_retriever(), diff --git a/examples/langchain-python-rag-websummary/README.md b/examples/langchain-python-rag-websummary/README.md index 3f3b9873..29c706a3 100644 --- a/examples/langchain-python-rag-websummary/README.md +++ b/examples/langchain-python-rag-websummary/README.md @@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso ## Running the Example -1. Ensure you have the `llama2` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama2 + ollama pull llama3.1 ``` 2. Install the Python Requirements. 
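The `examples/go-chat` and `examples/go-generate*` hunks above only show the request structs being retargeted to `llama3.1` and `gemma2`. For context, here is a self-contained sketch of the chat call as it reads after this change, assuming the `api` package surface already used by `examples/go-chat` (the callback signature is not visible in the hunk itself).

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	messages := []api.Message{
		{Role: "system", Content: "Provide very brief, concise responses"},
		{Role: "user", Content: "Why is the sky blue?"},
	}

	req := &api.ChatRequest{
		Model:    "llama3.1", // updated from "llama3" in this change
		Messages: messages,
	}

	// ChatRequest streams by default; the callback receives each partial message.
	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		fmt.Print(resp.Message.Content)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
```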
diff --git a/examples/langchain-python-rag-websummary/main.py b/examples/langchain-python-rag-websummary/main.py index d1b05ba8..77b09fbb 100644 --- a/examples/langchain-python-rag-websummary/main.py +++ b/examples/langchain-python-rag-websummary/main.py @@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally") docs = loader.load() -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") chain = load_summarize_chain(llm, chain_type="stuff") -result = chain.invoke(docs) +result = chain.invoke(docs) print(result) diff --git a/examples/langchain-python-simple/README.md b/examples/langchain-python-simple/README.md index d4102dec..60db2c8c 100644 --- a/examples/langchain-python-simple/README.md +++ b/examples/langchain-python-simple/README.md @@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama. ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py index 7cb65286..a7ed81d6 100644 --- a/examples/langchain-python-simple/main.py +++ b/examples/langchain-python-simple/main.py @@ -1,6 +1,6 @@ from langchain.llms import Ollama input = input("What is your question?") -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") res = llm.predict(input) print (res) diff --git a/examples/modelfile-mario/Modelfile b/examples/modelfile-mario/Modelfile index 33d5952b..a3747086 100644 --- a/examples/modelfile-mario/Modelfile +++ b/examples/modelfile-mario/Modelfile @@ -1,4 +1,4 @@ -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from super mario bros, acting as an assistant. diff --git a/examples/modelfile-mario/readme.md b/examples/modelfile-mario/readme.md index e4f0d417..c3f34197 100644 --- a/examples/modelfile-mario/readme.md +++ b/examples/modelfile-mario/readme.md @@ -2,12 +2,12 @@ # Example character: Mario -This example shows how to create a basic character using Llama3 as the base model. +This example shows how to create a basic character using Llama3.1 as the base model. To run this example: 1. Download the Modelfile -2. `ollama pull llama3` to get the base model used in the model file. +2. `ollama pull llama3.1` to get the base model used in the model file. 3. `ollama create NAME -f ./Modelfile` 4. `ollama run NAME` @@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?" What the model file looks like: ``` -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from Super Mario Bros, acting as an assistant. 
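The `buildModelfile` refactor in `cmd/interactive.go` earlier in this diff now assembles Modelfiles like the Mario example above through the `parser` package instead of hand-formatting strings. A minimal sketch of that approach, restricted to the `parser.File`/`parser.Command` surface visible in the diff (the exact quoting of the rendered output is up to `File.String`):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/parser"
)

func main() {
	var f parser.File

	// Equivalent of: FROM llama3.1
	f.Commands = append(f.Commands, parser.Command{Name: "model", Args: "llama3.1"})

	// Equivalent of: PARAMETER temperature 1
	f.Commands = append(f.Commands, parser.Command{Name: "temperature", Args: "1"})

	// Equivalent of the SYSTEM block in the Mario Modelfile
	f.Commands = append(f.Commands, parser.Command{
		Name: "system",
		Args: "You are Mario from super mario bros, acting as an assistant.",
	})

	// File.String serializes the commands back into Modelfile syntax,
	// which is what buildModelfile now returns for /save in interactive mode.
	fmt.Print(f.String())
}
```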
diff --git a/examples/python-dockerit/dockerit.py b/examples/python-dockerit/dockerit.py index b013102f..6a288d90 100644 --- a/examples/python-dockerit/dockerit.py +++ b/examples/python-dockerit/dockerit.py @@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ") client = docker.from_env() s = requests.Session() output="" -with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r: +with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r: for line in r.iter_lines(): if line: j = json.loads(line) diff --git a/examples/python-json-datagenerator/predefinedschema.py b/examples/python-json-datagenerator/predefinedschema.py index 1fd54892..68090ad7 100644 --- a/examples/python-json-datagenerator/predefinedschema.py +++ b/examples/python-json-datagenerator/predefinedschema.py @@ -2,7 +2,7 @@ import requests import json import random -model = "llama3" +model = "llama3.1" template = { "firstName": "", "lastName": "", diff --git a/examples/python-json-datagenerator/randomaddresses.py b/examples/python-json-datagenerator/randomaddresses.py index 72b1fefb..878c9803 100644 --- a/examples/python-json-datagenerator/randomaddresses.py +++ b/examples/python-json-datagenerator/randomaddresses.py @@ -12,7 +12,7 @@ countries = [ "France", ] country = random.choice(countries) -model = "llama3" +model = "llama3.1" prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters." diff --git a/examples/python-json-datagenerator/readme.md b/examples/python-json-datagenerator/readme.md index 88357044..5b444dff 100644 --- a/examples/python-json-datagenerator/readme.md +++ b/examples/python-json-datagenerator/readme.md @@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/python-simplechat/client.py b/examples/python-simplechat/client.py index f82a16b3..85043d5f 100644 --- a/examples/python-simplechat/client.py +++ b/examples/python-simplechat/client.py @@ -2,7 +2,7 @@ import json import requests # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve` -model = "llama3" # TODO: update this for whatever model you wish to use +model = "llama3.1" # TODO: update this for whatever model you wish to use def chat(messages): diff --git a/examples/python-simplechat/readme.md b/examples/python-simplechat/readme.md index dd2576bc..4c2ded4d 100644 --- a/examples/python-simplechat/readme.md +++ b/examples/python-simplechat/readme.md @@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. 
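The Python data-generator and simplechat clients above talk to the same plain HTTP endpoints the README documents, so the equivalent request can be made from Go with only the standard library. A sketch of a single non-streaming `/api/generate` call in the spirit of `predefinedschema.py`; the prompt and JSON keys are illustrative, while `"format": "json"` and `"stream": false` are the request options described in `docs/api.md`.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]interface{}{
		"model":  "llama3.1",
		"prompt": "Generate one realistic sample person as JSON with firstName, lastName, and age keys.",
		"format": "json", // ask the server to constrain output to valid JSON
		"stream": false,  // return a single response object instead of a stream
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}
	fmt.Println(out.Response)
}
```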
diff --git a/examples/typescript-simplechat/client.ts b/examples/typescript-simplechat/client.ts index a1e0eea3..8ad113b1 100644 --- a/examples/typescript-simplechat/client.ts +++ b/examples/typescript-simplechat/client.ts @@ -1,6 +1,6 @@ import * as readline from "readline"; -const model = "llama3"; +const model = "llama3.1"; type Message = { role: "assistant" | "user" | "system"; content: string; diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 1fa4c625..2487183f 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -10,6 +10,7 @@ import ( "path/filepath" "regexp" "slices" + "sort" "strconv" "strings" @@ -82,6 +83,20 @@ func AMDGetGPUInfo() []RocmGPUInfo { // The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract // from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU) matches, _ := filepath.Glob(GPUPropertiesFileGlob) + sort.Slice(matches, func(i, j int) bool { + // /sys/class/kfd/kfd/topology/nodes//properties + a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64) + if err != nil { + slog.Debug("parse err", "error", err, "match", matches[i]) + return false + } + b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64) + if err != nil { + slog.Debug("parse err", "error", err, "match", matches[i]) + return false + } + return a < b + }) cpuCount := 0 for _, match := range matches { slog.Debug("evaluating amdgpu node " + match) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 14d921c0..0d51460c 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } - else if (arg == "--lora-base") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { server_verbose = true; diff --git a/llm/llama.cpp b/llm/llama.cpp deleted file mode 160000 index a8db2a9c..00000000 --- a/llm/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 646bc49c..0d40fc3c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..7113ba64 100644 +index a207451f..2ddf431d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5433,16 +5433,7 @@ static void llm_load_vocab( +@@ -5347,16 +5347,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5526,7 +5517,8 @@ static void llm_load_vocab( - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; - vocab.tokenizer_clean_spaces = false; +@@ -5443,7 +5434,8 @@ static void llm_load_vocab( + tokenizer_pre == "codeshell") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff 
index fc1017a6..10c66d1d 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp index dbb724fb..c26fe6ee 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par +@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644 + int err = llama_model_apply_lora_from_file(model, + lora_adapter.c_str(), + lora_scale, -+ ((i > 0) || params.lora_base.empty()) -+ ? NULL -+ : params.lora_base.c_str(), ++ nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff deleted file mode 100644 index 39f38fea..00000000 --- a/llm/patches/10-llama3-rope.diff +++ /dev/null @@ -1,70 +0,0 @@ -From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Tue, 23 Jul 2024 14:33:29 -0700 -Subject: [PATCH] llama 3.1 rope scaling - ---- - src/llama.cpp | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..a9969df8 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -2472,6 +2472,7 @@ struct llama_layer { - // long rope factors - struct ggml_tensor * rope_long = nullptr; - struct ggml_tensor * rope_short = nullptr; -+ struct ggml_tensor * rope_freqs = nullptr; - - // bitnet scale - struct ggml_tensor * wq_scale; -@@ -6143,6 +6144,8 @@ static bool llm_load_tensors( - - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - -+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); -+ - if (n_expert == 0) { - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); -@@ -8620,6 +8623,10 @@ struct llm_build_context { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - -+ if (model.layers[il].rope_freqs != nullptr) { -+ return model.layers[il].rope_freqs; -+ } -+ - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } -@@ -8814,6 +8821,9 @@ struct llm_build_context { - - // self-attention - { -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ struct ggml_tensor * rope_factors = build_rope_factors(il); -+ - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); -@@ -8837,14 +8847,14 @@ struct llm_build_context { - } - - Qcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); --- -2.45.2 diff --git a/llm/server.go b/llm/server.go index 55732773..8127960f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -727,6 +727,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu "temperature": req.Options.Temperature, "top_k": req.Options.TopK, "top_p": req.Options.TopP, + "min_p": req.Options.MinP, "tfs_z": req.Options.TFSZ, "typical_p": req.Options.TypicalP, "repeat_last_n": req.Options.RepeatLastN, diff --git a/macapp/src/app.tsx b/macapp/src/app.tsx index ab17df60..a627e63d 100644 --- a/macapp/src/app.tsx +++ b/macapp/src/app.tsx @@ -19,7 +19,7 @@ export default function () { const [step, setStep] = useState(Step.WELCOME) const [commandCopied, setCommandCopied] = useState(false) - const command = 'ollama run llama3' + const command = 'ollama run llama3.1' return (
diff --git a/openai/openai.go b/openai/openai.go index de6f4bd5..5bd80660 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -218,6 +218,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { Index: 0, Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls}, FinishReason: func(reason string) *string { + if len(toolCalls) > 0 { + reason = "tool_calls" + } if len(reason) > 0 { return &reason } diff --git a/parser/parser_test.go b/parser/parser_test.go index 2b5c4c88..48044bc0 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -451,6 +451,7 @@ func TestParseFileParameters(t *testing.T) { "num_predict 1": {"num_predict", "1"}, "top_k 1": {"top_k", "1"}, "top_p 1.0": {"top_p", "1.0"}, + "min_p 0.05": {"min_p", "0.05"}, "tfs_z 1.0": {"tfs_z", "1.0"}, "typical_p 1.0": {"typical_p", "1.0"}, "repeat_last_n 1": {"repeat_last_n", "1"}, diff --git a/scripts/install.sh b/scripts/install.sh index 2a06c350..aa8b3e5e 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -198,19 +198,29 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then exit 0 fi +CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and Architecture are not supported by NVIDIA. Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora install_cuda_driver_yum() { status 'Installing NVIDIA repository...' + case $PACKAGE_MANAGER in yum) $SUDO $PACKAGE_MANAGER -y install yum-utils - $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + else + error $CUDA_REPO_ERR_MSG + fi ;; dnf) - $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + else + error $CUDA_REPO_ERR_MSG + fi ;; esac @@ -235,7 +245,11 @@ install_cuda_driver_yum() { # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian install_cuda_driver_apt() { status 'Installing NVIDIA repository...' - curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then + curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + else + error $CUDA_REPO_ERR_MSG + fi case $1 in debian)
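For reference, the `finish_reason` change to `openai/openai.go` above reduces to a simple precedence: a response that produced tool calls is reported as `tool_calls` regardless of the upstream done reason, otherwise a non-empty reason (such as `stop` or `length`) is passed through, and an empty reason becomes `null`. A standalone restatement of that logic; the function and parameter names here are illustrative, not the package's own.

```go
package main

import "fmt"

// finishReason mirrors the precedence now used in toChatCompletion:
// tool calls win, then any non-empty upstream reason, otherwise nil.
func finishReason(reason string, toolCallCount int) *string {
	if toolCallCount > 0 {
		reason = "tool_calls"
	}
	if len(reason) > 0 {
		return &reason
	}
	return nil
}

func main() {
	fmt.Println(*finishReason("stop", 2)) // "tool_calls": tool calls override the done reason
	fmt.Println(*finishReason("stop", 0)) // "stop": passed through unchanged
	fmt.Println(finishReason("", 0))      // <nil>: no reason reported yet (e.g. a streaming chunk)
}
```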