From 66ab48772f4f41f3f27fb93e15ef0cf756bda3d0 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 29 May 2024 21:37:07 -0700 Subject: [PATCH 01/13] proper utf16 support --- parser/parser.go | 92 +++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 4f44f6af..c9afc20b 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -3,12 +3,15 @@ package parser import ( "bufio" "bytes" + "encoding/binary" "errors" "fmt" "io" + "log/slog" "strconv" "strings" - "unicode" + "unicode/utf16" + "unicode/utf8" ) type File struct { @@ -69,33 +72,31 @@ func ParseFile(r io.Reader) (*File, error) { var b bytes.Buffer var role string - var lineCount int - var linePos int - - var utf16 bool - var f File br := bufio.NewReader(r) - for { - r, _, err := br.ReadRune() - if errors.Is(err, io.EOF) { - break - } else if err != nil { + + var sc scannerDecoder = utf8ScannerDecoder{} + if bom, err := br.Peek(2); err != nil { + slog.Warn("error reading byte-order mark", "error", err) + } else if bytes.Equal(bom, []byte{0xFE, 0xFF}) { + sc = utf16ScannerDecoder{binary.LittleEndian} + //nolint:errcheck + br.Discard(2) + } else if bytes.Equal(bom, []byte{0xFF, 0xFE}) { + sc = utf16ScannerDecoder{binary.BigEndian} + //nolint:errcheck + br.Discard(2) + } + + scanner := bufio.NewScanner(br) + scanner.Split(sc.ScanBytes) + for scanner.Scan() { + r, err := sc.DecodeRune(scanner.Bytes()) + if err != nil { return nil, err } - // the utf16 byte order mark will be read as "unreadable" by ReadRune() - if isUnreadable(r) && lineCount == 0 && linePos == 0 { - utf16 = true - continue - } - - // skip the second byte if we're reading utf16 - if utf16 && r == 0 { - continue - } - next, r, err := parseRuneForState(r, curr) if errors.Is(err, io.ErrUnexpectedEOF) { return nil, fmt.Errorf("%w: %s", err, b.String()) @@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) { return nil, err } - if isNewline(r) { - lineCount++ - linePos = 0 - } else { - linePos++ - } - // process the state transition, some transitions need to be intercepted and redirected if next != curr { switch curr { @@ -309,10 +303,6 @@ func isNewline(r rune) bool { return r == '\r' || r == '\n' } -func isUnreadable(r rune) bool { - return r == unicode.ReplacementChar -} - func isValidMessageRole(role string) bool { return role == "system" || role == "user" || role == "assistant" } @@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool { return false } } + +type scannerDecoder interface { + ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) + DecodeRune([]byte) (rune, error) +} + +type utf8ScannerDecoder struct{} + +func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { + return scanBytesN(data, 1, atEOF) +} + +func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) { + r, _ := utf8.DecodeRune(data) + return r, nil +} + +type utf16ScannerDecoder struct { + binary.ByteOrder +} + +func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { + return scanBytesN(data, 2, atEOF) +} + +func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) { + return utf16.Decode([]uint16{e.ByteOrder.Uint16(data)})[0], nil +} + +func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + return n, data[:n], nil +} From b9ce7bf75effbe56a4d8f8b237363eb96223c7ae Mon Sep 17 00:00:00 2001 From: 
Michael Yang Date: Fri, 7 Jun 2024 16:45:15 -0700 Subject: [PATCH 02/13] update import.md --- docs/import.md | 227 +++++++++++++++++-------------------------------- 1 file changed, 78 insertions(+), 149 deletions(-) diff --git a/docs/import.md b/docs/import.md index 7041b74d..7abe39b2 100644 --- a/docs/import.md +++ b/docs/import.md @@ -1,170 +1,99 @@ -# Import a model +# Import -This guide walks through importing a GGUF, PyTorch or Safetensors model. +GGUF models and select Safetensors models can be imported directly into Ollama. -## Importing (GGUF) +## Import GGUF -### Step 1: Write a `Modelfile` +A binary GGUF file can be imported directly into Ollama through a Modelfile. -Start by creating a `Modelfile`. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more. - -``` -FROM ./mistral-7b-v0.1.Q4_0.gguf +```dockerfile +FROM /path/to/file.gguf ``` -(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`: +## Import Safetensors -``` -FROM ./mistral-7b-v0.1.Q4_0.gguf -TEMPLATE "[INST] {{ .Prompt }} [/INST]" +If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile: + + - LlamaForCausalLM + - MistralForCausalLM + - GemmaForCausalLM + +```dockerfile +FROM /path/to/safetensors/directory ``` -### Step 2: Create the Ollama model +For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf). -Finally, create a model from your `Modelfile`: +## Automatic Quantization +> [!NOTE] +> Automatic quantization requires v0.1.35 or higher. + +Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`. + +```dockerfile +FROM /path/to/my/gemma/f16/model ``` -ollama create example -f Modelfile -``` - -### Step 3: Run your model - -Next, test the model with `ollama run`: - -``` -ollama run example "What is your favourite condiment?" -``` - -## Importing (PyTorch & Safetensors) - -> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress. - -### Setup - -First, clone the `ollama/ollama` repo: - -``` -git clone git@github.com:ollama/ollama.git ollama -cd ollama -``` - -and then fetch its `llama.cpp` submodule: ```shell -git submodule init -git submodule update llm/llama.cpp +$ ollama create -q Q4_K_M mymodel +transferring model data +quantizing F16 model to Q4_K_M +creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd +creating new layer sha256:0853f0ad24e5865173bbf9ffcc7b0f5d56b66fd690ab1009867e45e7d2c4db0f +writing manifest +success ``` -Next, install the Python dependencies: +### Supported Quantizations -``` -python3 -m venv llm/llama.cpp/.venv -source llm/llama.cpp/.venv/bin/activate -pip install -r llm/llama.cpp/requirements.txt +
+#### Legacy Quantization
+
+- `Q4_0`
+- `Q4_1`
+- `Q5_0`
+- `Q5_1`
+- `Q8_0`
+ +
+#### K-means Quantization
+
+- `Q3_K_S`
+- `Q3_K_M`
+- `Q3_K_L`
+- `Q4_K_S`
+- `Q4_K_M`
+- `Q5_K_S`
+- `Q5_K_M`
+- `Q6_K`
+ +> [!NOTE] +> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf). + +## Template Detection + +> [!NOTE] +> Template detection requires v0.1.42 or higher. + +Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing. + +```dockerfile +FROM /path/to/my/gemma/model ``` -Then build the `quantize` tool: - -``` -make -C llm/llama.cpp quantize +```shell +$ ollama create mymodel +transferring model data +using autodetected template gemma-instruct +creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84 +creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb +writing manifest +success ``` -### Clone the HuggingFace repository (optional) - -If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model. - -Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository: - -``` -git lfs install -git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model -``` - -### Convert the model - -> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py` - -``` -python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin -``` - -### Quantize the model - -``` -llm/llama.cpp/quantize converted.bin quantized.bin q4_0 -``` - -### Step 3: Write a `Modelfile` - -Next, create a `Modelfile` for your model: - -``` -FROM quantized.bin -TEMPLATE "[INST] {{ .Prompt }} [/INST]" -``` - -### Step 4: Create the Ollama model - -Finally, create a model from your `Modelfile`: - -``` -ollama create example -f Modelfile -``` - -### Step 5: Run your model - -Next, test the model with `ollama run`: - -``` -ollama run example "What is your favourite condiment?" -``` - -## Publishing your model (optional – early alpha) - -Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps: - -1. Create [an account](https://ollama.com/signup) -2. Copy your Ollama public key: - - macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy` - - Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub` - - Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub` -3. Add your public key to your [Ollama account](https://ollama.com/settings/keys) - -Next, copy your model to your username's namespace: - -``` -ollama cp example /example -``` - -> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`. - -Then push the model: - -``` -ollama push /example -``` - -After publishing, your model will be available at `https://ollama.com//example`. - -## Quantization reference - -The quantization options are as follow (from highest highest to lowest levels of quantization). Note: some architectures such as Falcon do not support K quants. 
- -- `q2_K` -- `q3_K` -- `q3_K_S` -- `q3_K_M` -- `q3_K_L` -- `q4_0` (recommended) -- `q4_1` -- `q4_K` -- `q4_K_S` -- `q4_K_M` -- `q5_0` -- `q5_1` -- `q5_K` -- `q5_K_S` -- `q5_K_M` -- `q6_K` -- `q8_0` -- `f16` +Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one. From 620d5c569e965ac93ac5c58bca5d3d8938cb98bc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Sat, 8 Jun 2024 12:32:02 -0700 Subject: [PATCH 03/13] fix parsing big endian gguf --- llm/ggml.go | 13 +++++-------- llm/gguf.go | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/llm/ggml.go b/llm/ggml.go index 645447d5..16da4c9d 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -231,8 +231,7 @@ const ( // Magic constant for `ggla` files (LoRA adapter). FILE_MAGIC_GGLA = 0x67676C61 // Magic constant for `gguf` files (versioned, gguf) - FILE_MAGIC_GGUF_LE = 0x46554747 - FILE_MAGIC_GGUF_BE = 0x47475546 + FILE_MAGIC_GGUF = 0x46554747 ) var ErrUnsupportedFormat = errors.New("unsupported model format") @@ -247,7 +246,7 @@ func DetectGGMLType(b []byte) string { return "ggjt" case FILE_MAGIC_GGLA: return "ggla" - case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE: + case FILE_MAGIC_GGUF: return "gguf" default: return "" @@ -255,21 +254,19 @@ func DetectGGMLType(b []byte) string { } func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { - var magic uint32 + var magic [4]byte if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { return nil, 0, err } var c container - switch magic { + switch binary.LittleEndian.Uint32(magic[:]) { case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: return nil, 0, ErrUnsupportedFormat case FILE_MAGIC_GGLA: c = &containerGGLA{} - case FILE_MAGIC_GGUF_LE: + case FILE_MAGIC_GGUF: c = &containerGGUF{ByteOrder: binary.LittleEndian} - case FILE_MAGIC_GGUF_BE: - c = &containerGGUF{ByteOrder: binary.BigEndian} default: return nil, 0, errors.New("invalid file magic") } diff --git a/llm/gguf.go b/llm/gguf.go index 234efe57..8c64e166 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -36,10 +36,23 @@ func (c *containerGGUF) Name() string { } func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { - if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil { + var version [4]byte + if err := binary.Read(rs, c.ByteOrder, &version); err != nil { return nil, err } + // if the lower 16 bits are 0, the byte order is probably wrong + if c.ByteOrder.Uint32(version[:])&1<<4 == 0 { + switch c.ByteOrder { + case binary.LittleEndian: + c.ByteOrder = binary.BigEndian + case binary.BigEndian: + c.ByteOrder = binary.LittleEndian + } + } + + c.Version = c.ByteOrder.Uint32(version[:]) + var err error switch c.Version { case 1: From 943172cbf4d6cc0b8682021bfc9c2d816152615d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 8 Jun 2024 23:04:32 -0700 Subject: [PATCH 04/13] Update api.md --- docs/api.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/api.md b/docs/api.md index 64bfbed8..f761f68c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1044,11 +1044,10 @@ GET /api/ps List models that are currently loaded into memory. -\* If a model is loaded completely into system memory, `size_vram` is omitted from the response. - #### Examples ### Request + ```shell curl http://localhost:11434/api/ps ``` @@ -1080,4 +1079,4 @@ A single JSON object will be returned. 
} ] } -``` \ No newline at end of file +``` From 5528dd9d1170e7a78a4fdb7684e8944e2052ca8f Mon Sep 17 00:00:00 2001 From: dcasota <14890243+dcasota@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:41:07 +0200 Subject: [PATCH 05/13] Error handling load_single_document() in ingest.py (#4852) load_single_document() handles - corrupt files - empty (zero byte) files - unsupported file extensions --- .../langchain-python-rag-privategpt/ingest.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/langchain-python-rag-privategpt/ingest.py b/examples/langchain-python-rag-privategpt/ingest.py index 35324775..0f71ccf0 100755 --- a/examples/langchain-python-rag-privategpt/ingest.py +++ b/examples/langchain-python-rag-privategpt/ingest.py @@ -77,13 +77,21 @@ LOADER_MAPPING = { def load_single_document(file_path: str) -> List[Document]: - ext = "." + file_path.rsplit(".", 1)[-1] - if ext in LOADER_MAPPING: - loader_class, loader_args = LOADER_MAPPING[ext] - loader = loader_class(file_path, **loader_args) - return loader.load() + if os.path.getsize(file_path) != 0: + filename, ext = os.path.splitext(file_path) + if ext in LOADER_MAPPING: + loader_class, loader_args = LOADER_MAPPING[ext] + try: + loader = loader_class(file_path, **loader_args) + if loader: + return loader.load() + except: + print(f"Corrupted file {file_path}. Ignoring it.") + else: + print(f"Unsupported file {file_path}. Ignoring it.") + else: + print(f"Empty file {file_path}. Ignoring it.") - raise ValueError(f"Unsupported file extension '{ext}'") def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]: """ @@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum results = [] with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar: for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)): - results.extend(docs) + if docs: + results.extend(docs) pbar.update() return results From 896495de7b5814cca32ba83d2e9f2bf176ba98c8 Mon Sep 17 00:00:00 2001 From: Napuh <55241721+Napuh@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:49:03 +0200 Subject: [PATCH 06/13] Add instructions to easily install specific versions on faq.md (#4084) * Added instructions to easily install specific versions on faq.md * Small typo * Moved instructions on how to install specific version to linux.md * Update docs/linux.md * Update docs/linux.md --------- Co-authored-by: Jeffrey Morgan --- docs/linux.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/linux.md b/docs/linux.md index 9e7e06fa..ec730656 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -100,6 +100,16 @@ sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama sudo chmod +x /usr/bin/ollama ``` +## Installing specific versions + +Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). + +For example: + +``` +curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh +``` + ## Viewing logs To view logs of Ollama running as a startup service, run: From b84aea1685329f107c5547b1cca4efcf15c19f52 Mon Sep 17 00:00:00 2001 From: Craig Hughes Date: Sun, 9 Jun 2024 13:57:09 -0400 Subject: [PATCH 07/13] Critical fix from llama.cpp JSON grammar to forbid un-escaped escape characters inside strings, which breaks parsing. 
(#3782) --- llm/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 6cb01fa0..0a815798 100644 --- a/llm/server.go +++ b/llm/server.go @@ -606,7 +606,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws From 94d37fdcae30ddeb6c9f65c8707004f5ec9eaf33 Mon Sep 17 00:00:00 2001 From: Jim Scardelis Date: Sun, 9 Jun 2024 10:58:09 -0700 Subject: [PATCH 08/13] fix: examples/langchain-python-rag-privategpt/requirements.txt (#3382) --- examples/langchain-python-rag-privategpt/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/langchain-python-rag-privategpt/requirements.txt b/examples/langchain-python-rag-privategpt/requirements.txt index d4c5cb2e..0aad1fe5 100644 --- a/examples/langchain-python-rag-privategpt/requirements.txt +++ b/examples/langchain-python-rag-privategpt/requirements.txt @@ -11,4 +11,5 @@ tabulate==0.9.0 pandoc==2.3 pypandoc==1.11 tqdm==4.66.1 -sentence_transformers==2.2.2 \ No newline at end of file +sentence_transformers==2.2.2 +numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability \ No newline at end of file From b27268aaefa03f401f0f09c1e21b63872288ddc7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 10 Jun 2024 11:31:34 -0700 Subject: [PATCH 09/13] add test --- server/routes_delete_test.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index d0990009..0f003574 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -1,12 +1,15 @@ package server import ( + "bytes" + "encoding/json" "fmt" "net/http" "path/filepath" "testing" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/types/model" ) func TestDelete(t *testing.T) { @@ -69,3 +72,33 @@ func TestDelete(t *testing.T) { checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{}) checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{}) } + +func TestDeleteDuplicateLayers(t *testing.T) { + p := t.TempDir() + t.Setenv("OLLAMA_MODELS", p) + var s Server + + n := model.ParseName("test") + + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(&ConfigV2{}); err != nil { + t.Fatal(err) + } + + config, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") + if err != nil { + t.Fatal(err) + } + + // create a manifest with duplicate layers + if err := WriteManifest(n, config, []*Layer{config}); err != nil { + t.Fatal(err) + } + + w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"}) + if w.Code != http.StatusOK { + t.Errorf("expected status code 200, actual %d", w.Code) + } + + checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{}) +} From 515f497e6d5c0a102f913693fc8d4ed5eefd4ebb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 10 Jun 2024 11:15:03 -0700 Subject: [PATCH 10/13] fix: skip removing layers that no longer exist --- server/manifest.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/manifest.go b/server/manifest.go index d0675724..61dd1ab4 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -3,6 +3,7 @@ package server import ( "crypto/sha256" "encoding/json" + "errors" "fmt" "io" "log/slog" @@ -43,7 +44,9 @@ func (m *Manifest) Remove() error { func (m *Manifest) RemoveLayers() error { for _, layer := range append(m.Layers, 
m.Config) { - if err := layer.Remove(); err != nil { + if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { + slog.Debug("layer does not exist", "digest", layer.Digest) + } else if err != nil { return err } } From 2ff45d571de4463fcebf779373ae7337cf969ebf Mon Sep 17 00:00:00 2001 From: James Montgomery Date: Tue, 11 Jun 2024 14:15:05 -0400 Subject: [PATCH 11/13] Add Ollama-hpp to Community Libraries in README. (#4983) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d29d04f7..2fdc63cb 100644 --- a/README.md +++ b/README.md @@ -332,6 +332,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp) - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai) - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs) +- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp) - [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j) - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama) - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit) From ead259d877fc8b20f7943f1f9e8eeaae0acfa52a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 11 Jun 2024 14:24:41 -0700 Subject: [PATCH 12/13] llm: fix seed value not being applied to requests (#4986) --- docs/api.md | 5 ++--- llm/ext_server/server.cpp | 7 +------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/api.md b/docs/api.md index f761f68c..35f1def3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -250,7 +250,7 @@ curl http://localhost:11434/api/generate -d '{ #### Request (Reproducible outputs) -For reproducible outputs, set `temperature` to 0 and `seed` to a number: +For reproducible outputs, set `seed` to a number: ##### Request @@ -259,8 +259,7 @@ curl http://localhost:11434/api/generate -d '{ "model": "mistral", "prompt": "Why is the sky blue?", "options": { - "seed": 123, - "temperature": 0 + "seed": 123 } }' ``` diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 7d14e48e..93e71562 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -359,7 +359,6 @@ struct llama_server_context // slots / clients std::vector slots; - json default_generation_settings_for_props; llama_server_queue queue_tasks; llama_server_response queue_results; @@ -483,9 +482,6 @@ struct llama_server_context slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); - default_generation_settings_for_props["seed"] = -1; - batch = llama_batch_init(n_ctx, 0, params.n_parallel); } @@ -584,7 +580,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_params.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -811,7 +807,6 @@ struct llama_server_context llama_sampling_free(slot->ctx_sampling); } slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; 
all_slots_are_idle = false; From 7bdcd1da9417ecc1a1069f2d73f0b88a3a43857d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 11 Jun 2024 15:55:44 -0700 Subject: [PATCH 13/13] Revert "Merge pull request #4938 from ollama/mxyng/fix-byte-order" This reverts commit f5f245cc154580fa7b4052c001d2a7e3d771cfb8, reversing changes made to 94d37fdcae30ddeb6c9f65c8707004f5ec9eaf33. this change broke gguf v2 which is incorrectly detected as big endian --- llm/ggml.go | 13 ++++++++----- llm/gguf.go | 15 +-------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/llm/ggml.go b/llm/ggml.go index 16da4c9d..645447d5 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -231,7 +231,8 @@ const ( // Magic constant for `ggla` files (LoRA adapter). FILE_MAGIC_GGLA = 0x67676C61 // Magic constant for `gguf` files (versioned, gguf) - FILE_MAGIC_GGUF = 0x46554747 + FILE_MAGIC_GGUF_LE = 0x46554747 + FILE_MAGIC_GGUF_BE = 0x47475546 ) var ErrUnsupportedFormat = errors.New("unsupported model format") @@ -246,7 +247,7 @@ func DetectGGMLType(b []byte) string { return "ggjt" case FILE_MAGIC_GGLA: return "ggla" - case FILE_MAGIC_GGUF: + case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE: return "gguf" default: return "" @@ -254,19 +255,21 @@ func DetectGGMLType(b []byte) string { } func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { - var magic [4]byte + var magic uint32 if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { return nil, 0, err } var c container - switch binary.LittleEndian.Uint32(magic[:]) { + switch magic { case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: return nil, 0, ErrUnsupportedFormat case FILE_MAGIC_GGLA: c = &containerGGLA{} - case FILE_MAGIC_GGUF: + case FILE_MAGIC_GGUF_LE: c = &containerGGUF{ByteOrder: binary.LittleEndian} + case FILE_MAGIC_GGUF_BE: + c = &containerGGUF{ByteOrder: binary.BigEndian} default: return nil, 0, errors.New("invalid file magic") } diff --git a/llm/gguf.go b/llm/gguf.go index 8c64e166..234efe57 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -36,23 +36,10 @@ func (c *containerGGUF) Name() string { } func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { - var version [4]byte - if err := binary.Read(rs, c.ByteOrder, &version); err != nil { + if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil { return nil, err } - // if the lower 16 bits are 0, the byte order is probably wrong - if c.ByteOrder.Uint32(version[:])&1<<4 == 0 { - switch c.ByteOrder { - case binary.LittleEndian: - c.ByteOrder = binary.BigEndian - case binary.BigEndian: - c.ByteOrder = binary.LittleEndian - } - } - - c.Version = c.ByteOrder.Uint32(version[:]) - var err error switch c.Version { case 1: