diff --git a/README.md b/README.md index a9601ec2..c4119560 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp) - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai) - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs) +- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp) - [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j) - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama) - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit) diff --git a/docs/api.md b/docs/api.md index 64bfbed8..35f1def3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -250,7 +250,7 @@ curl http://localhost:11434/api/generate -d '{ #### Request (Reproducible outputs) -For reproducible outputs, set `temperature` to 0 and `seed` to a number: +For reproducible outputs, set `seed` to a number: ##### Request @@ -259,8 +259,7 @@ curl http://localhost:11434/api/generate -d '{ "model": "mistral", "prompt": "Why is the sky blue?", "options": { - "seed": 123, - "temperature": 0 + "seed": 123 } }' ``` @@ -1044,11 +1043,10 @@ GET /api/ps List models that are currently loaded into memory. -\* If a model is loaded completely into system memory, `size_vram` is omitted from the response. - #### Examples ### Request + ```shell curl http://localhost:11434/api/ps ``` @@ -1080,4 +1078,4 @@ A single JSON object will be returned. } ] } -``` \ No newline at end of file +``` diff --git a/docs/import.md b/docs/import.md index 7041b74d..7abe39b2 100644 --- a/docs/import.md +++ b/docs/import.md @@ -1,170 +1,99 @@ -# Import a model +# Import -This guide walks through importing a GGUF, PyTorch or Safetensors model. +GGUF models and select Safetensors models can be imported directly into Ollama. -## Importing (GGUF) +## Import GGUF -### Step 1: Write a `Modelfile` +A binary GGUF file can be imported directly into Ollama through a Modelfile. -Start by creating a `Modelfile`. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more. - -``` -FROM ./mistral-7b-v0.1.Q4_0.gguf +```dockerfile +FROM /path/to/file.gguf ``` -(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`: +## Import Safetensors -``` -FROM ./mistral-7b-v0.1.Q4_0.gguf -TEMPLATE "[INST] {{ .Prompt }} [/INST]" +If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile: + + - LlamaForCausalLM + - MistralForCausalLM + - GemmaForCausalLM + +```dockerfile +FROM /path/to/safetensors/directory ``` -### Step 2: Create the Ollama model +For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf). -Finally, create a model from your `Modelfile`: +## Automatic Quantization +> [!NOTE] +> Automatic quantization requires v0.1.35 or higher. + +Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`. 
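As a rough sense of the savings (approximate figures, not taken from any particular model): a 7B-parameter model stored as FP16 is about 7B × 2 bytes ≈ 14 GB, while a 4-bit type such as `Q4_K_M` averages roughly 4.5–5 bits per weight, putting the same model in the 4–5 GB range. The Modelfile and `ollama create` invocation below show the flag in use.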
+ +```dockerfile +FROM /path/to/my/gemma/f16/model ``` -ollama create example -f Modelfile -``` - -### Step 3: Run your model - -Next, test the model with `ollama run`: - -``` -ollama run example "What is your favourite condiment?" -``` - -## Importing (PyTorch & Safetensors) - -> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress. - -### Setup - -First, clone the `ollama/ollama` repo: - -``` -git clone git@github.com:ollama/ollama.git ollama -cd ollama -``` - -and then fetch its `llama.cpp` submodule: ```shell -git submodule init -git submodule update llm/llama.cpp +$ ollama create -q Q4_K_M mymodel +transferring model data +quantizing F16 model to Q4_K_M +creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd +creating new layer sha256:0853f0ad24e5865173bbf9ffcc7b0f5d56b66fd690ab1009867e45e7d2c4db0f +writing manifest +success ``` -Next, install the Python dependencies: +### Supported Quantizations -``` -python3 -m venv llm/llama.cpp/.venv -source llm/llama.cpp/.venv/bin/activate -pip install -r llm/llama.cpp/requirements.txt +
+#### Legacy Quantization
+
+- `Q4_0`
+- `Q4_1`
+- `Q5_0`
+- `Q5_1`
+- `Q8_0`
+
+#### K-means Quantization
+
+- `Q3_K_S`
+- `Q3_K_M`
+- `Q3_K_L`
+- `Q4_K_S`
+- `Q4_K_M`
+- `Q5_K_S`
+- `Q5_K_M`
+- `Q6_K`
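Each of these types is selected with the same `-q/--quantize` flag shown earlier; the Modelfile itself does not change. As a minimal sketch of scripting that step from Go (assuming only what the documentation above states: an `ollama` binary on `PATH` and a `Modelfile` in the working directory whose `FROM` points at FP16 weights; the name `mymodel` is a placeholder):

```go
package main

import (
	"fmt"
	"os/exec"
)

func main() {
	// Equivalent to running: ollama create -q Q4_K_M mymodel -f Modelfile
	cmd := exec.Command("ollama", "create", "-q", "Q4_K_M", "mymodel", "-f", "Modelfile")

	// Capture the transfer/quantize progress lines shown in the example above.
	out, err := cmd.CombinedOutput()
	fmt.Print(string(out))
	if err != nil {
		fmt.Println("quantized create failed:", err)
	}
}
```

Swap `Q4_K_M` for any of the types listed above.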
+ +> [!NOTE] +> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf). + +## Template Detection + +> [!NOTE] +> Template detection requires v0.1.42 or higher. + +Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing. + +```dockerfile +FROM /path/to/my/gemma/model ``` -Then build the `quantize` tool: - -``` -make -C llm/llama.cpp quantize +```shell +$ ollama create mymodel +transferring model data +using autodetected template gemma-instruct +creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84 +creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb +writing manifest +success ``` -### Clone the HuggingFace repository (optional) - -If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model. - -Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository: - -``` -git lfs install -git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model -``` - -### Convert the model - -> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py` - -``` -python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin -``` - -### Quantize the model - -``` -llm/llama.cpp/quantize converted.bin quantized.bin q4_0 -``` - -### Step 3: Write a `Modelfile` - -Next, create a `Modelfile` for your model: - -``` -FROM quantized.bin -TEMPLATE "[INST] {{ .Prompt }} [/INST]" -``` - -### Step 4: Create the Ollama model - -Finally, create a model from your `Modelfile`: - -``` -ollama create example -f Modelfile -``` - -### Step 5: Run your model - -Next, test the model with `ollama run`: - -``` -ollama run example "What is your favourite condiment?" -``` - -## Publishing your model (optional – early alpha) - -Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps: - -1. Create [an account](https://ollama.com/signup) -2. Copy your Ollama public key: - - macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy` - - Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub` - - Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub` -3. Add your public key to your [Ollama account](https://ollama.com/settings/keys) - -Next, copy your model to your username's namespace: - -``` -ollama cp example /example -``` - -> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`. - -Then push the model: - -``` -ollama push /example -``` - -After publishing, your model will be available at `https://ollama.com//example`. - -## Quantization reference - -The quantization options are as follow (from highest highest to lowest levels of quantization). Note: some architectures such as Falcon do not support K quants. 
- -- `q2_K` -- `q3_K` -- `q3_K_S` -- `q3_K_M` -- `q3_K_L` -- `q4_0` (recommended) -- `q4_1` -- `q4_K` -- `q4_K_S` -- `q4_K_M` -- `q5_0` -- `q5_1` -- `q5_K` -- `q5_K_S` -- `q5_K_M` -- `q6_K` -- `q8_0` -- `f16` +Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one. diff --git a/docs/linux.md b/docs/linux.md index 9e7e06fa..ec730656 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -100,6 +100,16 @@ sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama sudo chmod +x /usr/bin/ollama ``` +## Installing specific versions + +Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). + +For example: + +``` +curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh +``` + ## Viewing logs To view logs of Ollama running as a startup service, run: diff --git a/examples/langchain-python-rag-privategpt/ingest.py b/examples/langchain-python-rag-privategpt/ingest.py index 35324775..0f71ccf0 100755 --- a/examples/langchain-python-rag-privategpt/ingest.py +++ b/examples/langchain-python-rag-privategpt/ingest.py @@ -77,13 +77,21 @@ LOADER_MAPPING = { def load_single_document(file_path: str) -> List[Document]: - ext = "." + file_path.rsplit(".", 1)[-1] - if ext in LOADER_MAPPING: - loader_class, loader_args = LOADER_MAPPING[ext] - loader = loader_class(file_path, **loader_args) - return loader.load() + if os.path.getsize(file_path) != 0: + filename, ext = os.path.splitext(file_path) + if ext in LOADER_MAPPING: + loader_class, loader_args = LOADER_MAPPING[ext] + try: + loader = loader_class(file_path, **loader_args) + if loader: + return loader.load() + except: + print(f"Corrupted file {file_path}. Ignoring it.") + else: + print(f"Unsupported file {file_path}. Ignoring it.") + else: + print(f"Empty file {file_path}. 
Ignoring it.") - raise ValueError(f"Unsupported file extension '{ext}'") def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]: """ @@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum results = [] with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar: for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)): - results.extend(docs) + if docs: + results.extend(docs) pbar.update() return results diff --git a/examples/langchain-python-rag-privategpt/requirements.txt b/examples/langchain-python-rag-privategpt/requirements.txt index d4c5cb2e..0aad1fe5 100644 --- a/examples/langchain-python-rag-privategpt/requirements.txt +++ b/examples/langchain-python-rag-privategpt/requirements.txt @@ -11,4 +11,5 @@ tabulate==0.9.0 pandoc==2.3 pypandoc==1.11 tqdm==4.66.1 -sentence_transformers==2.2.2 \ No newline at end of file +sentence_transformers==2.2.2 +numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability \ No newline at end of file diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 7d14e48e..93e71562 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -359,7 +359,6 @@ struct llama_server_context // slots / clients std::vector slots; - json default_generation_settings_for_props; llama_server_queue queue_tasks; llama_server_response queue_results; @@ -483,9 +482,6 @@ struct llama_server_context slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); - default_generation_settings_for_props["seed"] = -1; - batch = llama_batch_init(n_ctx, 0, params.n_parallel); } @@ -584,7 +580,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_params.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -811,7 +807,6 @@ struct llama_server_context llama_sampling_free(slot->ctx_sampling); } slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; diff --git a/llm/server.go b/llm/server.go index 6cb01fa0..0a815798 100644 --- a/llm/server.go +++ b/llm/server.go @@ -606,7 +606,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws diff --git a/parser/parser.go b/parser/parser.go index 4f44f6af..c9afc20b 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -3,12 +3,15 @@ package parser import ( "bufio" "bytes" + "encoding/binary" "errors" "fmt" "io" + "log/slog" "strconv" "strings" - "unicode" + "unicode/utf16" + "unicode/utf8" ) type File struct { @@ -69,33 +72,31 @@ func ParseFile(r io.Reader) (*File, error) { var b bytes.Buffer var role string - var lineCount int - var linePos int - - var utf16 bool - var f File br := bufio.NewReader(r) - for { - r, _, err := br.ReadRune() - if errors.Is(err, io.EOF) { - break - } else 
if err != nil { + + var sc scannerDecoder = utf8ScannerDecoder{} + if bom, err := br.Peek(2); err != nil { + slog.Warn("error reading byte-order mark", "error", err) + } else if bytes.Equal(bom, []byte{0xFE, 0xFF}) { + sc = utf16ScannerDecoder{binary.LittleEndian} + //nolint:errcheck + br.Discard(2) + } else if bytes.Equal(bom, []byte{0xFF, 0xFE}) { + sc = utf16ScannerDecoder{binary.BigEndian} + //nolint:errcheck + br.Discard(2) + } + + scanner := bufio.NewScanner(br) + scanner.Split(sc.ScanBytes) + for scanner.Scan() { + r, err := sc.DecodeRune(scanner.Bytes()) + if err != nil { return nil, err } - // the utf16 byte order mark will be read as "unreadable" by ReadRune() - if isUnreadable(r) && lineCount == 0 && linePos == 0 { - utf16 = true - continue - } - - // skip the second byte if we're reading utf16 - if utf16 && r == 0 { - continue - } - next, r, err := parseRuneForState(r, curr) if errors.Is(err, io.ErrUnexpectedEOF) { return nil, fmt.Errorf("%w: %s", err, b.String()) @@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) { return nil, err } - if isNewline(r) { - lineCount++ - linePos = 0 - } else { - linePos++ - } - // process the state transition, some transitions need to be intercepted and redirected if next != curr { switch curr { @@ -309,10 +303,6 @@ func isNewline(r rune) bool { return r == '\r' || r == '\n' } -func isUnreadable(r rune) bool { - return r == unicode.ReplacementChar -} - func isValidMessageRole(role string) bool { return role == "system" || role == "user" || role == "assistant" } @@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool { return false } } + +type scannerDecoder interface { + ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) + DecodeRune([]byte) (rune, error) +} + +type utf8ScannerDecoder struct{} + +func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { + return scanBytesN(data, 1, atEOF) +} + +func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) { + r, _ := utf8.DecodeRune(data) + return r, nil +} + +type utf16ScannerDecoder struct { + binary.ByteOrder +} + +func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { + return scanBytesN(data, 2, atEOF) +} + +func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) { + return utf16.Decode([]uint16{e.ByteOrder.Uint16(data)})[0], nil +} + +func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + return n, data[:n], nil +} diff --git a/server/manifest.go b/server/manifest.go index d0675724..61dd1ab4 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -3,6 +3,7 @@ package server import ( "crypto/sha256" "encoding/json" + "errors" "fmt" "io" "log/slog" @@ -43,7 +44,9 @@ func (m *Manifest) Remove() error { func (m *Manifest) RemoveLayers() error { for _, layer := range append(m.Layers, m.Config) { - if err := layer.Remove(); err != nil { + if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { + slog.Debug("layer does not exist", "digest", layer.Digest) + } else if err != nil { return err } } diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index d0990009..0f003574 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -1,12 +1,15 @@ package server import ( + "bytes" + "encoding/json" "fmt" "net/http" "path/filepath" "testing" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/types/model" ) func TestDelete(t 
*testing.T) { @@ -69,3 +72,33 @@ func TestDelete(t *testing.T) { checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{}) checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{}) } + +func TestDeleteDuplicateLayers(t *testing.T) { + p := t.TempDir() + t.Setenv("OLLAMA_MODELS", p) + var s Server + + n := model.ParseName("test") + + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(&ConfigV2{}); err != nil { + t.Fatal(err) + } + + config, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") + if err != nil { + t.Fatal(err) + } + + // create a manifest with duplicate layers + if err := WriteManifest(n, config, []*Layer{config}); err != nil { + t.Fatal(err) + } + + w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"}) + if w.Code != http.StatusOK { + t.Errorf("expected status code 200, actual %d", w.Code) + } + + checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{}) +}
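The `RemoveLayers` change in `server/manifest.go` above, exercised by `TestDeleteDuplicateLayers`, covers manifests that list the same layer digest more than once: the first removal succeeds and later duplicates surface as `os.ErrNotExist`, which is now logged and skipped instead of aborting the delete. A standalone sketch of the same pattern, with plain file paths standing in for Ollama's `Layer` type (an illustration-only substitution):

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

// removeAll deletes each path, tolerating entries that were already
// removed earlier in the same loop (e.g. duplicate layer digests).
func removeAll(paths []string) error {
	for _, p := range paths {
		if err := os.Remove(p); errors.Is(err, os.ErrNotExist) {
			fmt.Printf("already removed, skipping: %s\n", p)
		} else if err != nil {
			return err
		}
	}
	return nil
}

func main() {
	f, err := os.CreateTemp("", "layer-*")
	if err != nil {
		panic(err)
	}
	f.Close()

	// The same path appears twice, mimicking a manifest with duplicate layers.
	if err := removeAll([]string{f.Name(), f.Name()}); err != nil {
		fmt.Println("unexpected error:", err)
	}
}
```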