Merge branch 'ollama:main' into main

readme: add crewAI to community integrations (#6699 )
readme: add crewAI with mesop to community integrations
2025-12-22 14:53:56 +00:00 · 2024-09-09 19:33:57 +08:00 · 2024-09-08 00:36:24 -07:00 · 2024-09-08 00:35:59 -07:00 · 2024-09-08 11:44:55 +08:00 · 2024-09-07 09:08:08 -07:00
111 changed files with 3225 additions and 25841 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -187,6 +187,13 @@ jobs:
  generate-windows-cuda:
    environment: release
    runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
@@ -220,11 +227,11 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA'
+      - name: 'Install CUDA ${{ matrix.cuda.version }}'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
@@ -256,15 +263,16 @@ jobs:
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: windows-cuda-deps-${{ matrix.cuda.version }}
          path: dist/deps/*

+
  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release
@@ -314,10 +322,16 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-11
      - uses: actions/download-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: generate-windows-cuda-12
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-11
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-12
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
@@ -363,7 +377,6 @@ jobs:
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@@ -459,7 +472,10 @@ jobs:
          merge-multiple: true
      - run: |
          ls -lh dist/
-          (cd dist; sha256sum * > sha256sum.txt)
+          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
+          mv sha256sum.txt dist/
+          mv dist/linux-???64 .
+          mv dist/linux-amd64-rocm .
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -32,6 +32,10 @@ linters:
 linters-settings:
  gci:
    sections: [standard, default, localmodule]
+  staticcheck:
+    checks:
+      - all
+      - -SA1019 # omit Deprecated check
 severity:
  default-severity: error
  rules:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -18,7 +18,7 @@ See the [development documentation](./docs/development.md) for instructions on h

 * New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
 * Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
-* Documentation: small updates to fill in or dorrect missing documentation is helpful, however large documentation additions can be hard to maintain over time.
+* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.

 ### Issues that may not be accepted

--- a/137
+++ b/137
@@ -1,7 +1,9 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION_11=11.3.1
+ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+ARG CUDA_VERSION_12=12.4.0
+ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2

 # Copy the minimal context we need to run the generate scripts
@@ -10,7 +12,7 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm

-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -18,9 +20,34 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+ARG CUDA_V11_ARCHITECTURES
+ENV GOARCH amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG CUDA_V12_ARCHITECTURES
+ENV GOARCH amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
+    CUDA_VARIANT="_v12" \
+    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -28,7 +55,32 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+ARG CUDA_V11_ARCHITECTURES
+ENV GOARCH arm64
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG CUDA_V12_ARCHITECTURES
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
+    CUDA_VARIANT="_v12" \
+    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
+    bash gen_linux.sh
+

 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -40,15 +92,11 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
-        cp ${dep} /tmp/scratch/ || exit 1 ; \
-    done && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
-    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
-
+ENV GOARCH amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )

 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
@@ -59,16 +107,21 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh

 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -79,12 +132,15 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh


 # Intermediate stage used for ./scripts/build_linux.sh
@@ -95,12 +151,16 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .

 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -109,23 +169,38 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
+
+# Strip out ROCm dependencies to keep the primary image lean
+FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
+RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*

 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
-RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=amd64-libs-without-rocm /scratch/ /lib/
+RUN apt-get update && apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+
 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
-RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+RUN apt-get update && apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/

 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+RUN ln -s /opt/rocm/lib /lib/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0

--- a/README.md
+++ b/README.md
@@ -313,13 +313,24 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
+- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
+- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
+- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
+- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
+- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
+- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
+- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
+- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)

 ### Terminal

@@ -345,6 +356,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)

+### Apple Vision Pro
+- [Enchanted](https://github.com/AugustDev/enchanted)
+
 ### Database

 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
@@ -353,23 +367,28 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Package managers

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
+- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
+- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
+- [Flox](https://flox.dev/blog/ollama-part-one)

 ### Libraries

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
+- [crewAI](https://github.com/crewAIInc/crewAI)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
+- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
 - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
+- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
@@ -386,11 +405,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
+- [Gollm](https://docs.gollm.co/examples/ollama-example)
+- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
+- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
+- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)

 ### Mobile

 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)

 ### Extensions & Plugins

@@ -415,11 +439,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
+- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama model)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
+- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)

 ### Supported backends

--- a/api/types.go
+++ b/api/types.go
@@ -296,15 +296,17 @@ type EmbeddingResponse struct {
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
 	Model     string `json:"model"`
-	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
 	Quantize  string `json:"quantize,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`

-	// Quantization is deprecated, see Quantize
+	// Deprecated: set the file content with Modelfile instead
+	Path string `json:"path"`
+
+	// Deprecated: use Quantize instead
 	Quantization string `json:"quantization,omitempty"`
 }

@@ -312,7 +314,7 @@ type CreateRequest struct {
 type DeleteRequest struct {
 	Model string `json:"model"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -327,7 +329,7 @@ type ShowRequest struct {

 	Options map[string]interface{} `json:"options"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -359,7 +361,7 @@ type PullRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -380,7 +382,7 @@ type PushRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -88,19 +88,10 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
-
+Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs

 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
+	"syscall"
 	"unsafe"

 	"golang.org/x/sys/windows"
@@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON
+	t.nid.Flags |= NIF_ICON | NIF_TIP
+	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+		copy(t.nid.Tip[:], toolTipUTF16)
+	} else {
+		return err
+	}
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

 	return t.nid.modify()
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -61,6 +61,7 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
+	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapters.safetensors
+		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapter_model.safetensors
+		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -223,6 +229,14 @@ func tempZipFiles(path string) (string, error) {
 	}
 	files = append(files, js...)

+	// bert models require a nested config.json
+	// TODO(mxyng): merge this with the glob above
+	js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
+	if err != nil {
+		return "", err
+	}
+	files = append(files, js...)
+
 	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
 		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
 		// tokenizer.model might be a unresolved git lfs reference; error if it is
@@ -252,6 +266,11 @@ func tempZipFiles(path string) (string, error) {
 			return "", err
 		}

+		zfi.Name, err = filepath.Rel(path, file)
+		if err != nil {
+			return "", err
+		}
+
 		zf, err := zipfile.CreateHeader(zfi)
 		if err != nil {
 			return "", err
@@ -707,14 +726,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 }

 func showInfo(resp *api.ShowResponse) {
-	arch := resp.ModelInfo["general.architecture"].(string)
-
 	modelData := [][]string{
-		{"arch", arch},
 		{"parameters", resp.Details.ParameterSize},
 		{"quantization", resp.Details.QuantizationLevel},
-		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+	}
+	if resp.ModelInfo != nil {
+		arch := resp.ModelInfo["general.architecture"].(string)
+		modelData = append(modelData,
+			[]string{"arch", arch},
+			[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+			[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+		)
 	}

 	mainTableData := [][]string{
@@ -1399,6 +1421,8 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
+				envVars["OLLAMA_LOAD_TIMEOUT"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -7,16 +7,27 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"strings"

 	"github.com/ollama/ollama/llm"
 )

-type Parameters struct {
+type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 }

-func (Parameters) KV(t *Tokenizer) llm.KV {
+type AdapterParameters struct {
+	Alpha          uint32 `json:"lora_alpha"`
+	LoraLayers     uint32 `json:"lora_layers"`
+	LoraParameters struct {
+		Rank  uint32  `json:"rank"`
+		Alpha float32 `json:"alpha"`
+		Scale float32 `json:"scale"`
+	} `json:"lora_parameters"`
+}
+
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
@@ -43,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (Parameters) specialTokenTypes() []string {
+func (p AdapterParameters) KV() llm.KV {
+	var alpha float32
+	if p.LoraParameters.Alpha == 0 {
+		alpha = float32(p.Alpha)
+	} else {
+		alpha = p.LoraParameters.Alpha
+	}
+
+	kv := llm.KV{
+		"adapter.lora.alpha": alpha,
+		"adapter.type":       "lora",
+		"general.file_type":  uint32(1),
+		"general.type":       "adapter",
+		"general.version":    "v0.2",
+	}
+
+	return kv
+}
+
+func (ModelParameters) specialTokenTypes() []string {
 	return []string{
 		"bos", "eos", "unk", "sep", "pad", "cls", "mask",
 	}
 }

-func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
 	return llm.WriteGGUF(ws, kv, ts)
 }

-type Converter interface {
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
+}
+
+type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
 	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string

-	// tensorName returns the LLM tensor name for a specific input name
-	tensorName(string) string
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
+	// writeFile writes the model to the provided io.WriteSeeker
 	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

+type moreParser interface {
+	parseMore(fs.FS) error
+}
+
+type AdapterConverter interface {
+	// KV maps parameters to LLM key-values
+	KV(llm.KV) llm.KV
+	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
+	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string
+
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+}
+
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+	bts, err := fs.ReadFile(fsys, "adapter_config.json")
+	if err != nil {
+		return err
+	}
+
+	var p AdapterParameters
+	if err := json.Unmarshal(bts, &p); err != nil {
+		return err
+	}
+
+	arch, ok := baseKV["general.architecture"]
+	if !ok {
+		return errors.New("architecture not set for the base model")
+	}
+
+	var conv AdapterConverter
+	switch arch {
+	case "llama":
+		conv = &llamaAdapter{}
+	case "gemma2":
+		conv = &gemma2Adapter{}
+	default:
+		return errors.New("unsupported architecture")
+	}
+
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+	if err != nil {
+		return err
+	}
+
+	if err := json.Unmarshal(bts, conv); err != nil {
+		return err
+	}
+
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+}
+
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func Convert(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
 	}

-	var p Parameters
+	var p ModelParameters
 	if err := json.Unmarshal(bts, &p); err != nil {
 		return err
 	}
@@ -85,16 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return errors.New("unknown architecture")
 	}

-	var conv Converter
+	var conv ModelConverter
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llama{}
+		conv = &llamaModel{}
 	case "MixtralForCausalLM":
-		conv = &mixtral{}
+		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
-		conv = &gemma{}
+		conv = &gemmaModel{}
+	case "Gemma2ForCausalLM":
+		conv = &gemma2Model{}
 	case "Phi3ForCausalLM":
-		conv = &phi3{}
+		conv = &phi3Model{}
+	case "BertModel":
+		conv = &bertModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
@@ -103,6 +197,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

+	if t, ok := conv.(moreParser); ok {
+		if err := t.parseMore(fsys); err != nil {
+			return err
+		}
+	}
+
 	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
 	if err != nil {
 		return err
@@ -119,7 +219,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}

-	ts, err := parseTensors(fsys)
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
 	if err != nil {
 		return err
 	}
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -0,0 +1,174 @@
+package convert
+
+import (
+	"cmp"
+	"encoding/json"
+	"io/fs"
+	"path/filepath"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type bertModel struct {
+	ModelParameters
+	NLayers               uint32  `json:"n_layers"`
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	NLayer                uint32  `json:"n_layer"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	NCtx                  uint32  `json:"n_ctx"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	NEmbd                 uint32  `json:"n_embd"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NInner                uint32  `json:"n_inner"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NHead                 uint32  `json:"n_head"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	LayerNormEPS          float32 `json:"layer_norm_eps"`
+	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
+	NormEpsilon           float32 `json:"norm_epsilon"`
+
+	PoolingType uint32
+}
+
+var (
+	_ ModelConverter = (*bertModel)(nil)
+	_ moreParser     = (*bertModel)(nil)
+)
+
+func (p *bertModel) parseMore(fsys fs.FS) error {
+	bts, err := fs.ReadFile(fsys, "modules.json")
+	if err != nil {
+		return err
+	}
+
+	var modules []struct {
+		Type string `json:"type"`
+		Path string `json:"path"`
+	}
+
+	if err := json.Unmarshal(bts, &modules); err != nil {
+		return err
+	}
+
+	var pooling string
+	for _, m := range modules {
+		if m.Type == "sentence_transformers.models.Pooling" {
+			pooling = m.Path
+			break
+		}
+	}
+
+	if pooling != "" {
+		bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
+		if err != nil {
+			return err
+		}
+
+		var pc struct {
+			PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
+			PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
+		}
+
+		if err := json.Unmarshal(bts, &pc); err != nil {
+			return err
+		}
+
+		if pc.PoolingModeMeanTokens {
+			p.PoolingType = 1
+		} else if pc.PoolingModeCLSToken {
+			p.PoolingType = 2
+		}
+	}
+
+	return nil
+}
+
+func (p *bertModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "bert"
+	kv["bert.attention.causal"] = false
+	kv["bert.pooling_type"] = p.PoolingType
+
+	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
+
+	if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
+		kv["bert.context_length"] = contextLength
+	}
+
+	if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
+		kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
+	}
+
+	if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
+		kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
+	}
+
+	if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
+		kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
+	}
+
+	if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
+		kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
+	}
+
+	kv["tokenizer.ggml.model"] = "bert"
+	kv["tokenizer.ggml.token_type_count"] = uint32(2)
+
+	// convert to phantom space tokens
+	for i, e := range t.Tokens {
+		if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
+			// noop
+		} else if strings.HasPrefix(e, "##") {
+			t.Tokens[i] = e[2:]
+		} else {
+			t.Tokens[i] = "\u2581" + e
+		}
+	}
+
+	kv["tokenizer.ggml.tokens"] = t.Tokens
+
+	return kv
+}
+
+func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		if slices.Contains([]string{
+			"embeddings.position_ids",
+			"pooler.dense.weight",
+			"pooler.dense.bias",
+		}, t.Name()) {
+			continue
+		}
+
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (bertModel) Replacements() []string {
+	return []string{
+		"encoder.layer", "blk",
+		"encoder.layers", "blk",
+		"embeddings.word_embeddings", "token_embd",
+		"embeddings.token_type_embeddings", "token_types",
+		"embeddings.LayerNorm", "token_embd_norm",
+		"embeddings.position_embeddings", "position_embd",
+		"attention.self.query", "attn_q",
+		"attention.self.key", "attn_k",
+		"attention.self.value", "attn_v",
+		"attention.output.dense", "attn_output",
+		"attention.output.LayerNorm", "attn_output_norm",
+		"intermediate.dense", "ffn_up",
+		"output.dense", "ffn_down",
+		"output.LayerNorm", "layer_output_norm",
+	}
+}
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -9,8 +9,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type gemma struct {
-	Parameters
+type gemmaModel struct {
+	ModelParameters
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	HiddenLayers          uint32  `json:"num_hidden_layers"`
@@ -21,12 +21,11 @@ type gemma struct {
 	HeadDim               uint32  `json:"head_dim"`
 }

-var _ Converter = (*gemma)(nil)
+var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemma) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
-	kv["general.name"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma.embedding_length"] = p.HiddenSize
 	kv["gemma.block_count"] = p.HiddenLayers
@@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "_norm.weight") {
+		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *gemma) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *gemmaModel) Replacements() []string {
+	return []string{
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		"model.layers", "blk",
@@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		"block_sparse_moe.gate", "ffn_inp",
-	).Replace(n)
+	}
 }

-func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))

--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -0,0 +1,53 @@
+package convert
+
+import (
+	"github.com/ollama/ollama/llm"
+)
+
+type gemma2Model struct {
+	gemmaModel
+	SlidingWindow         uint32  `json:"sliding_window"`
+	AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
+	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
+}
+
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "gemma2"
+	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
+	kv["gemma2.embedding_length"] = p.HiddenSize
+	kv["gemma2.block_count"] = p.HiddenLayers
+	kv["gemma2.feed_forward_length"] = p.IntermediateSize
+	kv["gemma2.attention.head_count"] = p.NumAttentionHeads
+	kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+	kv["gemma2.attention.key_length"] = p.HeadDim
+	kv["gemma2.attention.value_length"] = p.HeadDim
+	kv["gemma2.attention.sliding_window"] = p.SlidingWindow
+	kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
+	kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
+	kv["tokenizer.ggml.eot_token_id"] = uint32(107)
+	kv["tokenizer.ggml.middle_token_id"] = uint32(68)
+	kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
+	kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
+	return kv
+}
+
+func (p *gemma2Model) Replacements() []string {
+	return []string{
+		"model.embed_tokens", "token_embd",
+		"model.norm", "output_norm",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"post_attention_layernorm", "post_attention_norm",
+		"pre_feedforward_layernorm", "ffn_norm",
+		"post_feedforward_layernorm", "post_ffw_norm",
+	}
+}
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -0,0 +1,91 @@
+package convert
+
+import (
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type gemma2Adapter struct {
+	AdapterParameters
+}
+
+var _ AdapterConverter = (*gemma2Adapter)(nil)
+
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "gemma2"
+	return kv
+}
+
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *gemma2Adapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -3,6 +3,7 @@ package convert
 import (
 	"cmp"
 	"fmt"
+	"math"
 	"strings"

 	"github.com/pdevine/tensor"
@@ -11,8 +12,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type llama struct {
-	Parameters
+type llamaModel struct {
+	ModelParameters
 	NLayers               uint32  `json:"n_layers"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 	NLayer                uint32  `json:"n_layer"`
@@ -27,8 +28,14 @@ type llama struct {
 	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
 	RopeTheta             float32 `json:"rope_theta"`
 	RopeScaling           struct {
-		Type   string  `json:"type"`
-		Factor float32 `json:"factor"`
+		Type                            string  `json:"type"`
+		RopeType                        string  `json:"rope_type"`
+		Factor                          float32 `json:"factor"`
+		LowFrequencyFactor              float32 `json:"low_freq_factor"`
+		HighFrequencyFactor             float32 `json:"high_freq_factor"`
+		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
+
+		factors ropeFactor
 	} `json:"rope_scaling"`
 	RMSNormEPS       float32 `json:"rms_norm_eps"`
 	LayerNormEPS     float32 `json:"layer_norm_eps"`
@@ -37,12 +44,11 @@ type llama struct {
 	HeadDim          uint32  `json:"head_dim"`
 }

-var _ Converter = (*llama)(nil)
+var _ ModelConverter = (*llamaModel)(nil)

-func (p *llama) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
-	kv["general.name"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize

 	kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	if p.RopeScaling.Type == "linear" {
 		kv["llama.rope.scaling.type"] = p.RopeScaling.Type
 		kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
+	} else if p.RopeScaling.RopeType == "llama3" {
+		dim := p.HiddenSize / p.NumAttentionHeads
+		for i := uint32(0); i < dim; i += 2 {
+			factor := cmp.Or(p.RopeScaling.Factor, 8.0)
+			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
+			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+
+			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
+			lambdaLow := float32(original) / factorLow
+			lambdaHigh := float32(original) / factorHigh
+
+			lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
+			if lambda < float64(lambdaHigh) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
+			} else if lambda > float64(lambdaLow) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
+			} else {
+				smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
+			}
+		}
 	}

 	if p.NumKeyValueHeads > 0 {
@@ -93,17 +120,26 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
+
+	if p.RopeScaling.factors != nil {
+		out = append(out, llm.Tensor{
+			Name:     "rope_freqs.weight",
+			Kind:     0,
+			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
+			WriterTo: p.RopeScaling.factors,
+		})
+	}
+
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "attn_q.weight") ||
-			strings.HasSuffix(name, "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
+			strings.HasSuffix(t.Name(), "attn_k.weight") {
 			t.SetRepacker(p.repack)
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *llama) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *llamaModel) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@@ -128,21 +164,19 @@ func (p *llama) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		// mixtral
-		"block_sparse_moe.gate", "ffn_gate_inp",
-	).Replace(n)
+	}
 }

-func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "q_proj.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "k_proj.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -0,0 +1,169 @@
+package convert
+
+import (
+	"cmp"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type llamaAdapter struct {
+	AdapterParameters
+	NumAttentionHeads uint32 `json:"num_attention_heads"`
+	NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
+}
+
+var _ AdapterConverter = (*llamaAdapter)(nil)
+
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "llama"
+	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
+	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
+
+	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
+
+	return kv
+}
+
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repackAndTranspose)
+		} else {
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    shape,
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *llamaAdapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		return data, nil
+	}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
+
+func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	}
+
+	if heads > 0 {
+		if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+			return nil, err
+		}
+
+		if err := n.T(0, 2, 1, 3); err != nil {
+			return nil, err
+		}
+
+		if err := n.Reshape(dims...); err != nil {
+			return nil, err
+		}
+
+		if err := n.Transpose(); err != nil {
+			return nil, err
+		}
+	}
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -9,16 +9,14 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type mixtral struct {
-	llama
+type mixtralModel struct {
+	llamaModel
 	NumLocalExperts    uint32 `json:"num_local_experts"`
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-var _ Converter = (*mixtral)(nil)
-
-func (p *mixtral) KV(t *Tokenizer) llm.KV {
-	kv := p.llama.KV(t)
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
 		kv["llama.expert_count"] = p.NumLocalExperts
@@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
 		})
 	}

-	return append(out, p.llama.Tensors(ts)...)
+	return append(out, p.llamaModel.Tensors(ts)...)
+}
+
+func (p *mixtralModel) Replacements() []string {
+	return append(
+		p.llamaModel.Replacements(),
+		"block_sparse_moe.gate", "ffn_gate_inp",
+	)
 }

 type experts []Tensor
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -11,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type phi3 struct {
-	Parameters
+type phi3Model struct {
+	ModelParameters
 	NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 	NLayers           uint32  `json:"n_layers"`
 	HiddenSize        uint32  `json:"hidden_size"`
@@ -35,12 +35,11 @@ type phi3 struct {
 	SlidingWindow                 uint32  `json:"sliding_window"`
 }

-var _ Converter = (*phi3)(nil)
+var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
-	kv["general.name"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
 	kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
 	kv["phi3.feed_forward_length"] = p.IntermediateSize
@@ -69,13 +68,12 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once

 	out := make([]llm.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasPrefix(name, "blk.0.") {
+		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
 				out = append(out, llm.Tensor{
 					Name:     "rope_factors_long.weight",
@@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *phi3) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *phi3Model) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-	).Replace(n)
+	}
 }

 type ropeFactor []float32
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -1,7 +1,9 @@
 package convert

 import (
+	"bytes"
 	"crypto/sha256"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"flag"
@@ -13,6 +15,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strings"
 	"testing"

 	"golang.org/x/exp/maps"
@@ -20,6 +23,12 @@ import (
 	"github.com/ollama/ollama/llm"
 )

+type tensorData struct {
+	Offsets []int  `json:"data_offsets"`
+	Type    string `json:"dtype"`
+	Shape   []int  `json:"shape"`
+}
+
 func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	t.Helper()

@@ -29,7 +38,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	}
 	defer f.Close()

-	if err := Convert(fsys, f); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
 		t.Fatal(err)
 	}

@@ -51,6 +60,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
+	actual := make(map[string]string)
+	for k, v := range kv {
+		if s, ok := v.(json.Marshaler); !ok {
+			actual[k] = fmt.Sprintf("%v", v)
+		} else {
+			bts, err := json.Marshal(s)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
+		}
+	}
+
+	for _, tensor := range tensors.Items {
+		sha256sum := sha256.New()
+		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
+		if _, err := io.Copy(sha256sum, sr); err != nil {
+			t.Fatal(err)
+		}
+
+		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
+	}
+
+	return actual
+}
+
 func TestMain(m *testing.M) {
 	var level slog.Level
 	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@@ -59,14 +96,18 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }

-func TestConvertFull(t *testing.T) {
+func TestConvertModel(t *testing.T) {
 	cases := []string{
 		"Meta-Llama-3-8B-Instruct",
+		"Meta-Llama-3.1-8B-Instruct",
 		"Mistral-7B-Instruct-v0.2",
 		"Mixtral-8x7B-Instruct-v0.1",
 		"gemma-2b-it",
+		"gemma-2-2b-it",
 		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
 		"Phi-3-mini-128k-instruct",
+		"all-MiniLM-L6-v2",
+		"gemma-2-9b-it",
 	}

 	for i := range cases {
@@ -82,29 +123,7 @@ func TestConvertFull(t *testing.T) {
 			}

 			f, kv, tensors := convertFull(t, os.DirFS(p))
-			actual := make(map[string]string)
-			for k, v := range kv {
-				if s, ok := v.(json.Marshaler); !ok {
-					actual[k] = fmt.Sprintf("%v", v)
-				} else {
-					bts, err := json.Marshal(s)
-					if err != nil {
-						t.Fatal(err)
-					}
-
-					actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
-				}
-			}
-
-			for _, tensor := range tensors.Items {
-				sha256sum := sha256.New()
-				sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
-				if _, err := io.Copy(sha256sum, sr); err != nil {
-					t.Fatal(err)
-				}
-
-				actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
-			}
+			actual := generateResultsJSON(t, f, kv, tensors)

 			expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 			if err != nil {
@@ -128,3 +147,330 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertInvalidTensorNames(t *testing.T) {
+	f, err := os.CreateTemp(t.TempDir(), "testmodel")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	tempDir := t.TempDir()
+
+	td := map[string]*tensorData{}
+	offset := 4096
+
+	td["model.layers.0.self_attn.q_proj.weight"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "F32",
+		Shape:   []int{4096, 4096},
+	}
+	td["blk.0.attn_q.weight"] = &tensorData{
+		Offsets: []int{offset, offset * 2},
+		Type:    "F32",
+		Shape:   []int{4096, 4096},
+	}
+	generateSafetensorTestData(t, tempDir, td)
+
+	err = ConvertModel(os.DirFS(tempDir), f)
+	if err == nil || !strings.HasPrefix(err.Error(), "duplicate tensor name") {
+		t.Errorf("expected error but didn't get one")
+	}
+}
+
+func TestConvertInvalidDatatype(t *testing.T) {
+	f, err := os.CreateTemp(t.TempDir(), "testmodel")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	tempDir := t.TempDir()
+
+	td := map[string]*tensorData{}
+	offset := 4096 * 14336
+
+	td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "I8",
+		Shape:   []int{4096, 14336},
+	}
+	td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
+		Offsets: []int{offset, offset},
+		Type:    "U8",
+		Shape:   []int{},
+	}
+	generateSafetensorTestData(t, tempDir, td)
+
+	err = ConvertModel(os.DirFS(tempDir), f)
+	if err == nil || err.Error() != "unsupported safetensors model" {
+		t.Errorf("expected error but didn't get one")
+	}
+}
+
+func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[string]*tensorData) {
+	data, err := json.Marshal(tensorData)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var buf bytes.Buffer
+
+	l := int64(len(data))
+	err = binary.Write(&buf, binary.LittleEndian, l)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = buf.Write(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fdata.Close()
+
+	_, err = fdata.Write(buf.Bytes())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ]
+}
+`
+
+	f, err := os.Create(filepath.Join(tempDir, "config.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(configData)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tokenizerData := `
+{
+}
+`
+
+	f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(tokenizerData)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestConvertAdapter(t *testing.T) {
+	type AdapterCase struct {
+		Name     string
+		BaseKV   map[string]any
+		Expected map[string]string
+	}
+
+	cases := []AdapterCase{
+		{
+			Name: "discollama",
+			BaseKV: map[string]any{
+				"general.architecture":          "llama",
+				"llama.attention.head_count":    uint32(32),
+				"llama.attention.head_count_kv": uint32(8),
+			},
+			Expected: map[string]string{
+				"general.architecture":          "llama",
+				"general.file_type":             "1",
+				"general.parameter_count":       "106496",
+				"general.type":                  "adapter",
+				"general.version":               "v0.2",
+				"adapter.lora.alpha":            "16",
+				"adapter.type":                  "lora",
+				"llama.attention.head_count":    "32",
+				"llama.attention.head_count_kv": "8",
+				"blk.31.attn_q.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_q.weight.lora_b":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_b":   "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
+			},
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.Name, func(t *testing.T) {
+			t.Parallel()
+
+			f, err := os.CreateTemp(t.TempDir(), "f16")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer f.Close()
+
+			tempDir := t.TempDir()
+			generateLoraTestData(t, tempDir)
+
+			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
+				t.Fatal(err)
+			}
+
+			r, err := os.Open(f.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+
+			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if _, err := r.Seek(0, io.SeekStart); err != nil {
+				t.Fatal(err)
+			}
+
+			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
+
+			keys := maps.Keys(c.Expected)
+			slices.Sort(keys)
+			for _, k := range keys {
+				if v, ok := actual[k]; !ok {
+					t.Errorf("missing %s", k)
+				} else if v != c.Expected[k] {
+					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
+				}
+			}
+		})
+	}
+}
+
+func generateLoraTestData(t *testing.T, tempDir string) {
+	offset := 4096 * 8 * 4
+
+	td := map[string]*tensorData{"__metadata__": nil}
+	td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset, offset * 2},
+		Type:    "F32",
+		Shape:   []int{8, 4096},
+	}
+	td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
+		Offsets: []int{offset * 2, offset * 3},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset * 3, offset*3 + 8*1024*4},
+		Type:    "F32",
+		Shape:   []int{8, 1024},
+	}
+
+	data, err := json.Marshal(td)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var buf bytes.Buffer
+
+	l := int64(len(data))
+	err = binary.Write(&buf, binary.LittleEndian, l)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = buf.Write(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// write some data for the tensors
+
+	ones := make([]float32, 4096*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	for range 3 {
+		err = binary.Write(&buf, binary.LittleEndian, ones)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	ones = make([]float32, 1024*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	err = binary.Write(&buf, binary.LittleEndian, ones)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fdata.Close()
+
+	_, err = fdata.Write(buf.Bytes())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+    "adapter_path": "adapters-test",
+    "batch_size": 8,
+    "config": "config-tiny.json",
+    "data": "../discollama-completion",
+    "grad_checkpoint": null,
+    "iters": 1000,
+    "learning_rate": 1e-05,
+    "lora_layers": 1,
+    "lora_parameters": {
+        "rank": 8,
+        "alpha": 16,
+        "dropout": 0.0,
+        "scale": 2.0
+    },
+    "lr_schedule": null,
+    "max_seq_length": 2048,
+    "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
+    "resume_adapter_file": null,
+    "save_every": 100,
+    "seed": 0,
+    "steps_per_eval": 200,
+    "steps_per_report": 10,
+    "test": false,
+    "test_batches": 500,
+    "train": true,
+    "use_dora": false,
+    "val_batches": 25
+}
+`
+	f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(configData)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -35,7 +35,9 @@ const (
 )

 func (t tensorBase) Kind() uint32 {
-	if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
+	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
+		t.name == "token_types.weight" {
+		// these tensors are always F32
 		return 0
 	}

@@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {

 type repacker func(string, []float32, []uint64) ([]float32, error)

-func parseTensors(fsys fs.FS) ([]Tensor, error) {
+func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
-		Func    func(fs.FS, ...string) ([]Tensor, error)
+		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
+		{"adapters.safetensors", parseSafetensors},
+		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
@@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
 		}

 		if len(matches) > 0 {
-			return pattern.Func(fsys, matches...)
+			return pattern.Func(fsys, replacer, matches...)
 		}
 	}

--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -4,10 +4,12 @@ import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"io/fs"
 	"slices"
+	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
@@ -20,7 +22,7 @@ type safetensorMetadata struct {
 	Offsets []int64  `json:"data_offsets"`
 }

-func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		f, err := fsys.Open(p)
@@ -47,8 +49,19 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 		keys := maps.Keys(headers)
 		slices.Sort(keys)

+		names := make(map[string]struct{}, len(keys))
+
 		for _, key := range keys {
 			if value := headers[key]; value.Type != "" {
+				// bitsandbytes quantized models are unsupported
+				if len(value.Shape) == 0 {
+					return nil, errors.New("unsupported safetensors model")
+				}
+				ggufName := replacer.Replace(key)
+				if _, ok := names[ggufName]; ok {
+					return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
+				}
+				names[ggufName] = struct{}{}
 				ts = append(ts, safetensor{
 					fs:     fsys,
 					path:   p,
@@ -56,7 +69,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 					offset: safetensorsPad(n, value.Offsets[0]),
 					size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 					tensorBase: &tensorBase{
-						name:  key,
+						name:  ggufName,
 						shape: value.Shape,
 					},
 				})
--- a/convert/reader_torch.go
+++ b/convert/reader_torch.go
@@ -3,12 +3,13 @@ package convert
 import (
 	"io"
 	"io/fs"
+	"strings"

 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 )

-func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		pt, err := pytorch.Load(p)
@@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
 			ts = append(ts, torch{
 				storage: t.(*pytorch.Tensor).Source,
 				tensorBase: &tensorBase{
-					name:  k.(string),
+					name:  replacer.Replace(k.(string)),
 					shape: shape,
 				},
 			})
--- a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
+++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
@@ -0,0 +1,3 @@
+{
+  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
+}
--- a/convert/testdata/all-MiniLM-L6-v2.json
+++ b/convert/testdata/all-MiniLM-L6-v2.json
@@ -0,0 +1,124 @@
+{
+  "general.architecture": "bert",
+  "general.file_type": "1",
+  "general.quantization_version": "2",
+  "bert.attention.causal": "false",
+  "bert.attention.head_count": "12",
+  "bert.attention.layer_norm_epsilon": "1e-12",
+  "bert.block_count": "6",
+  "bert.context_length": "512",
+  "bert.embedding_length": "384",
+  "bert.feed_forward_length": "1536",
+  "bert.pooling_type": "1",
+  "tokenizer.ggml.model": "bert",
+  "tokenizer.ggml.padding_token_id": "0",
+  "tokenizer.ggml.unknown_token_id": "100",
+  "tokenizer.ggml.cls_token_id": "101",
+  "tokenizer.ggml.seperator_token_id": "102",
+  "tokenizer.ggml.mask_token_id": "103",
+  "tokenizer.ggml.token_type_count": "2",
+  "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
+  "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
+  "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
+  "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
+  "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
+  "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
+  "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
+  "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
+  "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
+  "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
+  "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
+  "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
+  "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
+  "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
+  "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
+  "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
+  "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
+  "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
+  "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
+  "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
+  "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
+  "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
+  "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
+  "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
+  "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
+  "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
+  "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
+  "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
+  "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
+  "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
+  "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
+  "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
+  "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
+  "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
+  "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
+  "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
+  "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
+  "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
+  "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
+  "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
+  "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
+  "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
+  "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
+  "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
+  "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
+  "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
+  "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
+  "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
+  "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
+  "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
+  "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
+  "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
+  "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
+  "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
+  "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
+  "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
+  "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
+  "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
+  "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
+  "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
+  "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
+  "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
+  "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
+  "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
+  "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
+  "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
+  "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
+  "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
+  "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
+  "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
+  "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
+  "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
+  "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
+  "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
+  "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
+  "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
+  "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
+  "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
+  "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
+  "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
+  "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
+  "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
+  "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
+  "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
+  "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
+  "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
+  "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
+  "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
+  "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
+  "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
+  "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
+  "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
+  "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
+  "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
+  "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
+  "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
+  "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
+  "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
+  "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
+  "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
+  "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
+  "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
+  "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
+  "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
+}
--- a/convert/testdata/gemma-2-2b-it.json
+++ b/convert/testdata/gemma-2-2b-it.json
@@ -0,0 +1,312 @@
+{
+  "general.architecture": "gemma2",
+  "general.file_type": "1",
+  "general.quantization_version": "2",
+  "gemma2.block_count": "26",
+  "gemma2.context_length": "8192",
+  "gemma2.embedding_length": "2304",
+  "gemma2.feed_forward_length": "9216",
+  "gemma2.attention.head_count": "8",
+  "gemma2.attention.head_count_kv": "4",
+  "gemma2.attention.key_length": "256",
+  "gemma2.attention.value_length": "256",
+  "gemma2.attention.layer_norm_rms_epsilon": "1e-06",
+  "tokenizer.ggml.model": "llama",
+  "tokenizer.ggml.add_bos_token": "true",
+  "tokenizer.ggml.add_eos_token": "false",
+  "tokenizer.ggml.bos_token_id": "2",
+  "tokenizer.ggml.eos_token_id": "1",
+  "tokenizer.ggml.padding_token_id": "0",
+  "tokenizer.ggml.unknown_token_id": "3",
+  "tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8",
+  "tokenizer.ggml.token_type": "8d40143b3477df77beea4139420335ede458bf5e14102f01b0170197b55da8d8",
+  "tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda",
+  "token_embd.weight": "64a9d30707e659e2e673656d71f5aef7a9fb9fd83bb9a77558dfc5abbe218a05",
+  "blk.0.attn_k.weight": "d8b4437c5edb3cddf6af9987038e1bb2b191c4f0fce0e160d2abace717f5d5d7",
+  "blk.0.attn_norm.weight": "1eb73e3f7aa8e502f6ca31cd19efbb8e4fd9a89692e13e48ac8205545a7fa7e8",
+  "blk.0.attn_output.weight": "39e7b78e57d356a22dd89ce1c4d7163b970712ba756545e1703f97866cd2192e",
+  "blk.0.attn_q.weight": "795058e23b6109febd9d55c89e1eebe6af0714ec8c56fd86a160876a6135ffe8",
+  "blk.0.attn_v.weight": "0cd6e583d1887c020472e961bbb113fe5a0d23ae2f1c2c876fc366cdb7692b52",
+  "blk.0.ffn_down.weight": "51eb4d962189e945a84e94e0dc1aad3f8f90cc1a11e18029670afcd0ea0acb1b",
+  "blk.0.ffn_gate.weight": "9811a29b8ad48432925897ab21dfcb13c5cbd372aeccbbefca9b7866883b4ce3",
+  "blk.0.ffn_norm.weight": "92cbf4652ef503c1de5b10f2be00b3fcf00100980cb3baa8f3013a8d8bf3d851",
+  "blk.0.ffn_up.weight": "af87de21746879483ed1b374cdd76b19ba11ca2b6dbb1beba98efdf3be3e8077",
+  "blk.0.post_attention_norm.weight": "32e135f1f258ffe407018899e39af1725d59d66d60022b9a21575ba160e0357a",
+  "blk.0.post_ffw_norm.weight": "ba286f5ac11b07fbc986173708c66f1920427be5a6d108af38fa0a837c1c8eb6",
+  "blk.1.attn_k.weight": "51584435552051f7fade76beca582b3f7190cf7fc07adcf527c2774d4b1c3901",
+  "blk.1.attn_norm.weight": "6833104c7fbf35a7e799ae56c262b97fffa14789642aee14381b25acd21ed80a",
+  "blk.1.attn_output.weight": "14c39481369087bf292ac9a3ab2ef166f9fe376a9f90c246653213ef264febdc",
+  "blk.1.attn_q.weight": "443f64ae2229f857c69d6bebb7800b685786cb77884c3ae19d4286aeed081325",
+  "blk.1.attn_v.weight": "0df482de2038f1e4c8a7733ac0ddb69ad90759dab5968b942af0155588de4c4a",
+  "blk.1.ffn_down.weight": "66f30763a8bbbcaea609a0087ed75fadb5e771c06378dd2cea94cf17e492e8cf",
+  "blk.1.ffn_gate.weight": "a7151bff00a545fa18b2c92dcd2a14572ccf9beb957a6c494f1374e8ebe174c9",
+  "blk.1.ffn_norm.weight": "e197d71ea11b5276bc0167d2663b88089b3ff42b47ba91e85f6c5d95f6306435",
+  "blk.1.ffn_up.weight": "57c182e0b14cccd1350d388f0c616991702e74281db54637451b70f4ccc24f9b",
+  "blk.1.post_attention_norm.weight": "3c56f837168d784c2d8bac247c130bdca6610c095c8da4558c536ccad7605609",
+  "blk.1.post_ffw_norm.weight": "d2a51d320fd01069dd7ccaa7082f16a7faeb671885607d7900b10a89c354d0fa",
+  "blk.2.attn_k.weight": "bc103c818192de7ce36caaf89dc117be4df13fb902e0bd9a23c64edace5df9b6",
+  "blk.2.attn_norm.weight": "0f2503aa126083a5d6ac72481be1ef66c6014705b573682b35bd864e4749a3d5",
+  "blk.2.attn_output.weight": "05fcd4a1226e482f91803a266f72caca887a93e63c2d2ba5611ab3c68d38743a",
+  "blk.2.attn_q.weight": "6a10b5c2fd423d1e4c4fd60fa8c154a0159b6b2501ea79cae2ef19f45a674e5e",
+  "blk.2.attn_v.weight": "3cf891945a1f8ae7cc908a5c6b729ff5b70f4436c5ffdbf245cc0ed4cc19cd1b",
+  "blk.2.ffn_down.weight": "ea204fd04e0d2fc728a9861a459216bbfec629c152004ba625f52cd8837bd51e",
+  "blk.2.ffn_gate.weight": "3a3518729f1b8b64a82b8792f33987db5418fdb094be0263c68f146a5c38de54",
+  "blk.2.ffn_norm.weight": "754ede678b725de41a34b82f0edf7688b5c065be7c0d46df6f7ad9430d986884",
+  "blk.2.ffn_up.weight": "ffdcb88439f5828ffbd9fc844b03ff91637b790b9838097258cc3ae75935720c",
+  "blk.2.post_attention_norm.weight": "4b3f53b7ba26e8c36b2dfda3b7e5fc4b1065257cefdea235fc7df9af130ac2fd",
+  "blk.2.post_ffw_norm.weight": "e550369e26b8485e2b54ad34b34bc98af5494287dcc513c2c39cf1eaa5b89d07",
+  "blk.3.attn_k.weight": "89f24ea450e37d9e95757651a83205c085d81b354ee9489dd6310a391d8409f3",
+  "blk.3.attn_norm.weight": "24e2ea662b7cb822b4ca5cd61bc17f2709f406d990ec3b4a0dac1cc112db45cf",
+  "blk.3.attn_output.weight": "ac4dad69473c6e3fac56669212cadd8c34ecc5973d945972e974d94805334967",
+  "blk.3.attn_q.weight": "b6a9c9a7d4722b9096631c65de62228dfddca6e26edfe6af7fce01e116ef0f4c",
+  "blk.3.attn_v.weight": "f272a960a40093942309bc342a379984cbacec2d7bc64428db3f64e6b1887ed4",
+  "blk.3.ffn_down.weight": "c0188ba50d8228805982029c277fc0e87aa57473b8363037c648f6d006ff828a",
+  "blk.3.ffn_gate.weight": "a04aec1561ee6c0fbb18c3db49dc62fb533619cf697fd548cbf2279761aaec3b",
+  "blk.3.ffn_norm.weight": "bc053837d44087ec05eb5d9458357b2a5be787789b19cdbbdc694b57697f99a6",
+  "blk.3.ffn_up.weight": "b3ce8b274f20796d3b1a7c08ba27a919066f9de89a782faa544c4a8d6bea1382",
+  "blk.3.post_attention_norm.weight": "9c922dee7a7df5667289e2788e60170238239cee2dfdbbd9e435763f9f416718",
+  "blk.3.post_ffw_norm.weight": "b682544ac953ad2e0b49027ed8916f2e9d1aba5d1587bb4127ac703570c7a03a",
+  "blk.4.attn_k.weight": "143b0cbb4b787b95c2b6212374410e32173ccef2adb914908a2f89a7916de512",
+  "blk.4.attn_norm.weight": "5668f60491b780273745192662d02c9a92a4f692b29d16aa0bbc7413fec4f85b",
+  "blk.4.attn_output.weight": "b9f2bdb68be1e0cf66dd19f8fa2afb105910ad2ef394864cb32cea8f8944e0d5",
+  "blk.4.attn_q.weight": "ddcf1343dafbc2dfcd0b8741225af22fe4b54b2becce29240bd01c34265d126c",
+  "blk.4.attn_v.weight": "6dc7074366e7ed52d9f48c594dcc85bef738e096276cb99d28228c89eecc5b9c",
+  "blk.4.ffn_down.weight": "30334ffc59ce343cf2a1b973174acb7722823463adc07e19a99bd0f404bc9906",
+  "blk.4.ffn_gate.weight": "890f7c8af208d63b28db52c4b8c16c2288a382d87ff5a6a6d6b0a5b3bf27e6cd",
+  "blk.4.ffn_norm.weight": "ff0316cc7847221eb86a90c1ab441d4ee61553d410c66414a7755021b3b12448",
+  "blk.4.ffn_up.weight": "6af97d113f91564c636734f215e25ee602d48eb045458f300b3ec7582be0f41d",
+  "blk.4.post_attention_norm.weight": "69438f231e105e68216b078bdeb35a7cdc8b12c4e2845e18ecf4c8d361d6a321",
+  "blk.4.post_ffw_norm.weight": "0fd535da78bcf2b32c95b05b2b83dc49817393765be90d8cc1ed3d56f47b68ec",
+  "blk.5.attn_k.weight": "0166eb3c6d20dcf3d3c169e94caa8dee057535bb525e29f698fb6f8844f18a6c",
+  "blk.5.attn_norm.weight": "a7808f27f164023d5cde2be00fc23cac6c71aa0ddeb60bc23e12411b80087672",
+  "blk.5.attn_output.weight": "8b65b2027a0842b68c5308f91d6a31de9599d794157d77df8418b19f9e0d9334",
+  "blk.5.attn_q.weight": "966bc626ef2c2394d872087a41c126bb1b67d1d5f6de920204ef5e5b16c34003",
+  "blk.5.attn_v.weight": "9a362aef3f4437fbf0ef6e1ba785f3329c3db2960f93fe36547d2795e9c254ea",
+  "blk.5.ffn_down.weight": "63e53541d34197720c06f297aa8142ac6b6eec002c7987b296f26e8b1400f931",
+  "blk.5.ffn_gate.weight": "d9591fdd32f783e0fc26e20d5d587ee8971ac8ae2e4c818c6eac1c125c7c7f37",
+  "blk.5.ffn_norm.weight": "677334cc60ecce3a7f4ab3acda15d359353d7358872f614ad8914e3780e9fc6e",
+  "blk.5.ffn_up.weight": "a63764110e1c655ffbd55af0669b2dfe4cc29d0e198d33a8e5426461b08a85f7",
+  "blk.5.post_attention_norm.weight": "c55499f859b2c0a7f5cabceaae47309a5ad38bc29d0f4a8db81f1357023162a9",
+  "blk.5.post_ffw_norm.weight": "82752754665f842418f3e302cb5f43d1e0504dcd124c4b8ddb77018b2c793837",
+  "blk.6.attn_k.weight": "e20a5f0d6c807273c8d491439566b428497ac02097cf0aa55e33748c28e14be6",
+  "blk.6.attn_norm.weight": "2c6ba42fd3c73d72073ced03a32dd28d70a89ed9bbbc8fea1ba03a7ade951e6c",
+  "blk.6.attn_output.weight": "4de7c5c2f4a133a266e17ed8c14c52959466b54cc7ab9e19f789a33b4850f284",
+  "blk.6.attn_q.weight": "56462d921800e6b8cd2213fef04c4ff16d728905cb2f4c58e966d0a053a3b0ae",
+  "blk.6.attn_v.weight": "b758dcbff769d6240c2245ede1dbc62c4170a67c77458e866312589220fe29af",
+  "blk.6.ffn_down.weight": "582247fb3c2bf687cbe9413fe18d18ad47bef4b65df7d78905e10335c6134764",
+  "blk.6.ffn_gate.weight": "3035444d5286aefb7a6d04e55bc27e1fac7cf895cd5be02319a431b8e047b4ae",
+  "blk.6.ffn_norm.weight": "e582d24c66e01b96faa20ce6adfda3d8583b11e809bff89969927398175e369a",
+  "blk.6.ffn_up.weight": "6f4b7bbfedeacf61a4866ae0616c4ba6c9e856662e8f00ae6aaec7f52c53e7b4",
+  "blk.6.post_attention_norm.weight": "8fe51b50bd677d21586aecab0b565c4bf9fa68ad50bfe366f45e8fea3c657ca8",
+  "blk.6.post_ffw_norm.weight": "81ba3cb4c2bf5c546b86855b7a885d3fafededc67eb3a35cd3598b03c9e26e65",
+  "blk.7.attn_k.weight": "2e044179cdcae0946708c86bfea7aa0391e1f7e2a09b33fca035d384cc3ca758",
+  "blk.7.attn_norm.weight": "94b48c546b046803c60e75a3acb17a356b710735989938021b565f68df9b4985",
+  "blk.7.attn_output.weight": "65709b4ad7a581f4d75793d39d4032a359f6bcc0c3835205242a0b99e5b66824",
+  "blk.7.attn_q.weight": "8ded993c95d1f7caf201ceb6fa035cd6ed6d351b50b999fa9355dfee9486cb5b",
+  "blk.7.attn_v.weight": "c92d5e2d2d48397542bc03bea25bf39154075e66c5bb1ead85188505aa04ae91",
+  "blk.7.ffn_down.weight": "e8ba8fb57208805ef1dc23cd7c86e9a2d1fb7c52c3940d292cd5bb2eb24b3fac",
+  "blk.7.ffn_gate.weight": "f0f06d6a2e06c5ac252083bc61d05c814e6289d3f4e4a87d2f06918254c02c36",
+  "blk.7.ffn_norm.weight": "ebf8ef775f72624148e09d68a4332187a7a5020c521fe0623da1cd3485ad33e0",
+  "blk.7.ffn_up.weight": "a554adc4fc7122c247c77670e169916ba1794c787b5be30a2b36705138f1f746",
+  "blk.7.post_attention_norm.weight": "3aa6bc21d85c3a0c12b964e82b12feaedfdd13130c3cd2229228e24e0967ebdf",
+  "blk.7.post_ffw_norm.weight": "508bc7b19ee8ff08f0007c890133a462fc57c7e72b16ee8f6dd64def264ef876",
+  "blk.8.attn_k.weight": "363c8e74056642fe9e7c2f3f9769d57319cd3fa0a6022810189ab8d894322885",
+  "blk.8.attn_norm.weight": "685b49a1f1acb169f4df0bdd8e3de6943f3033cebad14b898a72000595610d92",
+  "blk.8.attn_output.weight": "7bde571e4efef1c6a6143f0526721dfb59e0a0ea0e1a3616a322b2eb937efa48",
+  "blk.8.attn_q.weight": "fc993dbc1074c28a0e1d85e5ab2f4ea6a9c6c1affe7ee56027000a275daed9b6",
+  "blk.8.attn_v.weight": "281e8791d3aef9b3864f1cb054da0ae0c2fef4ce0a58b1bad8bc136b2fa0f62b",
+  "blk.8.ffn_down.weight": "b1164a2578a7f87ed99c2bbc76c5dfbbbc6a1a803605391acc3f320fc989ffd7",
+  "blk.8.ffn_gate.weight": "6b39a3b3aaaa79aee61416b54d62160b9258042650e61c6b47bc77c2dd17daf3",
+  "blk.8.ffn_norm.weight": "17ea1362c72da27f12bc936500492035bdef3fd8f940cb12b57f37d42ba8ecb1",
+  "blk.8.ffn_up.weight": "bc3a7c47afc440d2bdf8fbe9ddf2c9220467472c60c8b4ded8c0f181470ec96c",
+  "blk.8.post_attention_norm.weight": "5c506204e00411ef9c8b4134d40eedcc19fffe68dd0af7d7cc49dcabf2dfac7e",
+  "blk.8.post_ffw_norm.weight": "002faec235c3678864e2901eed275ce4e9dc229164a91c9cd4c965142ba62305",
+  "blk.9.attn_k.weight": "0bab39d8c237f1b6d0010db40467142625a9e6f2e0e4c49a56c12b41e4e0b1fa",
+  "blk.9.attn_norm.weight": "de5f38e873b17f07aa7598831b89cc1cae2c9bc3eb2e042ee9af059d2563e84e",
+  "blk.9.attn_output.weight": "8a8184702c25a62df9ff309c0c7badc8587208523b2be3e8fa90ce7080573e6f",
+  "blk.9.attn_q.weight": "7c961b2431b09ddf95377acd07201cb91bf13d9cd3ae0f2c25c7d6a0358d9f50",
+  "blk.9.attn_v.weight": "e22d240cb4743067033e659cbf210ebe2ebbab3e1dea6ccbe5eaa982382ca038",
+  "blk.9.ffn_down.weight": "a426f81210f03d6ad53277416e1fdcdf37d8065e4817613edaf6c67a343426be",
+  "blk.9.ffn_gate.weight": "a82eba825cb77b8e64f85ff99ede2fc71bc9b01751eeb17e9e6c246ee12ea62e",
+  "blk.9.ffn_norm.weight": "1a97f9b1302a3a326d534c5c3fed2db6db0ae45fd0edd381a3e4fc1c75d81030",
+  "blk.9.ffn_up.weight": "5f20bac2bbf03bb42adb92fbf99561651e1edda57e0b61935ac7f6c08c0ed7cb",
+  "blk.9.post_attention_norm.weight": "9f9866d13988e1946b1e1c80d9374a92a6e3be33748f8eaed3e126d1e1a4c796",
+  "blk.9.post_ffw_norm.weight": "a6896dbf698db4dbbe5dbf12417d4fd80e9cad0c539c858892ec0aa5b046bb58",
+  "blk.10.attn_k.weight": "ca8446e5d21ecd4e6a70dca8d321be480be4fba94d70cba065205436feb44270",
+  "blk.10.attn_norm.weight": "4f41fe290e8f21f63b82151b6cce94bf7318d121468816b0c58af0ff7c1658ab",
+  "blk.10.attn_output.weight": "c626d2e9681c5c941bbde43dddfae1a8d4986bf2be4470857bc8e8bd7f869044",
+  "blk.10.attn_q.weight": "1e61b210a13a429977325cf15d781ab77d604cfa862f4270329cbd94237d5835",
+  "blk.10.attn_v.weight": "8ff8d3e3f058ec3b35ada1057f2ed59c06494d0e0be6a8dc3ff9edf9f0e1a115",
+  "blk.10.ffn_down.weight": "bcebc04219f8081a5f483e58103c0ddbbbc631a0a54fd6dd9d55778e041f70ee",
+  "blk.10.ffn_gate.weight": "7a23a1e620ef871384ddf9611ccdcfb893fbf013cc203ac8e72f745420f1eea0",
+  "blk.10.ffn_norm.weight": "e3a375e43c349a1c6c66c22328e513cc1af3137fe839e43dc8e9be2f65914fd7",
+  "blk.10.ffn_up.weight": "5d182e7c94369194fca5f19cbbe668a999911e57f3d363bc7fb6088428700cb9",
+  "blk.10.post_attention_norm.weight": "b841c6308296e8984f3c5f549c6e3a242f4b3e19141e1f54cc08de9c46759c09",
+  "blk.10.post_ffw_norm.weight": "9d66fa05b5c940208f634f5053d809094c99a2a10a1d1e8847c8281fbd99fb49",
+  "blk.11.attn_k.weight": "14adf24ebb2bb17b336ca81cec3e690fd854782f4440ca6c66cc1d7e7bf1c850",
+  "blk.11.attn_norm.weight": "2d2213f311f50414702b5b34f22aafb9d9a0b6787243e7578562583dc40ad195",
+  "blk.11.attn_output.weight": "de1f14cc2a7fff00cf11b229f0576999205f17b9536e97abc9d6de3cc79a7884",
+  "blk.11.attn_q.weight": "2bcc5c147524003109ece0be08b89ac8b25baa71416ffa76573c6c052ffc6eea",
+  "blk.11.attn_v.weight": "2e6ab8573070c22dc1e0d7aebe4d52123226dacf7822dcce06fadbb38fb036a4",
+  "blk.11.ffn_down.weight": "1b86902f4e36868421e5228b9445051f8290b292df22a6d1af836dcecc1f25c3",
+  "blk.11.ffn_gate.weight": "e756e8081bd0a16aea4a9ef5076ad102113524f7a3d50a3a77aaa7f7938b63e8",
+  "blk.11.ffn_norm.weight": "6913887267be227cf9d1991a3dd8db2e7e74bb9b5fbdfcb9ac954fd7d7b95b3b",
+  "blk.11.ffn_up.weight": "619a3ac0609ebdf42c3fb2b6e4b1db48df79e6dd8418d7ab8f1bbff13d8a6a50",
+  "blk.11.post_attention_norm.weight": "e4b4ba92cef7b6a78407e8ab1b0307d47dac6c3df7b6817e28038317ff662d7e",
+  "blk.11.post_ffw_norm.weight": "40aceeec58cb855f0c158c9cc217168fcd5d0e735567d587217b1d78df17bc5f",
+  "blk.12.attn_k.weight": "c54c5a4d4892522022d1aa2204cfc624f0b4042caa536e678967316293fe5cb1",
+  "blk.12.attn_norm.weight": "7cd2ef58298569ffdf244d9b390f3917245276c8206e5780af5f96d8c0bbb446",
+  "blk.12.attn_output.weight": "85495ef9cc8b3deb21f741bde463ff6493acae2be51f02ecdeef952cbdec3375",
+  "blk.12.attn_q.weight": "d19383f83fd119bfb8c0280c9515705c11d8e7d502019fcf8f49efeef0d106d0",
+  "blk.12.attn_v.weight": "869ac669ba49531d9128892a0e27cef15de508ff40cdf80cc1681dde50d09204",
+  "blk.12.ffn_down.weight": "578f39f8f9fc2f09138afc884a952d7cc3a9a31de4216acd10e88e19e0b75f8c",
+  "blk.12.ffn_gate.weight": "e29a0186bc6c4a0720246306e922d3a83f777dadcf4ac80bad468287031cc8b5",
+  "blk.12.ffn_norm.weight": "e1ee95c6584b5cb57fcf1db8ce2bcc03aff91eb389238c094a61c00dde93d1f2",
+  "blk.12.ffn_up.weight": "2a826f06d7cdfb3edc6ae250ff44363ef77a2a9cdf96313e23a331b99ebfa17d",
+  "blk.12.post_attention_norm.weight": "4bafc7699b948d5cbc0d3e09b418b06c6abc4651a61ada9609d9a2f21c7e5607",
+  "blk.12.post_ffw_norm.weight": "bbb8c34a7176bb1a49f9fe2bacca0bd26b673d52c0835b2e90fa11f2962f077f",
+  "blk.13.attn_k.weight": "ffeefccfe8255d1b694382012ff4134eee5fec9d9491c8d0ff0a13832d1a37e8",
+  "blk.13.attn_norm.weight": "35713726529e3887c4135a88e86e8a4d7270ba5b9f2d1ab462622fbf40a7cdce",
+  "blk.13.attn_output.weight": "0d60b7c5cd71190a9ef4b873b0f516be15447c32d83914db2794b14592b0b460",
+  "blk.13.attn_q.weight": "8296069e65bef794cefc61257fc65789b3cb22955e30f3df129205e5041b2222",
+  "blk.13.attn_v.weight": "ca0f4ab9d16a748fc643a5c0c7a19826a811bf2a4e7316a8c935d4bf0ce8abc6",
+  "blk.13.ffn_down.weight": "d5514e0c8e7b3ed1cbcc1605eb5be1733b6ab3514cf8a0508fc72f7d05ed8bcb",
+  "blk.13.ffn_gate.weight": "8108e517a82e08a3aefbbd267bfa50a1668f92a76273280ce8a6bc1f6dd61521",
+  "blk.13.ffn_norm.weight": "5fcb6132d2134bf1f835b904a99820fa501dbc57d2224129f7098bf3cabc1d36",
+  "blk.13.ffn_up.weight": "6d744b7cd390a3cae3aa350dd379b81246acd056a2259996b6aaadece8465ccc",
+  "blk.13.post_attention_norm.weight": "e08b14698912509790e9575b8676971fbb0a4d82d719367e3756c0d0c4ab8cc0",
+  "blk.13.post_ffw_norm.weight": "2b196e4450fc5f1e7367b2cf7fe33a15fe919fbcdd861d11002346f16e980535",
+  "blk.14.attn_k.weight": "120e5f48d7268dfd9ab5f4bc9cc57a7cec63ea9635f56b80d435eb22936e9483",
+  "blk.14.attn_norm.weight": "146367bcce4db72cc894419a2e0145a6f533507dd68e4739c10ee480308c401f",
+  "blk.14.attn_output.weight": "720fa0165e756876c5cb6ad9e2780dd910390933f3f8849e5add5da04266650b",
+  "blk.14.attn_q.weight": "f5183466f56219ca1aca52d8b82c2d966a4198fea40fdd6b39f4d8b06ca2a6dd",
+  "blk.14.attn_v.weight": "24f8ea3d5512cd37c43c8329cb0da0c90d1895aef763ac2dcee3fe5157ec50a2",
+  "blk.14.ffn_down.weight": "e29960965b384ae5ab3d898a4dbaa8fddd28fa0e477ac28bcac49dec12a5ac67",
+  "blk.14.ffn_gate.weight": "6d0d6a74bfe9692e8f8eedff0fc34fc4fa1c8687794f35f2e2b033ab2d7510b8",
+  "blk.14.ffn_norm.weight": "f7036c1a9a71e046c9d2af16e9218fda5dbb0f7241ab44747abed1f0f9d602ca",
+  "blk.14.ffn_up.weight": "7d69ea1424007ffc9c12247dd0308c616e93ac02a59ec341cfa48f92d6ce3b10",
+  "blk.14.post_attention_norm.weight": "65b9712834d9445d4236bec362f3fb795c20d60c541b3dc6dbb7914d9b493e41",
+  "blk.14.post_ffw_norm.weight": "9c6a8da2e4e437d5cfdf3b9097e9f8b64bf07946a048badec20f4d374613f38f",
+  "blk.15.attn_k.weight": "864bc618303a0e4ee67fb1d5e751de61e936cd51e96669dd86f8cd08f2305045",
+  "blk.15.attn_norm.weight": "f9f4187da6eeadc2fc5921d8fe669741697d16c13d71e4aaeb73b82f50dc577e",
+  "blk.15.attn_output.weight": "ce2419a0b097036b2a31f2f4ad731d5814bcc2ef4c511786e24471e5eefd273b",
+  "blk.15.attn_q.weight": "9539db5a970d11ebe99722d1e13fcd635e250033630811efe583d2f97778e4a9",
+  "blk.15.attn_v.weight": "1c834b48ccd88adaeabb7d8bcb6be0bcd6d5ac1354ce88fc28f19a1a96b81ab3",
+  "blk.15.ffn_down.weight": "bc1f97a65dde6fa2c1e5397afb612266944b343f2eaa868b635ddd25829f8a42",
+  "blk.15.ffn_gate.weight": "1b14529d57056b79037f6cb5008132e62cc35992353b38dda59572274623103b",
+  "blk.15.ffn_norm.weight": "9af77458de9ee55c66f93865759f9c2c398557f94f3fa8fa6af30543d7339cde",
+  "blk.15.ffn_up.weight": "41d524a26b61a9595816b4fd53cf57ef50a702e4ef32933ff6136dca9136a267",
+  "blk.15.post_attention_norm.weight": "c60a03cd0e63a7db5c80015e58e9b97ba2208caa19f66a6fef5c4447eca900ce",
+  "blk.15.post_ffw_norm.weight": "34f7f9f96769215bbc3d17084df091864aef96a6645b7d0b3b7d9bd92f1a4b0b",
+  "blk.16.attn_k.weight": "7e27240d9f3a8c6cf0f4a980113d43234f514eadc3e3e1792b86efb29ffb1a6d",
+  "blk.16.attn_norm.weight": "af798acc0899282a30448edec48223b3e8efda177090273e612d8eca5e377301",
+  "blk.16.attn_output.weight": "79df39a3709d3d53e84146291e0944a7a653d06705293d9ccb5648dceadb432c",
+  "blk.16.attn_q.weight": "db58a1c3b83ad294804e5fd7321005719e200659173466df5a52a182b80b7165",
+  "blk.16.attn_v.weight": "2af6d48cbaeb225b5c1a704f76abd89c8ab1521417695b112b4dcc2cbd39b74d",
+  "blk.16.ffn_down.weight": "fc1c813eb5e7da3d6194569d6cb21602fc6eff2dc8e1b0eb753f2d5df148189c",
+  "blk.16.ffn_gate.weight": "7a80bcbc42464bd55df4814a6edbd7b5c153e0428323bbe49de55e2d2add33e7",
+  "blk.16.ffn_norm.weight": "2041685ee926d30f3f2ae4ec35b5688f1cd834167a6359a7d4057eac804c58b2",
+  "blk.16.ffn_up.weight": "8da4b718973ac1d43b928829bc45e062fd101984d6c98dd825bd7c5d08ebfbe3",
+  "blk.16.post_attention_norm.weight": "975c48fe680a6167438a106140a8872eee7765191f152d80e3b8ddf47693e095",
+  "blk.16.post_ffw_norm.weight": "4de2d4d483acfe4fc77860ea929025df2f4e15c10729413f36a18c94eaa6d689",
+  "blk.17.attn_k.weight": "f937e61f0af8c4cd98ee742648eb60e02e579683e21d421071295a3b70aebaad",
+  "blk.17.attn_norm.weight": "c3270583ed28b7e423f5b170c59113234f258169b93a867d9274f4c10b7cb115",
+  "blk.17.attn_output.weight": "b8c1150e81e685e539a5dcf2c19047a24eba2b281fabe166674b1d71ef4612ea",
+  "blk.17.attn_q.weight": "c255100ae2011e7dc7e3bf3bc3ccd96d859fbb98581cae993d7b82c1ba8e8b39",
+  "blk.17.attn_v.weight": "5830bb0a555984c6485348067f70b5d22ae337c011aa9248dac2ff4c95944551",
+  "blk.17.ffn_down.weight": "8ff9a7cccaa3776434a9d895aae4fb5c36c736bf2ec98784226b4c234940fbb0",
+  "blk.17.ffn_gate.weight": "1b52876739712831c272911533da206f407b46034a1a4ae8a88c1f96b6bd5747",
+  "blk.17.ffn_norm.weight": "d0e16ba5e87c91b545334e022058c7d03849665c3b1a6298771b656531366b66",
+  "blk.17.ffn_up.weight": "4dd6211d01dbebbe21052708eddc242b082a58b5f18ed16479e17987c1d3432e",
+  "blk.17.post_attention_norm.weight": "6f49c775c7417dade77ba8268a0f8441c1e5ec28b5d7e4dc5ed07a04d04600c8",
+  "blk.17.post_ffw_norm.weight": "b91a0bb2e6679e9c9be06ad323adae441d00a3d673efb19d7c4954be2aa84b27",
+  "blk.18.attn_k.weight": "22b565ace1b4da8b33865a58625be1d90beea9891f29686a69fa9cf7c93217db",
+  "blk.18.attn_norm.weight": "3e0160d7063c8753de65d2356a66648e47d921efdc5c917efb8209892120f8db",
+  "blk.18.attn_output.weight": "e3180f0bb4ca90b31e9b08158db38e332de62dfbaefe34aa94cc316409331e09",
+  "blk.18.attn_q.weight": "f3a5a83614c3ba7ea41cdd5b1b0819a241ee2a951a381ce4a9e001d3f700ed8f",
+  "blk.18.attn_v.weight": "f3350a5984fb951fc738adcf78147e6d812ff1c576670c460cafc99c253c1654",
+  "blk.18.ffn_down.weight": "9e9d09b13a33525e14bdaee6efc65c551ac7cf7680e534b940ab122a3a7c1ac9",
+  "blk.18.ffn_gate.weight": "ebaec8b4b578a2e8d815baac12f1675c208f80c68074d5a18288a2e1a60680ee",
+  "blk.18.ffn_norm.weight": "33e7687c53a242f2f8dc7093a491c97b18d4a5a8c14d183f02bd586a770f05aa",
+  "blk.18.ffn_up.weight": "78a1816662378ce56cc870e705174492781897b3afd2d4d97a51f10f2f2987c1",
+  "blk.18.post_attention_norm.weight": "a58dde3f12df3e94cbc27d87c8ea86f89af8a388a506446ff6758f05399b05fc",
+  "blk.18.post_ffw_norm.weight": "cebf90cc143577d483cca27b032dfd82031ee59bdf17c0e2cf60a0a3ad5bf996",
+  "blk.19.attn_k.weight": "4683375d0599ac9e2232196aae1e90af13a14cae26e865465de5c8e257bb2055",
+  "blk.19.attn_norm.weight": "f3eba936bfb1814bbcb0a1d62739eb66daac839df8c9c836fe0e94860df88525",
+  "blk.19.attn_output.weight": "51c0f01d38a9dcfe9bdbc4643576fab164c1d9e4b7168b7695c0ee55e6965667",
+  "blk.19.attn_q.weight": "28d15b69b8416f2e7ddc88fe381cb1e2ef2ad705fb1c268139ba96498cc74848",
+  "blk.19.attn_v.weight": "6860f1cd720638e63a981fa2c0b4db900129826bcb9823c9ddf9fb8b1b9f3383",
+  "blk.19.ffn_down.weight": "bc7f2d7827ee01c2dd41401c7b3b1700ad3a4ff620e8bb734f92630d342dcc7f",
+  "blk.19.ffn_gate.weight": "54d03ef69ba373fc410fbca8f1e34a565d58e4296d9a035ff7e48340b9c848e7",
+  "blk.19.ffn_norm.weight": "9178fc796a340ee6e8128ca74c0cb6203d1adbed6927af4e5ac7863da57affc7",
+  "blk.19.ffn_up.weight": "a77bd708026c6e83ad5c79c223278e74621bcf74a9641c7818d96b595daaad20",
+  "blk.19.post_attention_norm.weight": "ae94aa26f4c411bf9496a6fd4a6df64ee589ee1ae9a04b531d45acc95721e582",
+  "blk.19.post_ffw_norm.weight": "9ad210700edeef12133bdcff04bf1c7f62b49f6f4a9ba483c7cdc59857c24a5c",
+  "blk.20.attn_k.weight": "e35bce1e9f4a7a09ef34721f57ea38cfca68c272f52d923fe50af8308f66cfaa",
+  "blk.20.attn_norm.weight": "644800f6926fd34f233795c4dec1151a295d2138ca8cac33e3e48167d26f8b41",
+  "blk.20.attn_output.weight": "8d3758cd236471741e1ad66c0710cb79077dc8c7a3a292d35bc551c0c5abe627",
+  "blk.20.attn_q.weight": "c333b1f0f6f956b5d73891df10b1a0321e55fc31c40d623a24e1f52caa6a998b",
+  "blk.20.attn_v.weight": "8562b418d0c4868a050fb19fa3fcaf50a8cf1c669f537d666c80c7b3a04714e1",
+  "blk.20.ffn_down.weight": "97efb608ac44cc804198faec3ee66eafe56ced6b7ca5359700c6f1df75b7205e",
+  "blk.20.ffn_gate.weight": "5c61151d86f28415c73c73d90ec088c646cbe5c1640197caf58eb501ba7db293",
+  "blk.20.ffn_norm.weight": "24bbe0a701afd4bbeea65b3edde712b3cbb2281043bbc43dbf250582453116ed",
+  "blk.20.ffn_up.weight": "e170cf68e249566aa99eb6f6b265679bf9a5a6b76830ba24e7e130c2515910c4",
+  "blk.20.post_attention_norm.weight": "e092d751cfe20dbf2d348358f3b38397bd83e4ed94d6bbaa6bbaddcd902b2ac4",
+  "blk.20.post_ffw_norm.weight": "219a18a47dcba76e669e4322223a5a9227bd3db1de3fbd3d3cfb22e54a783c5a",
+  "blk.21.attn_k.weight": "c3a095ebddb42c63824f1c98da65263dc88e4d790a26aa1632840b44f5cc7cb1",
+  "blk.21.attn_norm.weight": "ef8bbaded5fbc45ad9cf3985ae02174524e7090fe6362811124f942ef643bec7",
+  "blk.21.attn_output.weight": "668f018aba72baac6252aa3ad58569ddd55ab751a0dd8d7bcc9fb9b6efb4bf53",
+  "blk.21.attn_q.weight": "e759c65663089f3bbbd51847934c185e680c82f1249065d5d487da638e519e6d",
+  "blk.21.attn_v.weight": "2ff57762686cf9ba1f5a6be76503454b97556ce67f4ac98254bd0562231197ba",
+  "blk.21.ffn_down.weight": "3fd106556fb721b1c28ae3f4026bc83eb1b08ed910f2ba5f466c6b5f327d91cb",
+  "blk.21.ffn_gate.weight": "338022d882f4b6619e8054a6fb909696fa3eef3013cf69b65c3cacdfc5b9e42c",
+  "blk.21.ffn_norm.weight": "1e77660c23a3f9653ee721a863d1960f773d87437cabc4dc0a6e17ee3d4e5e44",
+  "blk.21.ffn_up.weight": "7d31b20fbc2e6eba8f350f170069dc36f0cb12f68fbc4206ec5022a74085ebcb",
+  "blk.21.post_attention_norm.weight": "9638bae8d8bdcd7ed68da282979cd84a07c41ff9cabcaea94ebc846a1803db23",
+  "blk.21.post_ffw_norm.weight": "d622ef11115fe0cbe04b727d5a3b6371e7f39bf08c8d5eb9bc6da52e3f3cfb9d",
+  "blk.22.attn_k.weight": "5c321cb29deffbe57de200dd206a62005f1e80acb86c4fd2349dd44c8d3594fd",
+  "blk.22.attn_norm.weight": "198d949705d7170a331d75889d8c7500c3635254dac2cc6aa4dc35d556584536",
+  "blk.22.attn_output.weight": "19805cd5d7025b457e5d41d70db8b3fd63c2dd0e4a94d3ef1704d50ef4e749e8",
+  "blk.22.attn_q.weight": "177836cd583fc87405975ddc21ebfebdaa090a0363799664c72caa3da851ae2c",
+  "blk.22.attn_v.weight": "fea255692483e30d0108f9e4e250eb3ed7dbda8d83f499b06519b8c223ae6096",
+  "blk.22.ffn_down.weight": "00cb8939f03e5817d6d412de8cf2c923c9568d5493e382cec7faf5718fb034eb",
+  "blk.22.ffn_gate.weight": "b0591065b91281b2fbd8a9567f3568d40479f680e1f0a29e27ae213f37642489",
+  "blk.22.ffn_norm.weight": "96b5c5d0737c2ceb8fc869f54adb9e5f46e28cb7b177c40f49fa926b923c00f8",
+  "blk.22.ffn_up.weight": "81f472185b24344ab0594ea8246cc6e200e0dc1cab4943e74fbe4ca19d5a9701",
+  "blk.22.post_attention_norm.weight": "27fa9aa6260aa3071e0391e1a1d49322dcb6e8072315b8a9b7064087108dbd06",
+  "blk.22.post_ffw_norm.weight": "f37e1dcd7f643d9545675ffe9dc527a11eba86eb204989c2f44f636b266d896a",
+  "blk.23.attn_k.weight": "5d82f36658a56c3f94d0bb2d61f65509c966fa6568f81812e0d3e338b380ef8c",
+  "blk.23.attn_norm.weight": "b7983f88d9cad88bc88a528923e6da592ad20e699965b223ebc10840fe1f4fec",
+  "blk.23.attn_output.weight": "59f97f80f430d71606aab0158a195aed29ccd3405e6c0a5c41c809be8eb01898",
+  "blk.23.attn_q.weight": "53ac4789fe958919cc02ea4222bcd64c0ea1b4baa54304bff46635bdf42f7490",
+  "blk.23.attn_v.weight": "ec8abe09b9e84dbb52c7a068094657c6d3c62fe551ba8d7c3a3f23da622e9756",
+  "blk.23.ffn_down.weight": "3cf547eccb1b82aa64f208cee9682d7f558ca84e0aead7d9d3d1420d90f3d992",
+  "blk.23.ffn_gate.weight": "366aa2486d911ba81eb519119e13807deacf7e9908bc1975a2a63e00d6b10124",
+  "blk.23.ffn_norm.weight": "6d1d4a4af34bb7dc090ac87d6457d398c3e0fb68bd2e2b60b099dc318b6cfac3",
+  "blk.23.ffn_up.weight": "53f76692e253f5d2420b3f200c731b9f3b7a83e379920b4a067c729b4674aa4d",
+  "blk.23.post_attention_norm.weight": "7c952fa0efa76b3f048c8c4c9e8dcb5e3724d231327eda6423a34d3f3d3367de",
+  "blk.23.post_ffw_norm.weight": "7ab188cfe61f0a91b40309a0ab6bfa99f19d0ff2a37b6ac10e5f0c7f44eb5270",
+  "blk.24.attn_k.weight": "225798792f9bfdd10eff0505ebe61e0aad0209c17b431f6044ee7968ffe8c198",
+  "blk.24.attn_norm.weight": "635e3c1ebf5219bbebfc40ef164bc32d2b726ef595a94da64ac524ae878e2915",
+  "blk.24.attn_output.weight": "482f5bb2db8d9ed22b253d9a3296333b239efe698e5992e5d77e7e12dc2a5cf5",
+  "blk.24.attn_q.weight": "43805bbccddb65d58fffc4be9b5c374d4e1df1395ec1e1ffb4bcff03e98d5adb",
+  "blk.24.attn_v.weight": "fa741af54b4a3b1775d32f59134756090c5df2e7345a12a2d8db94fe289667a7",
+  "blk.24.ffn_down.weight": "83c6351e3162626b276f524a57836144625c2556dbe321b57cbd8fd486a68fab",
+  "blk.24.ffn_gate.weight": "fbe66be0d84d12cea5176cc7eaef64382ffc7324cd9d6266a3342dc43442f2ac",
+  "blk.24.ffn_norm.weight": "77c1445a8639ad24938bdf0280233eea2362d47391421833dfa72ec756dfc1e8",
+  "blk.24.ffn_up.weight": "78235ac729ee23c1cf1ae543751e3af32776d8808cee6e529c2a625a1f027654",
+  "blk.24.post_attention_norm.weight": "161f71b6d07628d43e4ae51a4c9088ec6ca2db123a17986a14505d83fdd04dad",
+  "blk.24.post_ffw_norm.weight": "cf1ba692aa683368b02ac413e69b2521b98c69a5274eacbb54165b53bf38a8b2",
+  "blk.25.attn_k.weight": "057a56bd8c8d2b41608d1f71faa3052902152ddf85e47669ad950c1c3e77c33f",
+  "blk.25.attn_norm.weight": "b7179fe02c334da556ddcf6c1b502245639a728c4cbba8b552d8e1df4565ee9d",
+  "blk.25.attn_output.weight": "4fed8b05b08a0ff75ffd022701bbeb52f17b23d09332a1ddcba737244bd0d3b0",
+  "blk.25.attn_q.weight": "c52e99f5d38bf7538d6106a0bbf38ac6dc6296bca9a3f849afa384ea67b4af01",
+  "blk.25.attn_v.weight": "c49c23d8e1cfa6a8eb971eb69942204890c6d7d830dc8774c84b108a80598912",
+  "blk.25.ffn_down.weight": "c08d4dc8412b19fdc870c164b83c341b236ec6fe7bb4a9bcfe0dc100faa20286",
+  "blk.25.ffn_gate.weight": "1a4cb3f36735d59181721471452807903006539e5e1b5ceb4f72d1d7ae134127",
+  "blk.25.ffn_norm.weight": "8fd6bd0dcec5198761525a36992a57c9ec5e9da60a22092839a84ae8c4e87f26",
+  "blk.25.ffn_up.weight": "3a00f39bdd5f31dc5e3b281d2002e1ac4f2475d49a0ac1d7720a25b377dcd04a",
+  "blk.25.post_attention_norm.weight": "e5f31a648612c859b6d21c9ee426e87a86cb1973dfdd86276c767371d9cef5ad",
+  "blk.25.post_ffw_norm.weight": "553c3bd774922c99c2384380a142d019881d30dbf0fe3bf9430dabfb3f6cbd33",
+  "output_norm.weight": "49445c4585ab0a8135717a0bdb1cda4a062a030177d0119561d91542aec5744b"
+}
--- a/convert/testdata/gemma-2-9b-it.json
+++ b/convert/testdata/gemma-2-9b-it.json
@@ -0,0 +1,6 @@
+{
+  "general.architecture": "gemma2",
+  "gemma2.attention.sliding_window": "4096",
+  "gemma2.attn_logit_softcapping": "50",
+  "gemma2.final_logit_softcapping": "30"
+}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -1,7 +1,6 @@
 package convert

 import (
-	"cmp"
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
@@ -11,6 +10,8 @@ import (
 	"log/slog"
 	"os"
 	"slices"
+
+	"golang.org/x/exp/maps"
 )

 const (
@@ -99,8 +100,21 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}

 		if template, ok := p["chat_template"]; ok {
-			if err := json.Unmarshal(template, &t.Template); err != nil {
-				return nil, err
+			var s []struct {
+				Name     string `json:"name"`
+				Template string `json:"template"`
+			}
+			if err := json.Unmarshal(template, &t.Template); err == nil {
+				// noop
+			} else if err := json.Unmarshal(template, &s); err == nil {
+				for _, e := range s {
+					if e.Name == "default" {
+						t.Template = e.Template
+						break
+					}
+				}
+			} else {
+				return nil, fmt.Errorf("invalid chat_template: %w", err)
 			}
 		}

@@ -140,7 +154,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 }

 type tokenizer struct {
-	Version     string  `json:"version"`
 	AddedTokens []token `json:"added_tokens"`
 	Model       struct {
 		Type   string         `json:"type"`
@@ -184,32 +197,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		return nil, err
 	}

-	var tokens []token
+	tokens := make(map[int]token, len(t.Model.Vocab))
 	for k, v := range t.Model.Vocab {
-		tokens = append(tokens, token{
+		tokens[v] = token{
 			ID:      v,
 			Content: k,
-		})
+		}
 	}

-	for _, t := range t.AddedTokens {
-		t.UserDefined = true
-		tokens = append(tokens, t)
+	for _, token := range t.AddedTokens {
+		token.UserDefined = true
+		tokens[token.ID] = token
 	}

-	slices.SortFunc(tokens, func(i, j token) int {
-		return cmp.Compare(i.ID, j.ID)
-	})
+	keys := maps.Keys(tokens)
+	slices.Sort(keys)

 	v := Vocabulary{Model: "gpt2"}
-	for _, t := range tokens {
-		v.Tokens = append(v.Tokens, t.Content)
-		v.Scores = append(v.Scores, float32(t.ID))
+	for _, k := range keys {
+		token := tokens[k]
+		v.Tokens = append(v.Tokens, token.Content)
+		v.Scores = append(v.Scores, float32(token.ID))

 		switch {
-		case t.Special:
+		case token.Special:
 			v.Types = append(v.Types, tokenTypeControl)
-		case t.UserDefined:
+		case token.UserDefined:
 			v.Types = append(v.Types, tokenTypeUserDefined)
 		default:
 			v.Types = append(v.Types, tokenTypeNormal)
@@ -238,7 +251,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
 		return pattern.Func(fsys)
 	}

-	return nil, errors.New("unknown tensor format")
+	return nil, errors.New("unknown tokenizer format")
 }

 type SpecialVocabulary struct {
--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@@ -15,6 +15,11 @@ import (
 )

 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
+	ast, err := parseAdditionalSpecialTokens(fsys)
+	if err != nil {
+		return nil, err
+	}
+
 	bts, err := fs.ReadFile(fsys, "tokenizer.model")
 	if err != nil {
 		return nil, err
@@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			sentencepiece.ModelProto_SentencePiece_BYTE:
 			v.Types = append(v.Types, int32(t))
 		default:
-			v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
+			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
+			if slices.Contains(ast, piece.GetPiece()) {
+				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
+			}
+
+			v.Types = append(v.Types, tt)
 		}
 	}

@@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {

 	return &v, nil
 }
+
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
+	f, err := fsys.Open("special_tokens_map.json")
+	if errors.Is(err, os.ErrNotExist) {
+		return nil, nil
+	} else if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var m struct {
+		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
+	}
+
+	if err := json.NewDecoder(f).Decode(&m); err != nil {
+		return nil, err
+	}
+
+	return m.AdditionalSpecialTokens, nil
+}
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -0,0 +1,208 @@
+package convert
+
+import (
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
+	t.Helper()
+
+	for k, v := range files {
+		if err := func() error {
+			f, err := os.Create(filepath.Join(dir, k))
+			if err != nil {
+				return err
+			}
+			defer f.Close()
+
+			if _, err := io.Copy(f, v); err != nil {
+				return err
+			}
+
+			return nil
+		}(); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}
+
+	return os.DirFS(dir)
+}
+
+func TestParseTokenizer(t *testing.T) {
+	cases := []struct {
+		name              string
+		fsys              fs.FS
+		specialTokenTypes []string
+		want              *Tokenizer
+	}{
+		{
+			name: "string chat template",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"chat_template": "<default template>"
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{Model: "gpt2"},
+				Pre:        "default",
+				Template:   "<default template>",
+			},
+		},
+		{
+			name: "list chat template",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"chat_template": [
+						{
+							"name": "default",
+							"template": "<default template>"
+						},
+						{
+							"name": "tools",
+							"template": "<tools template>"
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{Model: "gpt2"},
+				Pre:        "default",
+				Template:   "<default template>",
+			},
+		},
+		{
+			name: "added tokens",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 999,
+							"content": "<unused999>",
+							"special": false
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<unused999>"},
+					Scores: []float32{999},
+					Types:  []int32{4},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "added tokens overlap vocab",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0
+						}
+					}
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>"},
+					Scores: []float32{0},
+					Types:  []int32{3},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "special token types",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<unk>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0,
+							"<eos>": 1,
+							"<bos>": 2,
+							"<unk>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>",
+					"pad_token": "<pad>",
+					"unk_token": "<unk>"
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
+					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
+					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
+				},
+				Pre: "default",
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if diff := cmp.Diff(tt.want, tokenizer); diff != "" {
+				t.Errorf("unexpected tokenizer (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.

 ### How do I use Ollama behind a proxy in Docker?

@@ -191,6 +194,8 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.

+> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
+
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## How can I use Ollama in Visual Studio Code?
@@ -276,4 +281,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit

 ## How does Ollama load models on multiple GPUs?

-Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
 | 9.0                | NVIDIA              | `H100`                                                                                                      |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
-| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060`         |
+| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050`   |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
 | 8.0                | NVIDIA              | `A100` `A30`                                                                                                |
 | 7.5                | GeForce GTX/RTX     | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060`                                    |
--- a/docs/images/ollama-keys.png
+++ b/docs/images/ollama-keys.png
--- a/docs/images/signup.png
+++ b/docs/images/signup.png
--- a/docs/import.md
+++ b/docs/import.md
@@ -1,44 +1,129 @@
-# Import
+# Importing a model

-GGUF models and select Safetensors models can be imported directly into Ollama.
+## Table of Contents

-## Import GGUF
+  * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
+  * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
+  * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
+  * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)

-A binary GGUF file can be imported directly into Ollama through a Modelfile.
+## Importing a fine tuned adapter from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:

 ```dockerfile
-FROM /path/to/file.gguf
+FROM <base model name>
+ADAPTER /path/to/safetensors/adapter/directory
 ```

-## Import Safetensors
+Make sure that you use the same base model in the `FROM` command as you used to create the adapter otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.

-If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
+Now run `ollama create` from the directory where the `Modelfile` was created:

- - LlamaForCausalLM
- - MistralForCausalLM
- - MixtralForCausalLM
- - GemmaForCausalLM
- - Phi3ForCausalLM
+```bash
+ollama create my-model
+```
+
+Lastly, test the model:
+
+```bash
+ollama run my-model
+```
+
+Ollama supports importing adapters based on several different model architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
+  * Gemma (including Gemma 1 and Gemma 2)
+
+You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
+
+  * Hugging Face [fine tuning framework] (https://huggingface.co/docs/transformers/en/training)
+  * [Unsloth](https://github.com/unslothai/unsloth)
+  * [MLX](https://github.com/ml-explore/mlx)
+
+
+## Importing a model from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:

 ```dockerfile
 FROM /path/to/safetensors/directory
 ```

-For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
+If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.

-## Automatic Quantization
+Now run the `ollama create` command from the directory where you created the `Modelfile`:

-> [!NOTE]
-> Automatic quantization requires v0.1.35 or higher.
+```shell
+ollama create my-model
+```

-Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
+Lastly, test the model:
+
+```shell
+ollama run my-model
+```
+
+Ollama supports importing models for several different architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
+  * Gemma (including Gemma 1 and Gemma 2); and
+  * Phi3
+
+This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
+
+
+## Importing a GGUF based model or adapter
+
+If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
+
+  * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; 
+  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
+  * downloading a model or adapter from a place such as HuggingFace
+
+To import a GGUF model, create a `Modelfile` containg:
+
+```dockerfile
+FROM /path/to/file.gguf
+```
+
+For a GGUF adapter, create the `Modelfile` with:
+
+```dockerfile
+FROM <model name>
+ADAPTER /path/to/file.gguf
+```
+
+When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
+
+ * a model from Ollama
+ * a GGUF file
+ * a Safetensors based model 
+
+Once you have created your `Modelfile`, use the `ollama create` command to build the model.
+
+```shell
+ollama create my-model
+```
+
+## Quantizing a Model
+
+Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
+
+Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
+
+First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.

 ```dockerfile
 FROM /path/to/my/gemma/f16/model
 ```

+Use `ollama create` to then create the quantized model.
+
 ```shell
-$ ollama create -q Q4_K_M mymodel
+$ ollama create --quantize q4_K_M mymodel
 transferring model data
 quantizing F16 model to Q4_K_M
 creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@@ -49,42 +134,53 @@ success

 ### Supported Quantizations

- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
+- `q4_0`
+- `q4_1`
+- `q5_0`
+- `q5_1`
+- `q8_0`

 #### K-means Quantizations

- `Q3_K_S`
- `Q3_K_M`
- `Q3_K_L`
- `Q4_K_S`
- `Q4_K_M`
- `Q5_K_S`
- `Q5_K_M`
- `Q6_K`
+- `q3_K_S`
+- `q3_K_M`
+- `q3_K_L`
+- `q4_K_S`
+- `q4_K_M`
+- `q5_K_S`
+- `q5_K_M`
+- `q6_K`

-## Template Detection

-> [!NOTE]
-> Template detection requires v0.1.42 or higher.
+## Sharing your model on ollama.com

-Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
+You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.

-```dockerfile
-FROM /path/to/my/gemma/model
-```
+First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
+
+<img src="images/signup.png" alt="Sign-Up" width="40%">
+
+The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
+
+Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
+
+Follow the directions on the page to determine where your Ollama Public Key is located.
+
+<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
+
+Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
+
+To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
+your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).

 ```shell
-$ ollama create mymodel
-transferring model data
-using autodetected template gemma-instruct
-creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
-creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
-writing manifest
-success
+ollama cp mymodel myuser/mymodel
+ollama push myuser/mymodel
+```
+
+Once your model has been pushed, other users can pull and run it by using the command:
+
+```shell
+ollama run myuser/mymodel
 ```

-Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -1,40 +1,59 @@
-# Ollama on Linux
+# Linux

 ## Install

-Install Ollama running this one-liner:
+To install Ollama, run the following command:

->
-
-```bash
+```shell
 curl -fsSL https://ollama.com/install.sh | sh
 ```

-## AMD Radeon GPU support
-
-While AMD has contributed the `amdgpu` driver upstream to the official linux
-kernel source, the version is older and may not support all ROCm features. We
-recommend you install the latest driver from
-https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-GPU.
-
 ## Manual install

-### Download the `ollama` binary
+Download and extract the package:

-Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
+```

-```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+Start Ollama:
+
+```shell
+ollama serve
+```
+
+In another terminal, verify that Ollama is running:
+
+```shell
+ollama -v
+```
+
+### AMD GPU install
+
+If you have an AMD GPU, also download and extract the additional ROCm package:
+
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
+```
+
+### ARM64 install
+
+Download and extract the ARM64-specific package:
+
+```shell
+curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
+sudo tar -C /usr -xzf ollama-linux-arm64.tgz
 ```

 ### Adding Ollama as a startup service (recommended)

-Create a user for Ollama:
+Create a user and group for Ollama:

-```bash
-sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
+```shell
+sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
+sudo usermod -a -G ollama $(whoami)
 ```

 Create a service file in `/etc/systemd/system/ollama.service`:
@@ -50,6 +69,7 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
+Environment="PATH=$PATH"

 [Install]
 WantedBy=default.target
@@ -57,47 +77,54 @@ WantedBy=default.target

 Then start the service:

-```bash
+```shell
 sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```

-### Install CUDA drivers (optional – for Nvidia GPUs)
+### Install CUDA drivers (optional)

 [Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.

 Verify that the drivers are installed by running the following command, which should print details about your GPU:

-```bash
+```shell
 nvidia-smi
 ```

-### Install ROCm (optional - for Radeon GPUs)
-[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
+### Install AMD ROCm drivers (optional)

-Make sure to install ROCm v6
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.

 ### Start Ollama

-Start Ollama using `systemd`:
+Start Ollama and verify it is running:

-```bash
+```shell
 sudo systemctl start ollama
+sudo systemctl status ollama
 ```

-## Update
+> [!NOTE]
+> While AMD has contributed the `amdgpu` driver upstream to the official linux
+> kernel source, the version is older and may not support all ROCm features. We
+> recommend you install the latest driver from
+> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+> GPU.

-Update ollama by running the install script again:
+## Updating

-```bash
+Update Ollama by running the install script again:
+
+```shell
 curl -fsSL https://ollama.com/install.sh | sh
 ```

-Or by downloading the ollama binary:
+Or by re-downloading Ollama:

-```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```

 ## Installing specific versions
@@ -106,15 +133,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s

 For example:

-```
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
+```shell
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
 ```

 ## Viewing logs

 To view logs of Ollama running as a startup service, run:

-```bash
+```shell
 journalctl -e -u ollama
 ```

@@ -122,7 +149,7 @@ journalctl -e -u ollama

 Remove the ollama service:

-```bash
+```shell
 sudo systemctl stop ollama
 sudo systemctl disable ollama
 sudo rm /etc/systemd/system/ollama.service
@@ -130,13 +157,13 @@ sudo rm /etc/systemd/system/ollama.service

 Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):

-```bash
+```shell
 sudo rm $(which ollama)
 ```

 Remove the downloaded models and Ollama service user and group:

-```bash
+```shell
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -11,8 +11,9 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
  - [FROM (Required)](#from-required)
-    - [Build from llama3](#build-from-llama3)
-    - [Build from a bin file](#build-from-a-bin-file)
+    - [Build from llama3.1](#build-from-llama31)
+    - [Build from a Safetensors model](#build-from-a-safetensors-model)
+    - [Build from a GGUF file](#build-from-a-gguf-file)
  - [PARAMETER](#parameter)
    - [Valid Parameters and Values](#valid-parameters-and-values)
  - [TEMPLATE](#template)
@@ -99,22 +100,39 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```

-#### Build from llama3
+#### Build from llama3.1

 ```modelfile
-FROM llama3
+FROM llama3.1
 ```

 A list of available base models:
 <https://github.com/ollama/ollama#model-library>
+Additional models can be found at:
+<https://ollama.com/library>

-#### Build from a `bin` file
+#### Build from a Safetensors model

 ```modelfile
-FROM ./ollama-model.bin
+FROM <model directory>
 ```

-This bin file location should be specified as an absolute path or relative to the `Modelfile` location.
+The model directory should contain the Safetensors weights for a supported architecture.
+
+Currently supported model architectures:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+  * Phi3
+
+#### Build from a GGUF file
+
+```modelfile
+FROM ./ollama-model.gguf
+```
+
+The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
+

 ### PARAMETER

@@ -174,10 +192,23 @@ SYSTEM """<system message>"""

 ### ADAPTER

-The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from the behaviour will be erratic.
+
+#### Safetensor adapter

 ```modelfile
-ADAPTER ./ollama-lora.bin
+ADAPTER <path to safetensor adapter>
+```
+
+Currently supported Safetensor adapters:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+
+#### GGUF adapter
+
+```modelfile
+ADAPTER ./ollama-lora.gguf
 ```

 ### LICENSE
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -300,3 +300,28 @@ curl http://localhost:11434/v1/chat/completions \
        ]
    }'
 ```
+
+### Setting the context size
+
+The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
+
+```modelfile
+FROM <some model>
+PARAMETER num_ctx <context size>
+```
+
+Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
+
+```shell
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "mymodel",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -48,6 +48,9 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

+## Uninstall
+
+The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

 ## Standalone CLI

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -30,9 +30,7 @@ func Host() *url.URL {
 		defaultPort = "443"
 	}

-	// trim trailing slashes
-	hostport = strings.TrimRight(hostport, "/")
-
+	hostport, path, _ := strings.Cut(hostport, "/")
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
 		host, port = "127.0.0.1", defaultPort
@@ -45,15 +43,13 @@ func Host() *url.URL {

 	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
 		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
-		return &url.URL{
-			Scheme: scheme,
-			Host:   net.JoinHostPort(host, defaultPort),
-		}
+		port = defaultPort
 	}

 	return &url.URL{
 		Scheme: scheme,
 		Host:   net.JoinHostPort(host, port),
+		Path:   path,
 	}
 }

@@ -116,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }

+// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
+// Zero or Negative values are treated as infinite.
+// Default is 5 minutes.
+func LoadTimeout() (loadTimeout time.Duration) {
+	loadTimeout = 5 * time.Minute
+	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
+		if d, err := time.ParseDuration(s); err == nil {
+			loadTimeout = d
+		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
+			loadTimeout = time.Duration(n) * time.Second
+		}
+	}
+
+	if loadTimeout <= 0 {
+		return time.Duration(math.MaxInt64)
+	}
+
+	return loadTimeout
+}
+
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
@@ -174,7 +190,7 @@ func RunnersDir() (p string) {

 	defer func() {
 		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
 		}
 	}()

@@ -190,17 +206,17 @@ func RunnersDir() (p string) {
 	}

 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}

 	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
 		if _, err := os.Stat(candidate); err == nil {
 			p = candidate
 			break
@@ -235,6 +251,23 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )

+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
+// Set aside VRAM per GPU
+var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+
 type EnvVar struct {
 	Name        string
 	Value       any
@@ -245,9 +278,11 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
+		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
@@ -282,3 +317,12 @@ func Values() map[string]string {
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
+
+// On windows, we keep the binary at the top directory, but
+// other platforms use a "bin" directory, so this returns ".."
+func LibRelativeToExe() string {
+	if runtime.GOOS == "windows" {
+		return "."
+	}
+	return ".."
+}
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -13,34 +13,35 @@ func TestHost(t *testing.T) {
 		value  string
 		expect string
 	}{
-		"empty":               {"", "127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
-		"only port":           {":1234", ":1234"},
-		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
-		"hostname":            {"example.com", "example.com:11434"},
-		"hostname and port":   {"example.com:1234", "example.com:1234"},
-		"zero port":           {":0", ":0"},
-		"too large port":      {":66000", ":11434"},
-		"too small port":      {":-1", ":11434"},
-		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
-		"ipv6 world open":     {"[::]", "[::]:11434"},
-		"ipv6 no brackets":    {"::1", "[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
+		"empty":               {"", "http://127.0.0.1:11434"},
+		"only address":        {"1.2.3.4", "http://1.2.3.4:11434"},
+		"only port":           {":1234", "http://:1234"},
+		"address and port":    {"1.2.3.4:1234", "http://1.2.3.4:1234"},
+		"hostname":            {"example.com", "http://example.com:11434"},
+		"hostname and port":   {"example.com:1234", "http://example.com:1234"},
+		"zero port":           {":0", "http://:0"},
+		"too large port":      {":66000", "http://:11434"},
+		"too small port":      {":-1", "http://:11434"},
+		"ipv6 localhost":      {"[::1]", "http://[::1]:11434"},
+		"ipv6 world open":     {"[::]", "http://[::]:11434"},
+		"ipv6 no brackets":    {"::1", "http://[::1]:11434"},
+		"ipv6 + port":         {"[::1]:1337", "http://[::1]:1337"},
+		"extra space":         {" 1.2.3.4 ", "http://1.2.3.4:11434"},
+		"extra quotes":        {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
+		"extra space+quotes":  {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
+		"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
+		"http":                {"http://1.2.3.4", "http://1.2.3.4:80"},
+		"http port":           {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
+		"https":               {"https://1.2.3.4", "https://1.2.3.4:443"},
+		"https port":          {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
+		"proxy path":          {"https://example.com/ollama", "https://example.com:443/ollama"},
 	}

 	for name, tt := range cases {
 		t.Run(name, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.Host != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
+			if host := Host(); host.String() != tt.expect {
+				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
 			}
 		})
 	}
@@ -214,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
 	}
 }

+func TestLoadTimeout(t *testing.T) {
+	defaultTimeout := 5 * time.Minute
+	cases := map[string]time.Duration{
+		"":       defaultTimeout,
+		"1s":     time.Second,
+		"1m":     time.Minute,
+		"1h":     time.Hour,
+		"5m0s":   defaultTimeout,
+		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
+		"0":      time.Duration(math.MaxInt64),
+		"60":     60 * time.Second,
+		"120":    2 * time.Minute,
+		"3600":   time.Hour,
+		"-0":     time.Duration(math.MaxInt64),
+		"-1":     time.Duration(math.MaxInt64),
+		"-1m":    time.Duration(math.MaxInt64),
+		// invalid values
+		" ":   defaultTimeout,
+		"???": defaultTimeout,
+		"1d":  defaultTimeout,
+		"1y":  defaultTimeout,
+		"1w":  defaultTimeout,
+	}
+
+	for tt, expect := range cases {
+		t.Run(tt, func(t *testing.T) {
+			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
+			if actual := LoadTimeout(); actual != expect {
+				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
+			}
+		})
+	}
+}
+
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value":       "value",
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -9,6 +9,8 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+
+	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -54,7 +56,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -34,10 +34,10 @@ type HipLib struct {
 }

 func NewHipLib() (*HipLib, error) {
-	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs/ this repo will consist with v5.7
-	h, err := windows.LoadLibrary("amdhip64.dll")
+	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
+	h, err := windows.LoadLibrary("amdhip64_6.dll")
 	if err != nil {
-		return nil, fmt.Errorf("unable to load amdhip64.dll, please make sure to upgrade to the latest amd driver: %w", err)
+		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
 	}
 	hl := &HipLib{}
 	hl.dll = h
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -23,7 +23,7 @@ const (
 var (
 	// Used to validate if the given ROCm lib is usable
 	ROCmLibGlobs          = []string{"hipblas.dll", "rocblas"}                 // This is not sufficient to discern v5 vs v6
-	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
+	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
 )

 func AMDGetGPUInfo() []RocmGPUInfo {
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@@ -4,9 +4,17 @@ package gpu

 import (
 	"log/slog"
+	"os"
+	"regexp"
+	"runtime"
+	"strconv"
 	"strings"
 )

+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
+
+func cudaVariant(gpuInfo CudaGPUInfo) string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+		return "v11"
+	}
+	return "v12"
+}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -64,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -215,7 +211,7 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo: mem,
 					Library: "cpu",
-					Variant: cpuCapability,
+					Variant: cpuCapability.String(),
 					ID:      "0",
 				},
 			},
@@ -229,11 +225,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}

-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := LibraryDir()

 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -269,11 +261,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DependencyPath = depPath
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				variant := cudaVariant(gpuInfo)
+				if depPath != "" {
+					gpuInfo.DependencyPath = depPath
+					// Check for variant specific directory
+					if variant != "" {
+						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
+							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						}
+					}
+				}
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
@@ -306,13 +310,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@@ -467,10 +464,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -479,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func LibraryDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependeices, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
+		libDep := filepath.Join("lib", "ollama")
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability(),
+				Variant: GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )

 func GetCPUMem() (memInfo, error) {
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
 	}
 }

+func TestByLibrary(t *testing.T) {
+	type testCase struct {
+		input  []GpuInfo
+		expect int
+	}
+
+	testCases := map[string]*testCase{
+		"empty":                    {input: []GpuInfo{}, expect: 0},
+		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
+		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
+		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
+	}
+
+	for k, v := range testCases {
+		t.Run(k, func(t *testing.T) {
+			resp := (GpuInfoList)(v.input).ByLibrary()
+			if len(resp) != v.expect {
+				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
+			}
+		})
+	}
+}
+
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -19,7 +19,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant CPUCapability `json:"variant"`
+	Variant string `json:"variant"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -53,8 +53,10 @@ type CPUInfo struct {

 type CudaGPUInfo struct {
 	GpuInfo
-	OSOverhead uint64 // Memory overhead between the driver library and management library
-	index      int    //nolint:unused,nolintlint
+	OSOverhead   uint64 // Memory overhead between the driver library and management library
+	index        int    //nolint:unused,nolintlint
+	computeMajor int    //nolint:unused,nolintlint
+	computeMinor int    //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

@@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone {
-			requested += "_" + info.Variant.String()
+		if info.Variant != CPUCapabilityNone.String() {
+			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -92,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 			}
 		}
 		if !found {
-			libs = append(libs, info.Library)
+			libs = append(libs, requested)
 			resp = append(resp, []GpuInfo{info})
 		}
 	}
@@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() {
 		slog.Info("inference compute",
 			"id", g.ID,
 			"library", g.Library,
+			"variant", g.Variant,
 			"compute", g.Compute,
 			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
 			"name", g.Name,
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}

-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}

-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/llm/ext_server/json.hpp
+++ b/llm/ext_server/json.hpp
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -262,7 +262,7 @@ struct server_slot {
       char buffer[512];
        double t_token = t_prompt_processing / n_prompt_tokens_processed;
        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, sizeof(buffer), "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                t_prompt_processing, n_prompt_tokens_processed,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@@ -276,7 +276,7 @@ struct server_slot {

        t_token = t_token_generation / n_decoded;
        n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
                t_token_generation, n_decoded,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@@ -288,7 +288,7 @@ struct server_slot {
            {"n_tokens_second",    n_tokens_second},
        });

-        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        snprintf(buffer, sizeof(buffer), "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
        LOG_DEBUG(buffer, {
            {"slot_id",             id},
            {"task_id",             task_id},
@@ -425,7 +425,7 @@ struct llama_server_context

        n_ctx = llama_n_ctx(ctx);

-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_add_bos_token(model);

        return true;
    }
@@ -1031,7 +1031,7 @@ struct llama_server_context
                continue;
            }

-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG_TEE("Error processing the given image");
                return false;
            }
@@ -1429,7 +1429,13 @@ struct llama_server_context
        switch (task.type)
        {
            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
@@ -2008,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("options:\n");
    printf("  -h, --help                show this help message and exit\n");
    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2281,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.n_threads = std::stoi(argv[i]);
+            params.cpuparams.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--grp-attn-n" || arg == "-gan")
        {
@@ -2309,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.n_threads_batch = std::stoi(argv[i]);
+            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--threads-http")
        {
@@ -2620,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
        params.kv_overrides.back().key[0] = 0;
    }

+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
    if (invalid_param)
    {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2769,8 +2780,8 @@ int main(int argc, char **argv) {
                            {"commit", LLAMA_COMMIT}});

    LOG_INFO("system info", {
-                                {"n_threads", params.n_threads},
-                                {"n_threads_batch", params.n_threads_batch},
+                                {"n_threads", params.cpuparams.n_threads},
+                                {"n_threads_batch", params.cpuparams_batch.n_threads},
                                {"total_threads", std::thread::hardware_concurrency()},
                                {"system_info", llama_print_system_info()},
                            });
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -9,11 +9,14 @@ init_vars() {
        ARCH="arm64"
        ;;
    *)
-        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
    esac

    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,7 @@ init_vars() {
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
        ;;
    "Linux")
        LIB_EXT="so"
@@ -35,6 +39,7 @@ init_vars() {

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
        ;;
    *)
        ;;
@@ -42,6 +47,7 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }

 git_module_setup() {
@@ -81,30 +87,42 @@ apply_patches() {
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    # remove unnecessary build artifacts
+    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }

 compress() {
    echo "Compressing payloads to reduce overall binary size..."
-    pids=""
    rm -rf ${BUILD_DIR}/bin/*.gz
    for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        ${GZIP} -n --best -f ${f} &
+        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            ${GZIP} -n --best -f ${f} &
+            compress_pids+=" $!"
        done
    fi
    echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
        wait $pid
    done
    echo "Finished compression"
 }

+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
    (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -6,6 +6,7 @@

 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -18,7 +19,7 @@ sign() {
    fi
 }

-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

 case "${GOARCH}" in
 "amd64")
@@ -98,4 +99,5 @@ case "${GOARCH}" in
 esac

 cleanup
+wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -13,6 +13,7 @@

 set -ex
 set -o pipefail
+compress_pids=""

 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -60,7 +61,7 @@ if [ -z "${CUDACXX}" ]; then
        export CUDACXX=$(command -v nvcc)
    fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -86,10 +87,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
+        install
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -102,7 +104,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -112,6 +114,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
+            install
            compress
        fi

@@ -129,6 +132,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
+                install
                compress
            fi

@@ -142,6 +146,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
+                install
                compress
            fi
        fi
@@ -169,7 +174,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" ]; then
+    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [ "${ARCH}" == "arm64" ]; then
@@ -187,29 +192,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
-
-    # Carry the CUDA libs as payloads to help reduce dependency burden on users
-    #
-    # TODO - in the future we may shift to packaging these separately and conditionally
-    #        downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
-    for lib in libcudart.so libcublas.so libcublasLt.so ; do
-        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
-        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
-        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
-        fi
+    install
+    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
+    mkdir -p "${CUDA_DIST_DIR}"
+    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
+        cp -a "${lib}" "${CUDA_DIST_DIR}"
    done
    compress

@@ -227,21 +222,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
+    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build

    # copy oneAPI dependencies
+    mkdir -p "${ONEAPI_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp "${dep}" "${BUILD_DIR}/bin/"
+        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
+    install
    compress
 fi

@@ -263,7 +261,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@@ -271,23 +269,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        echo "Building custom ROCM GPU"
    fi
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    # ROCm dependencies are too large to fit into a unified bundle
+    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
+    # TODO figure out how to disable runpath (rpath)
+    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
+    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

-    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
+    # copy the ROCM dependencies
+    mkdir -p "${ROCM_DIST_DIR}"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+        cp -a "${dep}"* "${ROCM_DIST_DIR}"
    done
-    # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/bin/deps.txt"
-        echo "ERROR: deps file short"
-        exit 1
-    fi
+    install
    compress
 fi

 cleanup
+wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -53,7 +53,7 @@ function init_vars {
        )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -135,7 +135,7 @@ function build {
    if ($cmakeDefs -contains "-G") {
        $extra=@("-j8")
    } else {
-        $extra= @("--", "/p:CL_MPcount=8")
+        $extra= @("--", "/maxCpuCount:8")
    }
    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -279,7 +279,7 @@ function build_cuda() {
    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
        # Then build cuda as a dynamically loaded library
        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
        if ($null -ne $script:CUDA_VERSION) {
            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
        }
@@ -291,9 +291,9 @@ function build_cuda() {
            "-DGGML_CUDA=ON",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
-            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+            "-DCMAKE_CUDA_FLAGS=-t6",
+            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
+            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
            )
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -304,12 +304,11 @@ function build_cuda() {
        sign
        install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    } else {
        write-host "Skipping CUDA generation step"
    }
@@ -343,18 +342,17 @@ function build_oneapi() {
    sign
    install

-    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
  } else {
    Write-Host "Skipping oneAPI generation step"
  }
@@ -375,7 +373,7 @@ function build_rocm() {
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
@@ -404,12 +402,11 @@ function build_rocm() {
        sign
        install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
+        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
+        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
    } else {
        write-host "Skipping ROCm generation step"
    }
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
 	return "unknown"
 }

+func (kv KV) Kind() string {
+	if s, ok := kv["general.type"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
@@ -352,11 +360,13 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 	switch llm.KV().Architecture() {
 	case "llama":
-		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+		fullOffload = max(
+			4*batch*(1+4*embedding+context*(1+heads)),
+			4*batch*(embedding+vocab),
+		)

 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
-			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -7,6 +7,7 @@ import (
 	"strings"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64

+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,8 +166,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			slog.Debug("gpu has too little memory to allocate any layers",
+				"id", gpus[i].ID,
+				"library", gpus[i].Library,
+				"variant", gpus[i].Variant,
+				"compute", gpus[i].Compute,
+				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
+				"name", gpus[i].Name,
+				"total", format.HumanBytes2(gpus[i].TotalMemory),
+				"available", format.HumanBytes2(gpus[i].FreeMemory),
+				"minimum_memory", gpus[i].MinimumMemory,
+				"layer_size", format.HumanBytes2(layerSize),
+				"gpu_zer_overhead", format.HumanBytes2(gzo),
+				"partial_offload", format.HumanBytes2(graphPartialOffload),
+				"full_offload", format.HumanBytes2(graphFullOffload),
+			)
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
@@ -196,7 +212,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -219,7 +235,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -306,6 +322,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }

 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -323,6 +340,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = WriteGGUF(f, KV{
 		"general.architecture":          "llama",
-		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(inputLayerCount),
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..2ddf431d 100644
+index 88355971..dd7d41ed 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "codeshell") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "exaone") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
--- a/llm/patches/06-embeddings.diff
+++ b/llm/patches/06-embeddings.diff
@@ -1,37 +1,36 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 1fe2b9f7..a43312a7 100644
+index 88355971..d7db689b 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
+@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
 -        } else if (cparams.embeddings) {
-            res = nullptr; // do not extract logits for embedding case
-            embd = gf->nodes[gf->n_nodes - 1];
-            if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+-            res  = nullptr; // do not extract logits for embedding case
+-            embd = nullptr;
 +        }
 +
 +        if (cparams.embeddings) {
-+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+             for (int i = gf->n_nodes - 1; i >= 0; --i) {
+-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+-                    embd = gf->nodes[i];
 +                embd = gf->nodes[i];
 +                if (strcmp(embd->name, "result_embd_pooled") == 0) {
-+                    break;
-+                }
+                     break;
+                 }
             }
-             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
-        } else {
-+         } else {
+-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
+         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
@@ -39,7 +38,6 @@ index 1fe2b9f7..a43312a7 100644
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
-+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
--- a/llm/patches/08-pooling.diff
+++ b/llm/patches/08-pooling.diff
@@ -1,60 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8420,14 +8420,14 @@ struct llm_build_context {
-     }
- 
-     struct ggml_tensor * build_inp_mean() {
-        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
-+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
-         cb(lctx.inp_mean, "inp_mean", -1);
-         ggml_set_input(lctx.inp_mean);
-         return lctx.inp_mean;
-     }
- 
-     struct ggml_tensor * build_inp_cls() {
-        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
-         cb(lctx.inp_cls, "inp_cls", -1);
-         ggml_set_input(lctx.inp_cls);
-         return lctx.inp_cls;
-@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
- 
-         float * data = (float *) lctx.inp_mean->data;
-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
-+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
- 
-         std::vector<uint64_t> sum(n_tokens, 0);
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
-             sum[seq_id] += 1;
-         }
- 
-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
-+        std::vector<float> div(cparams.n_seq_max, 0.0f);
-+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
-             const uint64_t s = sum[i];
-             if (s > 0) {
-                 div[i] = 1.0f/float(s);
-@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
- 
-         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
-+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
- 
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-             const llama_pos    pos    = batch.pos[i];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
-             if (pos == 0) {
-                 data[seq_id] = i;
-             }
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -1,350 +0,0 @@
-diff --git a/common/common.cpp b/common/common.cpp
-index 2e8374d5..70d0afde 100644
--- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
-         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-         if (loaded_la.adapter == nullptr) {
-             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_free_model(model);
-            return iparams;
-+
-+            // if that fails, try loading as ggla for compatibility
-+            int err = llama_model_apply_lora_from_file(model,
-+                                                    la.path.c_str(),
-+                                                    la.scale,
-+                                                    nullptr,
-+                                                    params.n_threads);
-+            if (err != 0) {
-+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-+                llama_free(lctx);
-+                llama_free_model(model);
-+                return iparams;
-+            } else {
-+                break;
-+            }
-         }
-         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
-     }
-diff --git a/include/llama.h b/include/llama.h
-index 93fd77ca..b0fb37a6 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -1160,6 +1160,20 @@ extern "C" {
- 
-     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
- 
-+    // Apply a LoRA adapter to a loaded model
-+    // path_base_model is the path to a higher quality model to use as a base for
-+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-+    // will be applied on top of the previous one
-+    // Returns 0 on success
-+    LLAMA_API int32_t llama_model_apply_lora_from_file(
-+            const struct llama_model * model,
-+                            const char * path_lora,
-+                                float   scale,
-+                            const char * path_base_model,
-+                                int32_t   n_threads);
-+
-+
- #ifdef __cplusplus
- }
- #endif
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 80a0dd0f..9d7b0e17 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
-     fputs(text, stderr);
-     fflush(stderr);
- }
-+
-+static int llama_apply_lora_from_file_internal(
-+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
-+) {
-+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
-+
-+    const int64_t t_start_lora_us = ggml_time_us();
-+
-+    llama_file fin(path_lora, "rb");
-+
-+    // verify magic and version
-+    {
-+        uint32_t magic = fin.read_u32();
-+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-+            return 1;
-+        }
-+
-+        uint32_t format_version = fin.read_u32();
-+        if (format_version != 1) {
-+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
-+            return 1;
-+        }
-+    }
-+
-+    int32_t lora_r = fin.read_u32();
-+    int32_t lora_alpha = fin.read_u32();
-+    float scaling = scale * (float)lora_alpha / (float)lora_r;
-+
-+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-+
-+    // load base model
-+    std::unique_ptr<llama_model_loader> ml;
-+    if (path_base_model) {
-+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
-+        ml->init_mappings(/*prefetch*/ false); // no prefetching
-+    }
-+
-+    struct tensor_meta {
-+        std::string name;
-+        ggml_type type;
-+        int32_t ne[2];
-+        size_t offset;
-+    };
-+    std::map<std::string, tensor_meta> tensor_meta_map;
-+
-+    // load all tensor meta
-+    while (true) {
-+        if (fin.tell() == fin.size) {
-+            // eof
-+            break;
-+        }
-+
-+        int32_t n_dims;
-+        int32_t name_len;
-+        int32_t ftype;
-+
-+        fin.read_raw(&n_dims, sizeof(n_dims));
-+        fin.read_raw(&name_len, sizeof(name_len));
-+        fin.read_raw(&ftype, sizeof(ftype));
-+
-+        if (n_dims != 1 && n_dims != 2) {
-+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-+            return 1;
-+        }
-+
-+        int32_t ne[2] = { 1, 1 };
-+        for (int i = 0; i < n_dims; ++i) {
-+            fin.read_raw(&ne[i], sizeof(ne[i]));
-+        }
-+
-+        std::string name;
-+        {
-+            GGML_ASSERT(name_len < GGML_MAX_NAME);
-+            char buf[GGML_MAX_NAME];
-+            fin.read_raw(buf, name_len);
-+            name = std::string(buf, name_len);
-+        }
-+
-+        // check for lora suffix
-+        std::string lora_suffix;
-+        if (name.length() > 6) {
-+            lora_suffix = name.substr(name.length() - 6);
-+        }
-+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-+            return 1;
-+        }
-+
-+        // tensor type
-+        ggml_type wtype;
-+        switch (ftype) {
-+            case 0: wtype = GGML_TYPE_F32;  break;
-+            case 1: wtype = GGML_TYPE_F16;  break;
-+            default:
-+                    {
-+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-+                                __func__, ftype);
-+                        return 1;
-+                    }
-+        }
-+
-+        // data offset
-+        size_t offset = fin.tell();
-+        offset = (offset + 31) & -32;
-+
-+        // skip tensor data
-+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-+
-+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
-+    }
-+
-+    bool warned = false;
-+    int n_tensors = 0;
-+
-+    // apply
-+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-+    if (backend_cpu == nullptr) {
-+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-+        return 1;
-+    }
-+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-+
-+    std::vector<no_init<uint8_t>> read_buf;
-+    for (const auto & it : model.tensors_by_name) {
-+        const std::string & base_name = it.first;
-+        ggml_tensor * model_t = it.second;
-+
-+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-+            continue;
-+        }
-+
-+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
-+
-+        ggml_init_params lora_init_params = {
-+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-+            /* .mem_buffer */ nullptr,
-+            /* .no_alloc   */ true,
-+        };
-+        ggml_context * lora_ctx = ggml_init(lora_init_params);
-+        if (lora_ctx == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        // create tensors
-+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-+        ggml_set_name(loraA, metaA.name.c_str());
-+        ggml_set_name(loraB, metaB.name.c_str());
-+
-+        ggml_tensor * base_t;
-+        if (ml) {
-+            if (!ml->get_tensor_meta(base_name.c_str())) {
-+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-+                return 1;
-+            }
-+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
-+        } else {
-+            base_t = ggml_dup_tensor(lora_ctx, model_t);
-+        }
-+        ggml_set_name(base_t, base_name.c_str());
-+
-+        // allocate in backend buffer
-+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (lora_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-+            return 1;
-+        }
-+
-+        // load tensor data
-+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
-+            read_buf.resize(ggml_nbytes(tensor));
-+            fin.seek(tensor_meta.offset, SEEK_SET);
-+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
-+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-+        };
-+        load_tensor(metaA, loraA);
-+        load_tensor(metaB, loraB);
-+
-+        // load base model tensor data
-+        if (ml) {
-+            ml->load_data_for(base_t);
-+        } else {
-+            ggml_backend_tensor_copy(model_t, base_t);
-+        }
-+
-+        if (ggml_is_quantized(base_t->type) && !warned) {
-+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-+                            "use a f16 or f32 base model with --lora-base\n", __func__);
-+            warned = true;
-+        }
-+
-+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        auto build_lora_graph = [&]() {
-+            // w = w + BA*s
-+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-+            ggml_set_name(BA, "BA");
-+
-+            if (scaling != 1.0f) {
-+                BA = ggml_scale(lora_ctx, BA, scaling);
-+                ggml_set_name(BA, "BA_scaled");
-+            }
-+
-+            ggml_tensor * r;
-+            r = ggml_add_inplace(lora_ctx, base_t, BA);
-+            ggml_set_name(r, "r_add");
-+
-+            if (base_t->type != model_t->type) {
-+                // convert the result to the model type
-+                r = ggml_cast(lora_ctx, r, model_t->type);
-+                ggml_set_name(r, "r_cast");
-+            }
-+
-+            return r;
-+        };
-+
-+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
-+        ggml_tensor * r = build_lora_graph();
-+        ggml_build_forward_expand(gf, r);
-+
-+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (graph_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        ggml_backend_graph_compute(backend_cpu, gf);
-+
-+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-+
-+#if 0
-+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
-+
-+        // sched compute
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_init_measure(sched, gf);
-+
-+        // create the graph again, since the previous one was destroyed by the measure
-+        ggml_graph_clear(gf);
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_graph_compute(sched, gf);
-+        ggml_backend_sched_free(sched);
-+#endif
-+
-+        ggml_backend_buffer_free(lora_buf);
-+        ggml_backend_buffer_free(graph_buf);
-+        ggml_free(lora_ctx);
-+
-+        n_tensors++;
-+        if (n_tensors % 4 == 0) {
-+            LLAMA_LOG_INFO(".");
-+        }
-+    }
-+
-+    ggml_backend_free(backend_cpu);
-+
-+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
-+
-+    return 0;
-+}
-+
-+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-+    try {
-+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
-+    } catch (const std::exception & err) {
-+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-+        return 1;
-+    }
-+}
-\ No newline at end of file
--- a/llm/patches/11-phi3-sliding-window.diff
+++ b/llm/patches/11-phi3-sliding-window.diff
@@ -1,43 +0,0 @@
-From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 31 Jul 2024 14:57:04 -0700
-Subject: [PATCH] phi3 sliding window
-
---
- src/llama.cpp | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..f2872d4e 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
-             } break;
-         case LLM_ARCH_PHI3:
-             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- 
-                 switch (hparams.n_layer) {
-@@ -10762,7 +10762,7 @@ struct llm_build_context {
-         struct ggml_tensor * inp_pos = build_inp_pos();
- 
-         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
-+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
- 
-         for (int il = 0; il < n_layer; ++il) {
-             auto residual = inpL;
-@@ -10820,7 +10820,7 @@ struct llm_build_context {
- 
-                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-             }
- 
-             if (il == n_layer - 1) {
-- 
-2.45.2
-
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := getAvailableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone {
-		requested += "_" + info.Variant.String()
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
 	}

 	servers := []string{}
--- a/llm/server.go
+++ b/llm/server.go
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		systemTotalMemory = systemMemInfo.TotalMemory
 		systemFreeMemory = systemMemInfo.FreeMemory
 		systemSwapFreeMemory = systemMemInfo.FreeSwap
-		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+		slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 	}

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() {
+	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
@@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
-		libraryPaths := []string{dir, filepath.Dir(dir)}
+		// Start with the server directory for the LD_LIBRARY_PATH/PATH
+		libraryPaths := []string{dir}

 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// Append our runner directory to the path
-			// This will favor system libraries over our bundled library dependencies
+			// favor our bundled library dependencies over system libraries
 			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 		}

 		// Note: we always put the dependency path first
-		// since this was the exact version we verified for AMD GPUs
-		// and we favor what the user had in their path
+		// since this was the exact version we compiled/linked against
 		if gpus[0].DependencyPath != "" {
-			// TODO refine for multi-gpu support
+			// assume gpus from the same library have the same dependency path
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}

@@ -411,7 +409,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment them essage about noexec
+			// Detect permission denied and augment the message about noexec
 			if errors.Is(err, os.ErrPermission) {
 				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
 				continue
@@ -586,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	stallDuration := 5 * time.Minute            // If no progress happens
-	finalLoadDuration := 5 * time.Minute        // After we hit 100%, give the runner more time to come online
+	stallDuration := envconfig.LoadTimeout()    // If no progress happens
 	stallTimer := time.Now().Add(stallDuration) // give up if we stall

 	slog.Info("waiting for llama runner to start responding")
@@ -639,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				stallTimer = time.Now().Add(stallDuration)
 			} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
 				slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
-				stallTimer = time.Now().Add(finalLoadDuration)
+				stallTimer = time.Now().Add(stallDuration)
 				fullyLoaded = true
 			}
 			time.Sleep(time.Millisecond * 250)
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -79,7 +79,7 @@ type ChatCompletionRequest struct {
 	Stop             any             `json:"stop"`
 	Temperature      *float64        `json:"temperature"`
 	FrequencyPenalty *float64        `json:"frequency_penalty"`
-	PresencePenalty  *float64        `json:"presence_penalty_penalty"`
+	PresencePenalty  *float64        `json:"presence_penalty"`
 	TopP             *float64        `json:"top_p"`
 	ResponseFormat   *ResponseFormat `json:"response_format"`
 	Tools            []api.Tool      `json:"tools"`
@@ -452,7 +452,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}

 	if r.Temperature != nil {
-		options["temperature"] = *r.Temperature * 2.0
+		options["temperature"] = *r.Temperature
 	} else {
 		options["temperature"] = 1.0
 	}
@@ -462,11 +462,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}

 	if r.FrequencyPenalty != nil {
-		options["frequency_penalty"] = *r.FrequencyPenalty * 2.0
+		options["frequency_penalty"] = *r.FrequencyPenalty
 	}

 	if r.PresencePenalty != nil {
-		options["presence_penalty"] = *r.PresencePenalty * 2.0
+		options["presence_penalty"] = *r.PresencePenalty
 	}

 	if r.TopP != nil {
@@ -513,7 +513,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 	}

 	if r.Temperature != nil {
-		options["temperature"] = *r.Temperature * 2.0
+		options["temperature"] = *r.Temperature
 	} else {
 		options["temperature"] = 1.0
 	}
@@ -522,9 +522,9 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 		options["seed"] = *r.Seed
 	}

-	options["frequency_penalty"] = r.FrequencyPenalty * 2.0
+	options["frequency_penalty"] = r.FrequencyPenalty

-	options["presence_penalty"] = r.PresencePenalty * 2.0
+	options["presence_penalty"] = r.PresencePenalty

 	if r.TopP != 0.0 {
 		options["top_p"] = r.TopP
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -22,7 +22,10 @@ const (
 	image  = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
 )

-var False = false
+var (
+	False = false
+	True  = true
+)

 func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
 	return func(c *gin.Context) {
@@ -70,6 +73,44 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
+		{
+			name: "chat handler with options",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "Hello"}
+				],
+				"stream":            true,
+				"max_tokens":        999,
+				"seed":              123,
+				"stop":              ["\n", "stop"],
+				"temperature":       3.0,
+				"frequency_penalty": 4.0,
+				"presence_penalty":  5.0,
+				"top_p":             6.0,
+				"response_format":   {"type": "json_object"}
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "Hello",
+					},
+				},
+				Options: map[string]any{
+					"num_predict":       999.0, // float because JSON doesn't distinguish between float and int
+					"seed":              123.0,
+					"stop":              []any{"\n", "stop"},
+					"temperature":       3.0,
+					"frequency_penalty": 4.0,
+					"presence_penalty":  5.0,
+					"top_p":             6.0,
+				},
+				Format: "json",
+				Stream: &True,
+			},
+		},
 		{
 			name: "chat handler with image content",
 			body: `{
@@ -186,6 +227,8 @@ func TestChatMiddleware(t *testing.T) {
 			req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body))
 			req.Header.Set("Content-Type", "application/json")

+			defer func() { capturedRequest = nil }()
+
 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

@@ -202,7 +245,6 @@ func TestChatMiddleware(t *testing.T) {
 			if !reflect.DeepEqual(tc.err, errResp) {
 				t.Fatal("errors did not match")
 			}
-			capturedRequest = nil
 		})
 	}
 }
@@ -233,7 +275,7 @@ func TestCompletionsMiddleware(t *testing.T) {
 				Options: map[string]any{
 					"frequency_penalty": 0.0,
 					"presence_penalty":  0.0,
-					"temperature":       1.6,
+					"temperature":       0.8,
 					"top_p":             1.0,
 					"stop":              []any{"\n", "stop"},
 				},
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -4,6 +4,7 @@ set -eu

 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")

 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@@ -21,11 +22,16 @@ for TARGETARCH in ${BUILD_ARCH}; do
        -t builder:$TARGETARCH \
        .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-
-    if [ "$TARGETARCH" = "amd64" ]; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
+    rm -rf ./dist/linux-$TARGETARCH
+    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
+    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
    fi
-
    docker rm builder-$TARGETARCH
+    echo "Compressing final linux bundle..."
+    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
+    if [ -d dist/linux-$TARGETARCH-rocm ]; then
+        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
+    fi
 done
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -7,6 +7,7 @@
 $ErrorActionPreference = "Stop"

 function checkEnv() {
+    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
    Write-host "Building for ${script:TARGET_ARCH}"
    write-host "Locating required tools and paths"
@@ -15,26 +16,23 @@ function checkEnv() {
        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
        $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
    }
-    # Try to find the CUDA dir
-    if ($null -eq $env:NVIDIA_DIR) {
+    # Locate CUDA versions
+    # Note: this assumes every version found will be built
+    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
+    if ($cudaList.length -eq 0) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($d -ne $null) {
-            $script:NVIDIA_DIR=($d| split-path -parent)
-        } else {
-            $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
-            if ($cudaList.length > 0) {
-                $script:NVIDIA_DIR=$cudaList[0]
-            }
+        if ($null -ne $d) {
+            $script:CUDA_DIRS=@($d| split-path -parent)
        }
    } else {
-        $script:NVIDIA_DIR=$env:NVIDIA_DIR
+        $script:CUDA_DIRS=$cudaList
    }
    
    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]

    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
-    echo "Checking version"
+    Write-Output "Checking version"
    if (!$env:VERSION) {
        $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
        $pattern="v(.+)"
@@ -71,7 +69,48 @@ function checkEnv() {
 function buildOllama() {
    write-host "Building ollama CLI"
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
-        & go generate ./...
+        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
+
+        # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
+        #        which targets to build
+
+        # Start by skipping CUDA to build everything else
+        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
+
+        # Then skip everyhting else and build all the CUDA variants
+        foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
+            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
+
+            if ($env:CUDA_LIB_DIR.Contains("v12")) {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            } else {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS=""
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            }
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        }
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@@ -103,22 +142,22 @@ function buildApp() {
 function gatherDependencies() {
    write-host "Gathering runtime dependencies"
    cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
+    md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null

    # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
    # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
    foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
-        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
+        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
    }


    cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
    if ("${env:KEY_CONTAINER}") {
        write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
            write-host "signing $file"
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -38,7 +38,7 @@ IS_WSL2=false
 KERN=$(uname -r)
 case "$KERN" in
    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
-    *icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
+    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
    *) ;;
 esac

@@ -63,16 +63,36 @@ if [ -n "$NEEDS" ]; then
    exit 1
 fi

-status "Downloading ollama..."
-curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-
 for BINDIR in /usr/local/bin /usr/bin /bin; do
    echo $PATH | grep -q $BINDIR && break || continue
 done
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})

-status "Installing ollama to $BINDIR..."
+status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
-$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
+$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
+if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
+    status "Downloading Linux ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+else
+    status "Downloading Linux ${ARCH} CLI"
+    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
+    "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
+    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
+    BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+fi
+

 install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -178,6 +198,16 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
+    if [ $BUNDLE -ne 0 ]; then
+        status "Downloading Linux ROCm ${ARCH} bundle"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+
+        install_success
+        status "AMD GPU ready."
+        exit 0
+    fi
    # Look for pre-existing ROCm v6 before downloading the dependencies
    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
@@ -326,12 +356,12 @@ if ! lsmod | grep -q nvidia || ! lsmod | grep -q nvidia_uvm; then
 fi

 # make sure the NVIDIA modules are loaded on boot with nvidia-persistenced
-if command -v nvidia-persistenced > /dev/null 2>&1; then
+if available nvidia-persistenced; then
    $SUDO touch /etc/modules-load.d/nvidia.conf
    MODULES="nvidia nvidia-uvm"
    for MODULE in $MODULES; do
        if ! grep -qxF "$MODULE" /etc/modules-load.d/nvidia.conf; then
-            echo "$MODULE" | sudo tee -a /etc/modules-load.d/nvidia.conf > /dev/null
+            echo "$MODULE" | $SUDO tee -a /etc/modules-load.d/nvidia.conf > /dev/null
        fi
    done
 fi
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -3,6 +3,7 @@
 # Script for common Dockerfile dependency installation in redhat linux based images

 set -ex
+set -o pipefail
 MACHINE=$(uname -m)

 if grep -i "centos" /etc/system-release >/dev/null; then
@@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
 elif grep -i "rocky" /etc/system-release >/dev/null; then
    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -43,12 +44,22 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
    dnf install -y git \
        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        findutils \
+        pigz
 else
    echo "ERROR Unexpected distro"
    exit 1
 fi

+if [ "${MACHINE}" = "x86_64" ] ; then
+    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
+    mv /tmp/ccache /usr/local/bin/
+else
+    yum -y install epel-release
+    yum install -y ccache
+fi
+
 if [ -n "${CMAKE_VERSION}" ]; then
    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi
--- a/server/download.go
+++ b/server/download.go
@@ -256,7 +256,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 				continue
 			}
 			defer resp.Body.Close()
-			if resp.StatusCode != http.StatusTemporaryRedirect {
+			if resp.StatusCode != http.StatusTemporaryRedirect && resp.StatusCode != http.StatusOK {
 				return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
 			}
 			return resp.Location()
--- a/server/images.go
+++ b/server/images.go
@@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}

-	if _, err = os.Stat(fp); err != nil {
-		return nil, "", err
-	}
-
-	var manifest *Manifest
-
-	bts, err := os.ReadFile(fp)
+	f, err := os.Open(fp)
 	if err != nil {
-		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
+		return nil, "", err
 	}
+	defer f.Close()

-	shaSum := sha256.Sum256(bts)
-	shaStr := hex.EncodeToString(shaSum[:])
+	sha256sum := sha256.New()

-	if err := json.Unmarshal(bts, &manifest); err != nil {
+	var manifest Manifest
+	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
 		return nil, "", err
 	}

-	return manifest, shaStr, nil
+	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
 }

 func GetModel(name string) (*Model, error) {
@@ -374,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	parameters := make(map[string]any)

 	var layers []Layer
+	var baseLayers []*layerGGML
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
+		command := c.Name

-		switch c.Name {
+		switch command {
 		case "model", "adapter":
-			var baseLayers []*layerGGML
-			if name := model.ParseName(c.Args); name.IsValid() {
+			if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
 				baseLayers, err = parseFromModel(ctx, name, fn)
 				if err != nil {
 					return err
@@ -414,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				}
 				defer blob.Close()

-				baseLayers, err = parseFromFile(ctx, blob, digest, fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
 				if err != nil {
 					return err
 				}
 			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
 				defer file.Close()

-				baseLayers, err = parseFromFile(ctx, file, "", fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
 				if err != nil {
 					return err
 				}
@@ -692,43 +688,18 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
-	fp, err := GetManifestPath()
+func deleteUnusedLayers(deleteMap map[string]struct{}) error {
+	manifests, err := Manifests()
 	if err != nil {
 		return err
 	}

-	walkFunc := func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
-		if err != nil {
-			return err
-		}
-
+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}

 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}

 	// only delete the files which are still in the deleteMap
@@ -781,8 +752,7 @@ func PruneLayers() error {

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
+	if err := deleteUnusedLayers(deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -877,26 +847,19 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
-			return err
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// noop
+	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	} else {
+		for _, l := range manifest.Layers {
+			deleteMap[l.Digest] = struct{}{}
 		}
-
-		if manifest != nil {
-			for _, l := range manifest.Layers {
-				deleteMap[l.Digest] = struct{}{}
-			}
-			if manifest.Config.Digest != "" {
-				deleteMap[manifest.Config.Digest] = struct{}{}
-			}
+		if manifest.Config.Digest != "" {
+			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}

@@ -975,11 +938,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}

-	if noprune == "" {
-		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap)
-		if err != nil {
-			slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+	if !envconfig.NoPrune() && len(deleteMap) > 0 {
+		fn(api.ProgressResponse{Status: "removing unused layers"})
+		if err := deleteUnusedLayers(deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
@@ -1000,12 +961,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()

-	var m *Manifest
+	var m Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}

-	return m, err
+	return &m, err
 }

 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
--- a/server/layer.go
+++ b/server/layer.go
@@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 		if err := os.Rename(temp.Name(), blob); err != nil {
 			return Layer{}, err
 		}
+		if err := os.Chmod(blob, 0o644); err != nil {
+			return Layer{}, err
+		}
 	}

 	return Layer{
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"log/slog"
 	"os"
@@ -150,14 +151,16 @@ func Manifests() (map[model.Name]*Manifest, error) {

 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
-				slog.Warn("bad manifest name", "path", rel, "error", err)
+				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}

 			m, err := ParseNamedManifest(n)
-			if err != nil {
+			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
+			} else if err != nil {
+				return nil, fmt.Errorf("%s: %w", n, err)
 			}

 			ms[n] = m
--- a/server/model.go
+++ b/server/model.go
@@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }

-func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	fi, err := f.Stat()
 	if err != nil {
 		return nil, err
@@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	defer t.Close()
 	defer os.Remove(t.Name())

-	fn(api.ProgressResponse{Status: "converting model"})
-	if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
-		return nil, err
+	var layerType string
+
+	switch command {
+	case "adapter":
+		var baseModel *llm.GGML
+		for _, l := range baseLayers {
+			if l.GGML != nil {
+				baseModel = l.GGML
+				break
+			}
+		}
+
+		if baseModel == nil {
+			return nil, fmt.Errorf("no base model specified for the adapter")
+		}
+
+		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
+			return nil, err
+		}
+		layerType = "application/vnd.ollama.image.adapter"
+	case "model":
+		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+			return nil, err
+		}
+		layerType = "application/vnd.ollama.image.model"
 	}

 	if _, err := t.Seek(0, io.SeekStart); err != nil {
 		return nil, err
 	}

-	layer, err := NewLayer(t, "application/vnd.ollama.image.model")
+	layer, err := NewLayer(t, layerType)
 	if err != nil {
 		return nil, err
 	}
@@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	return detectChatTemplate(layers)
 }

-func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	sr := io.NewSectionReader(file, 0, 512)
 	contentType, err := detectContentType(sr)
 	if err != nil {
@@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 	case "gguf", "ggla":
 		// noop
 	case "application/zip":
-		return parseFromZipFile(ctx, file, digest, fn)
+		return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
 	default:
 		return nil, fmt.Errorf("unsupported content type: %s", contentType)
 	}
@@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 		}

 		mediatype := "application/vnd.ollama.image.model"
-		if ggml.Name() == "ggla" {
+		if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
 			mediatype = "application/vnd.ollama.image.adapter"
 		} else if ggml.KV().Architecture() == "clip" {
 			mediatype = "application/vnd.ollama.image.projector"
--- a/server/model_test.go
+++ b/server/model_test.go
@@ -139,6 +139,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,

 func TestParseFromFileFromLayer(t *testing.T) {
 	tempModels := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempModels)

 	file, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@@ -153,7 +154,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@@ -166,7 +167,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
+	layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@@ -189,6 +190,7 @@ func TestParseFromFileFromLayer(t *testing.T) {

 func TestParseLayerFromCopy(t *testing.T) {
 	tempModels := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempModels)

 	file2, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@@ -206,7 +208,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -73,18 +73,6 @@ func ParseModelPath(name string) ModelPath {

 var errModelPathInvalid = errors.New("invalid model path")

-func (mp ModelPath) Validate() error {
-	if mp.Repository == "" {
-		return fmt.Errorf("%w: model repository name is required", errModelPathInvalid)
-	}
-
-	if strings.Contains(mp.Tag, ":") {
-		return fmt.Errorf("%w: ':' (colon) is not allowed in tag names", errModelPathInvalid)
-	}
-
-	return nil
-}
-
 func (mp ModelPath) GetNamespaceRepository() string {
 	return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
 }
@@ -105,7 +93,11 @@ func (mp ModelPath) GetShortTagname() string {

 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
+	if p := filepath.Join(mp.Registry, mp.Namespace, mp.Repository, mp.Tag); filepath.IsLocal(p) {
+		return filepath.Join(envconfig.Models(), "manifests", p), nil
+	}
+
+	return "", errModelPathInvalid
 }

 func (mp ModelPath) BaseURL() *url.URL {
--- a/server/modelpath_test.go
+++ b/server/modelpath_test.go
@@ -1,6 +1,7 @@
 package server

 import (
+	"errors"
 	"os"
 	"path/filepath"
 	"testing"
@@ -154,3 +155,10 @@ func TestParseModelPath(t *testing.T) {
 		})
 	}
 }
+
+func TestInsecureModelpath(t *testing.T) {
+	mp := ParseModelPath("../../..:something")
+	if _, err := mp.GetManifestPath(); !errors.Is(err, errModelPathInvalid) {
+		t.Errorf("expected error: %v", err)
+	}
+}
--- a/server/routes.go
+++ b/server/routes.go
@@ -463,7 +463,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

-func (s *Server) PullModelHandler(c *gin.Context) {
+func (s *Server) PullHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -513,7 +513,7 @@ func (s *Server) PullModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) PushModelHandler(c *gin.Context) {
+func (s *Server) PushHandler(c *gin.Context) {
 	var req api.PushRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -577,7 +577,7 @@ func checkNameExists(name model.Name) error {
 	return nil
 }

-func (s *Server) CreateModelHandler(c *gin.Context) {
+func (s *Server) CreateHandler(c *gin.Context) {
 	var r api.CreateRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -647,7 +647,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) DeleteModelHandler(c *gin.Context) {
+func (s *Server) DeleteHandler(c *gin.Context) {
 	var r api.DeleteRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -680,7 +680,7 @@ func (s *Server) DeleteModelHandler(c *gin.Context) {
 	}
 }

-func (s *Server) ShowModelHandler(c *gin.Context) {
+func (s *Server) ShowHandler(c *gin.Context) {
 	var req api.ShowRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -829,7 +829,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
 	return kv, nil
 }

-func (s *Server) ListModelsHandler(c *gin.Context) {
+func (s *Server) ListHandler(c *gin.Context) {
 	ms, err := Manifests()
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -879,7 +879,7 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ListResponse{Models: models})
 }

-func (s *Server) CopyModelHandler(c *gin.Context) {
+func (s *Server) CopyHandler(c *gin.Context) {
 	var r api.CopyRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -1081,33 +1081,33 @@ func (s *Server) GenerateRoutes() http.Handler {
 		allowedHostsMiddleware(s.addr),
 	)

-	r.POST("/api/pull", s.PullModelHandler)
+	r.POST("/api/pull", s.PullHandler)
 	r.POST("/api/generate", s.GenerateHandler)
 	r.POST("/api/chat", s.ChatHandler)
 	r.POST("/api/embed", s.EmbedHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)
-	r.POST("/api/create", s.CreateModelHandler)
-	r.POST("/api/push", s.PushModelHandler)
-	r.POST("/api/copy", s.CopyModelHandler)
-	r.DELETE("/api/delete", s.DeleteModelHandler)
-	r.POST("/api/show", s.ShowModelHandler)
+	r.POST("/api/create", s.CreateHandler)
+	r.POST("/api/push", s.PushHandler)
+	r.POST("/api/copy", s.CopyHandler)
+	r.DELETE("/api/delete", s.DeleteHandler)
+	r.POST("/api/show", s.ShowHandler)
 	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
-	r.GET("/api/ps", s.ProcessHandler)
+	r.GET("/api/ps", s.PsHandler)

 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
 	r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
 	r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
-	r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler)
-	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler)
+	r.GET("/v1/models", openai.ListMiddleware(), s.ListHandler)
+	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowHandler)

 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
 		})

-		r.Handle(method, "/api/tags", s.ListModelsHandler)
+		r.Handle(method, "/api/tags", s.ListHandler)
 		r.Handle(method, "/api/version", func(c *gin.Context) {
 			c.JSON(http.StatusOK, gin.H{"version": version.Version})
 		})
@@ -1269,7 +1269,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 	})
 }

-func (s *Server) ProcessHandler(c *gin.Context) {
+func (s *Server) PsHandler(c *gin.Context) {
 	models := []api.ProcessModelResponse{}

 	for _, v := range s.sched.loaded {
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -93,7 +93,7 @@ func TestCreateFromBin(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)

 	var s Server
-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -120,7 +120,7 @@ func TestCreateFromModel(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -134,7 +134,7 @@ func TestCreateFromModel(t *testing.T) {
 		filepath.Join(p, "manifests", "registry.ollama.ai", "library", "test", "latest"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test",
 		Stream:    &stream,
@@ -162,7 +162,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -182,7 +182,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-bc80b03733773e0728011b2f4adf34c458b400e1aad48cb28d61170f3a2ad2d6"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -210,7 +210,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -230,7 +230,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-f29e82a8284dbdf5910b1555580ff60b04238b8da9d5e51159ada67a4d0d5851"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -267,7 +267,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -288,7 +288,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	})

 	// in order to merge parameters, the second model must be created FROM the first
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7",
 		Stream:    &stream,
@@ -326,7 +326,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	}

 	// slices are replaced
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7\nPARAMETER stop <|endoftext|>",
 		Stream:    &stream,
@@ -371,7 +371,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -391,7 +391,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-e0e27d47045063ccb167ae852c51d49a98eab33fabaee4633fdddf97213e40b5"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nMESSAGE assistant \"You're a test, Harry.\"\nMESSAGE user \"I-I'm a what?\"\nMESSAGE assistant \"A test. And a thumping good one at that, I'd wager.\"",
 		Stream:    &stream,
@@ -448,7 +448,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -488,7 +488,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	}

 	t.Run("incomplete template", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -500,7 +500,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with unclosed if", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -512,7 +512,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with undefined function", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{  Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -531,7 +531,7 @@ func TestCreateLicenses(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -579,7 +579,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	var s Server

 	t.Run("matched", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name: "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
@@ -593,14 +593,14 @@ func TestCreateDetectTemplate(t *testing.T) {

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
 			filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"),
+			filepath.Join(p, "blobs", "sha256-35360843d0c84fb1506952a131bbef13cd2bb4a541251f22535170c05b56e672"),
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
-			filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
-			filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"),
+			filepath.Join(p, "blobs", "sha256-de3959f841e9ef6b4b6255fa41cb9e0a45da89c3066aa72bdd07a4747f848990"),
 		})
 	})

 	t.Run("unmatched", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 			Stream:    &stream,
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@@ -22,7 +22,7 @@ func TestDelete(t *testing.T) {

 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 	})
@@ -31,7 +31,7 @@ func TestDelete(t *testing.T) {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 	})
@@ -52,7 +52,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
+	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -68,7 +68,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test2"})
+	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test2"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -102,7 +102,7 @@ func TestDeleteDuplicateLayers(t *testing.T) {
 		t.Fatal(err)
 	}

-	w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
+	w := createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
 	if w.Code != http.StatusOK {
 		t.Errorf("expected status code 200, actual %d", w.Code)
 	}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -84,7 +84,7 @@ func TestGenerateChat(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@@ -144,7 +144,7 @@ func TestGenerateChat(t *testing.T) {
 	})

 	t.Run("missing capabilities chat", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@@ -270,7 +270,7 @@ func TestGenerateChat(t *testing.T) {
 		checkChatResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@@ -382,7 +382,7 @@ func TestGenerate(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@@ -442,7 +442,7 @@ func TestGenerate(t *testing.T) {
 	})

 	t.Run("missing capabilities generate", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@@ -583,7 +583,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@@ -652,7 +652,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test-suffix",
 		Modelfile: `FROM test
 TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@@ -31,13 +31,13 @@ func TestList(t *testing.T) {

 	var s Server
 	for _, n := range expectNames {
-		createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      n,
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		})
 	}

-	w := createRequest(t, s.ListModelsHandler, nil)
+	w := createRequest(t, s.ListHandler, nil)
 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -318,7 +318,7 @@ func TestCase(t *testing.T) {
 	var s Server
 	for _, tt := range cases {
 		t.Run(tt, func(t *testing.T) {
-			w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			w := createRequest(t, s.CreateHandler, api.CreateRequest{
 				Name:      tt,
 				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 				Stream:    &stream,
@@ -334,7 +334,7 @@ func TestCase(t *testing.T) {
 			}

 			t.Run("create", func(t *testing.T) {
-				w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+				w = createRequest(t, s.CreateHandler, api.CreateRequest{
 					Name:      strings.ToUpper(tt),
 					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 					Stream:    &stream,
@@ -350,7 +350,7 @@ func TestCase(t *testing.T) {
 			})

 			t.Run("pull", func(t *testing.T) {
-				w := createRequest(t, s.PullModelHandler, api.PullRequest{
+				w := createRequest(t, s.PullHandler, api.PullRequest{
 					Name:   strings.ToUpper(tt),
 					Stream: &stream,
 				})
@@ -365,7 +365,7 @@ func TestCase(t *testing.T) {
 			})

 			t.Run("copy", func(t *testing.T) {
-				w := createRequest(t, s.CopyModelHandler, api.CopyRequest{
+				w := createRequest(t, s.CopyHandler, api.CopyRequest{
 					Source:      tt,
 					Destination: strings.ToUpper(tt),
 				})
@@ -387,7 +387,7 @@ func TestShow(t *testing.T) {

 	var s Server

-	createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name: "show-model",
 		Modelfile: fmt.Sprintf(
 			"FROM %s\nFROM %s",
@@ -396,7 +396,7 @@ func TestShow(t *testing.T) {
 		),
 	})

-	w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
+	w := createRequest(t, s.ShowHandler, api.ShowRequest{
 		Name: "show-model",
 	})

--- a/server/sched.go
+++ b/server/sched.go
@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

+					// Embedding models should always be loaded with parallel=1
+					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+						numParallel = 1
+					}
+
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
@@ -734,7 +739,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	*numParallel = 1
+	if *numParallel <= 0 {
+		*numParallel = 1
+		req.opts.NumCtx = req.origNumCtx
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	require.NoError(t, llm.WriteGGUF(f, llm.KV{
 		"general.architecture":          "llama",
-		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(1),
--- a/server/upload.go
+++ b/server/upload.go
@@ -45,7 +45,7 @@ type blobUpload struct {
 }

 const (
-	numUploadParts          = 64
+	numUploadParts          = 16
 	minUploadPartSize int64 = 100 * format.MegaByte
 	maxUploadPartSize int64 = 1000 * format.MegaByte
 )
--- a/template/alfred.gotmpl
+++ b/template/alfred.gotmpl
@@ -1 +1,2 @@
-{{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
+{{- range .Messages }}<start_{{ .Role }}>{{ .Content }}<end_message>
+{{- end }}<start_assistant>
--- a/template/alpaca.gotmpl
+++ b/template/alpaca.gotmpl
@@ -1,8 +1,18 @@
-{{ if .System }}{{ .System }}
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- else if eq .Role "user" }}
+{{- if $system }}{{ $system }}

-{{ end }}{{ if .Prompt }}### Instruction:
-{{ .Prompt }}
+{{ $system = "" }}
+{{- end }}### Instruction:
+{{ .Content }}

-{{ end }}### Response:
-{{ .Response }}
+{{ else if eq .Role "assistant" }}### Response:
+{{ .Content }}

+{{ end }}
+{{- end }}### Response:
--- a/template/chatml.gotmpl
+++ b/template/chatml.gotmpl
@@ -1,6 +1,3 @@
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}{{ if .Prompt }}<|im_start|>user
-{{ .Prompt }}<|im_end|>
+{{- range .Messages }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
-{{ .Response }}<|im_end|>
--- a/template/chatqa.gotmpl
+++ b/template/chatqa.gotmpl
@@ -1,6 +1,7 @@
-{{ if .System }}System: {{ .System }}
-
-{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
-
-{{ end }}Assistant: {{ .Response }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System:
+{{- else if eq .Role "user" }}User:
+{{- else if eq .Role "assistant" }}Assistant:
+{{- end }} {{ .Content }}

+{{ end }}Assistant:
--- a/template/codellama-70b-instruct.gotmpl
+++ b/template/codellama-70b-instruct.gotmpl
@@ -1,10 +1,10 @@
-{{ if .System }}Source: system
-
- {{ .System }} <step> {{ end }}Source: user
-
- {{ .Prompt }} <step> Source: assistant
-{{- if not .Response }}
-Destination: user
+{{- range .Messages }}Source:
+{{- if eq .Role "system" }} system
+{{- else if eq .Role "user" }} user
+{{- else if eq .Role "assistant" }} assistant
 {{- end }}

- {{ .Response }} <step> 
+ {{ .Content }} <step> {{ end }}Source: assistant
+Destination: user
+
+ 
--- a/template/falcon-instruct.gotmpl
+++ b/template/falcon-instruct.gotmpl
@@ -1,5 +1,8 @@
-{{ if .System }}System: {{ .System }}
-{{ end }}{{ if .Prompt }}User:
-{{ .Prompt }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System: {{ .Content }}
+{{ continue }}
+{{- else if eq .Role "user" }}User:
+{{- else if eq .Role "assistant" }}Falcon:
+{{- end }}
+{{ .Content }}
 {{ end }}Falcon:
-{{ .Response }}
--- a/template/gemma-instruct.gotmpl
+++ b/template/gemma-instruct.gotmpl
@@ -1,5 +1,16 @@
-<start_of_turn>user
-{{ if .System }}{{ .System }}
-{{ end }}{{ .Prompt }}<end_of_turn>
-<start_of_turn>model
-{{ .Response }}<end_of_turn>
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- continue }}
+{{- else if eq .Role "user" }}<start_of_turn>user
+{{- if $system }}
+{{ $system }}
+{{- $system = "" }}
+{{- end }}
+{{- else if eq .Role "assistant" }}<start_of_turn>model
+{{- end }}
+{{ .Content }}<end_of_turn>
+{{ end }}<start_of_turn>model
--- a/template/granite-instruct.gotmpl
+++ b/template/granite-instruct.gotmpl
@@ -1,9 +1,8 @@
-{{ if .System }}System:
-{{ .System }}
-
-{{ end }}{{ if .Prompt }}Question:
-{{ .Prompt }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System:
+{{- else if eq .Role "user" }}Question:
+{{- else if eq .Role "assistant" }}Answer:
+{{- end }}
+{{ .Content }}

 {{ end }}Answer:
-{{ .Response }}
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
likelovewant	486ae433ae	Merge branch 'ollama:main' into main	2024-09-09 19:33:57 +08:00
Jeffrey Morgan	bb6a086d63	readme: add crewAI to community integrations (#6699 )	2024-09-08 00:36:24 -07:00
RAPID ARCHITECT	30c8f201cc	readme: add crewAI with mesop to community integrations	2024-09-08 00:35:59 -07:00
likelovewant	06db1f2cf5	Merge branch 'ollama:main' into main	2024-09-08 11:44:55 +08:00
frob	06d4fba851	openai: align chat temperature and frequency_penalty options with completion (#6688 )	2024-09-07 09:08:08 -07:00
Jeffrey Morgan	108fb6c1d1	docs: improve linux install documentation (#6683 ) Includes small improvements to document layout and code blocks	2024-09-06 22:05:37 -07:00
Yaroslav	da915345d1	openai: don't scale temperature or frequency_penalty (#6514 )	2024-09-06 17:45:45 -07:00
nickthecook	8a027bc401	readme: add Archyve to community integrations (#6680 )	2024-09-06 14:06:01 -07:00
imoize	5446903fbd	readme: add Plasmoid Ollama Control to community integrations (#6681 )	2024-09-06 14:04:12 -07:00
Daniel Hiltgen	56318fb365	Improve logging on GPU too small (#6666 ) When we determine a GPU is too small for any layers, it's not always clear why. This will help troubleshoot those scenarios.	2024-09-06 08:29:36 -07:00
frob	fe91d7fff1	openai: fix "presence_penalty" typo and add test (#6665 )	2024-09-06 01:16:28 -07:00
Patrick Devine	608e87bf87	Fix gemma2 2b conversion (#6645 )	2024-09-05 17:02:28 -07:00
Daniel Hiltgen	48685c6ed0	Document uninstall on windows (#6663 )	2024-09-05 15:57:38 -07:00
Daniel Hiltgen	9565fa64a8	Revert "Detect running in a container (#6495 )" (#6662 ) This reverts commit `a60d9b89ce`.	2024-09-05 14:26:00 -07:00
Daniel Hiltgen	6719097649	llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT With the new very large parameter models, some users are willing to wait for a very long time for models to load.	2024-09-05 14:00:08 -07:00
Daniel Hiltgen	b05c9e83d9	Introduce GPU Overhead env var (#5922 ) Provide a mechanism for users to set aside an amount of VRAM on each GPU to make room for other applications they want to start after Ollama, or workaround memory prediction bugs	2024-09-05 13:46:35 -07:00
Daniel Hiltgen	a60d9b89ce	Detect running in a container (#6495 )	2024-09-05 13:24:51 -07:00
Michael Yang	bf612cd608	Merge pull request #6260 from ollama/mxyng/mem llama3.1 memory	2024-09-05 13:22:08 -07:00
Zeyo	ef98e56122	readme: add AiLama to the list of community integrations (#4957 )	2024-09-05 13:10:44 -07:00
Michael	5f944baac7	Update gpu.md: Add RTX 3050 Ti and RTX 3050 Ti (#5888 ) * Update gpu.md Seems strange that the laptop versions of 3050 and 3050 Ti would be supported but not the non-notebook, but this is what the page (https://developer.nvidia.com/cuda-gpus) says. Signed-off-by: bean5 <2052646+bean5@users.noreply.github.com> * Update gpu.md Remove notebook reference --------- Signed-off-by: bean5 <2052646+bean5@users.noreply.github.com>	2024-09-05 11:24:26 -07:00
Tobias Heinze	6fc9d22707	server: fix blob download when receiving a 200 response (#6656 )	2024-09-05 10:48:26 -07:00
Vitaly Zdanevich	f27c00d8c5	readme: add Gentoo package manager entry to community integrations (#5714 )	2024-09-05 09:58:14 -07:00
王卿	c7c845ec52	Update install.sh：Replace "command -v" with encapsulated functionality (#6035 ) Replace "command -v" with encapsulated functionality	2024-09-05 09:49:48 -07:00
Augustinas Malinauskas	cf48603943	readme: include Enchanted for Apple Vision Pro (#4949 ) Added Enchanted with Apple Vision Pro support	2024-09-05 01:30:19 -04:00
Silas Marvin	6e67be09b6	readme: add lsp-ai to community integrations (#5063 )	2024-09-05 01:17:34 -04:00
Arda Günsüren	0f5f060d2b	readme: add ollama-php library to community integrations (#6361 )	2024-09-05 01:01:14 -04:00
jk011ru	b3554778bd	readme: add vnc-lm discord bot community integration (#6644 )	2024-09-04 19:46:02 -04:00
Pascal Patry	bbe7b96ded	llm: use json.hpp from common (#6642 )	2024-09-04 19:34:42 -04:00
Rune Berg	c18ff18b2c	readme: add confichat to community integrations (#6378 )	2024-09-04 17:26:02 -04:00
Tomoya Fujita	133770a548	docs: add group to manual Linux isntructions and verify service is running (#6430 )	2024-09-04 14:45:09 -04:00
Teïlo M	f36ebfb478	readme: add gollm to the list of community libraries (#6099 )	2024-09-04 14:19:41 -04:00
亢奋猫	5b55379651	readme: add Cherry Studio to community integrations (#6633 )	2024-09-04 10:53:36 -04:00
Mitar	93eb43d020	readme: add Go fun package (#6421 )	2024-09-04 10:52:46 -04:00
Carter	369479cc30	docs: fix spelling error (#6391 ) change "dorrect" to "correct"	2024-09-04 09:42:33 -04:00
Erkin Alp Güney	7d89e48f5c	install.sh: update instructions to use WSL2 (#6450 )	2024-09-04 09:34:53 -04:00
Sam	27bcce6d9f	readme: add claude-dev to community integrations (#6630 )	2024-09-04 09:32:26 -04:00
Viz	491fc312ae	readme: add PyOllaMx project (#6624 )	2024-09-03 23:10:53 -04:00
Jeffrey Morgan	5e2653f9fe	llm: update llama.cpp commit to 8962422 (#6618 )	2024-09-03 21:12:39 -04:00
Daniel Hiltgen	f29b167e1a	Use cuda v11 for driver 525 and older (#6620 ) It looks like driver 525 (aka, cuda driver 12.0) has problems with the cuda v12 library we compile against, so run v11 on those older drivers if detected.	2024-09-03 17:15:31 -07:00
Daniel Hiltgen	037a4d103e	Log system memory at info (#6617 ) On systems with low system memory, we can hit allocation failures that are difficult to diagnose without debug logs. This will make it easier to spot.	2024-09-03 14:55:20 -07:00
Mateusz Migas	50c05d57e0	readme: add Painting Droid community integration (#5514 )	2024-09-03 16:15:54 -04:00
Amith Koujalgi	35159de18a	readme: update Ollama4j link and add link to Ollama4j Web UI (#6608 )	2024-09-03 16:08:50 -04:00
FellowTraveler	94fff5805f	Fix sprintf to snprintf (#5664 ) /Users/au/src/ollama/llm/ext_server/server.cpp:289:9: warning: 'sprintf' is deprecated: This function is provided for compatibility reasons only. Due to security concerns inherent in the design of sprintf(3), it is highly recommended that you use snprintf(3) instead.	2024-09-03 09:32:59 -07:00
OpenVMP	14d5093cd0	readme: add PartCAD tool to readme for generating 3D CAD models using Ollama (#6605 )	2024-09-03 12:28:01 -04:00
R0CKSTAR	9df5f0e8e4	Reduce docker image size (#5847 ) Signed-off-by: Xiaodong Ye <yeahdongcn@gmail.com>	2024-09-03 09:25:31 -07:00
presbrey	ad3eb00bee	readme: add OllamaFarm project (#6508 )	2024-09-02 16:05:36 -04:00
Jonathan Hecl	bfc2d61549	readme: add go-crew and Ollamaclient projects (#6583 )	2024-09-02 15:34:26 -04:00
SnoopyTlion	741affdfd6	docs: update faq.md for OLLAMA_MODELS env var permissions (#6587 )	2024-09-02 15:31:29 -04:00
likelovewant	8a7baa1bbf	Merge branch 'ollama:main' into main	2024-09-02 14:58:22 +08:00
Vimal Kumar	5f7b4a5e30	fix(cmd): show info may have nil ModelInfo (#6579 )	2024-08-31 21:12:17 -07:00
rayfiyo	1aad838707	docs: update GGUF examples and references (#6577 )	2024-08-31 19:34:25 -07:00
Daniel Hiltgen	a1cef4d0a5	Add findutils to base images (#6581 ) This caused missing internal files	2024-08-31 10:40:05 -07:00
Michael Yang	c41f0b9e6c	Merge pull request #6562 from ollama/mxyng/build-artifacts remove any unneeded build artifacts	2024-08-30 09:40:50 -07:00
Michael Yang	142cbb722d	Merge pull request #6482 from ollama/mxyng/client-path passthrough OLLAMA_HOST path to client	2024-08-30 09:40:34 -07:00
Michael Yang	9468c6824a	Merge pull request #6534 from ollama/mxyng/messages update templates to use messages	2024-08-30 09:39:59 -07:00
Michael Yang	11018196e0	remove any unneeded build artifacts	2024-08-29 13:40:47 -07:00
Bryan Honof	56346ccfa3	doc: Add Nix and Flox to package manager listing (#6074 )	2024-08-29 12:45:35 -04:00
Patrick Devine	8e4e509fa4	update the openai docs to explain how to set the context size (#6548 )	2024-08-28 17:11:46 -07:00
Michael Yang	47c2b947a9	Merge pull request #6546 from ollama/mxyng/fix-test fix(test): do not clobber models directory	2024-08-28 15:37:47 -07:00
Michael Yang	5eb77bf976	Merge pull request #6539 from ollama/mxyng/validate-modelpath fix: validate modelpath	2024-08-28 14:38:27 -07:00
Michael Yang	e4d0a9c325	fix(test): do not clobber models directory	2024-08-28 14:07:48 -07:00
Patrick Devine	7416ced70f	add llama3.1 chat template (#6545 )	2024-08-28 14:03:20 -07:00
Michael Yang	9cfd2dd3e3	Merge pull request #6522 from ollama/mxyng/detect-chat detect chat template from configs that contain lists	2024-08-28 11:04:18 -07:00
Michael Yang	8e6da3cbc5	update deprecated warnings	2024-08-28 09:55:11 -07:00
Michael Yang	d9d50c43cc	validate model path	2024-08-28 09:32:57 -07:00
likelovewant	76feb6c569	Merge branch 'ollama:main' into main	2024-08-28 12:02:21 +08:00
Patrick Devine	6c1c1ad6a9	throw an error when encountering unsupport tensor sizes (#6538 )	2024-08-27 17:54:04 -07:00
Daniel Hiltgen	93ea9240ae	Move ollama executable out of bin dir (#6535 )	2024-08-27 16:19:00 -07:00
Michael Yang	413ae39f3c	update templates to use messages	2024-08-27 15:44:04 -07:00
Michael Yang	60e47573a6	more tokenizer tests	2024-08-27 14:51:10 -07:00
Patrick Devine	d13c3daa0b	add safetensors to the modelfile docs (#6532 )	2024-08-27 14:46:47 -07:00
Patrick Devine	1713eddcd0	Fix import image width (#6528 )	2024-08-27 14:19:47 -07:00
Daniel Hiltgen	4e1c4f6e0b	Update manual instructions with discrete ROCm bundle (#6445 )	2024-08-27 13:42:28 -07:00
Sean Khatiri	397cae7962	llm: fix typo in comment (#6530 )	2024-08-27 13:28:29 -07:00
Patrick Devine	1c70a00f71	adjust image sizes	2024-08-27 11:15:25 -07:00
Michael Yang	eae3af6807	clean up convert tokenizer	2024-08-27 11:11:43 -07:00
Michael Yang	3eb08377f8	detect chat template from configs that contain lists	2024-08-27 10:49:33 -07:00
Patrick Devine	ac80010db8	update the import docs (#6104 )	2024-08-26 19:57:26 -07:00
Jeffrey Morgan	47fa0839b9	server: clean up route names for consistency (#6524 )	2024-08-26 19:36:11 -07:00
Daniel Hiltgen	0f92b19bec	Only enable numa on CPUs (#6484 ) The numa flag may be having a performance impact on multi-socket systems with GPU loads	2024-08-24 17:24:50 -07:00
Daniel Hiltgen	69be940bf6	gpu: Group GPU Library sets by variant (#6483 ) The recent cuda variant changes uncovered a bug in ByLibrary which failed to group by common variant for GPU types.	2024-08-23 15:11:56 -07:00
Michael Yang	9638c24c58	Merge pull request #5446 from ollama/mxyng/faq update faq	2024-08-23 14:05:59 -07:00
Michael Yang	bb362caf88	update faq	2024-08-23 13:37:21 -07:00
Michael Yang	386af6c1a0	passthrough OLLAMA_HOST path to client	2024-08-23 13:23:28 -07:00
Patrick Devine	0c819e167b	convert safetensor adapters into GGUF (#6327 )	2024-08-23 11:29:56 -07:00
Daniel Hiltgen	7a1e1c1caf	gpu: Ensure driver version set before variant (#6480 ) During rebasing, the ordering was inverted causing the cuda version selection logic to break, with driver version being evaluated as zero incorrectly causing a downgrade to v11.	2024-08-23 11:21:12 -07:00
Daniel Hiltgen	0b03b9c32f	llm: Align cmake define for cuda no peer copy (#6455 ) Define changed recently and this slipped through the cracks with the old name.	2024-08-23 11:20:39 -07:00
Daniel Hiltgen	90ca84172c	Fix embeddings memory corruption (#6467 ) * Fix embeddings memory corruption The patch was leading to a buffer overrun corruption. Once removed though, parallism in server.cpp lead to hitting an assert due to slot/seq IDs being >= token count. To work around this, only use slot 0 for embeddings. * Fix embed integration test assumption The token eval count has changed with recent llama.cpp bumps (0.3.5+)	2024-08-22 14:51:42 -07:00
Michael Yang	6bd8a4b0a1	Merge pull request #6064 from ollama/mxyng/convert-llama3 convert: update llama conversion for llama3.1	2024-08-21 12:57:09 -07:00
Michael Yang	77903ab8b4	llama3.1	2024-08-21 11:49:31 -07:00
Michael Yang	e22286c9e1	Merge pull request #5365 from ollama/mxyng/convert-gemma2 convert gemma2	2024-08-21 11:48:43 -07:00
Michael Yang	107f695929	Merge pull request #4917 from ollama/mxyng/convert-bert convert bert model from safetensors	2024-08-21 11:48:29 -07:00
Michael Yang	4ecc70d3b4	Merge pull request #6386 from zwwhdls/fix-new-layer fix: chmod new layer to 0o644 when creating it	2024-08-21 10:58:45 -07:00
likelovewant	f9e1f572c2	Merge branch 'ollama:main' into main	2024-08-21 10:45:57 +08:00
Michael Yang	3546bbd08c	convert gemma2	2024-08-20 17:27:51 -07:00
Michael Yang	beb49eef65	create bert models from cli	2024-08-20 17:27:34 -07:00
Michael Yang	5a28b9cf5f	bert	2024-08-20 17:27:34 -07:00
Daniel Hiltgen	a017cf2fea	Split rocm back out of bundle (#6432 ) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle.	2024-08-20 07:26:38 -07:00
Daniel Hiltgen	19e5a890f7	CI: remove directories from dist dir before upload step (#6429 )	2024-08-19 15:19:21 -07:00
Daniel Hiltgen	f91c9e3709	CI: handle directories during checksum (#6427 )	2024-08-19 13:48:45 -07:00
Daniel Hiltgen	2df6905ede	Merge pull request #6424 from dhiltgen/cuda_v12 Fix overlapping artifact name on CI	2024-08-19 12:11:58 -07:00
Daniel Hiltgen	d8be22e47d	Fix overlapping artifact name on CI	2024-08-19 12:07:18 -07:00
Daniel Hiltgen	652c273f0e	Merge pull request #5049 from dhiltgen/cuda_v12 Cuda v12	2024-08-19 11:14:24 -07:00
Daniel Hiltgen	88e7705079	Merge pull request #6402 from rick-github/numParallel Override numParallel in pickBestPartialFitByLibrary() only if unset.	2024-08-19 11:07:22 -07:00
Daniel Hiltgen	f9e31da946	Review comments	2024-08-19 10:36:15 -07:00
Daniel Hiltgen	88bb9e3328	Adjust layout to bin+lib/ollama	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	3b19cdba2a	Remove Jetpack	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	927d98a6cd	Add windows cuda v12 + v11 support	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	f6c811b320	Enable cuda v12 flags	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	4fe3a556fa	Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants.	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	fc3b4cda89	Report GPU variant in log	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	d470ebe78b	Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	c7bcb00319	Wire up ccache and pigz in the docker based build This should help speed things up a little	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	74d45f0102	Refactor linux packaging This adjusts linux to follow a similar model to windows with a discrete archive (zip/tgz) to cary the primary executable, and dependent libraries. Runners are still carried as payloads inside the main binary Darwin retain the payload model where the go binary is fully self contained.	2024-08-19 09:38:53 -07:00
Jeffrey Morgan	9fddef3731	server: limit upload parts to 16 (#6411 )	2024-08-19 09:20:52 -07:00
Richard Lyons	885cf45087	Fix white space.	2024-08-18 03:07:16 +02:00
Richard Lyons	9352eeb752	Reset NumCtx.	2024-08-18 02:55:01 +02:00
Richard Lyons	0ad0e738cd	Override numParallel only if unset.	2024-08-18 01:43:26 +02:00
likelovewant	3442ca76a9	Merge branch 'ollama:main' into main	2024-08-16 15:28:34 +08:00
likelovewant	4574e557ee	update to hip sdk 6.1.2	2024-08-16 15:25:43 +08:00
zwwhdls	bdc4308afb	fix: chmod new layer to 0o644 when creating it Signed-off-by: zwwhdls <zww@hdls.me>	2024-08-16 11:43:19 +08:00
Daniel Hiltgen	d29cd4c2ed	Merge pull request #6381 from eust-w/main fix: Add tooltip to system tray icon	2024-08-15 15:31:15 -07:00
eust-w	a84c05cf91	fix: Add tooltip to system tray icon - Updated setIcon method to include tooltip text for the system tray icon. - Added NIF_TIP flag and set the tooltip text using UTF16 encoding. Resolves: #6372	2024-08-16 06:00:12 +08:00
Michael Yang	e3d7f32af7	Merge pull request #6363 from ollama/mxyng/fix-noprune fix: noprune on pull	2024-08-15 12:20:38 -07:00
Michael Yang	3a75e74e34	only skip invalid json manifests	2024-08-15 10:29:14 -07:00
Michael Yang	237dccba1e	skip invalid manifest files	2024-08-14 16:55:45 -07:00
Michael Yang	b3f75fc812	fix noprune	2024-08-14 15:48:51 -07:00
Michael Yang	2003d60159	llama3.1 memory	2024-08-08 11:18:13 -07:00