Merge branch 'ollama:main' into main

likelovewant authored on 2025-09-13 10:45:51 +08:00, committed by GitHub
28 changed files with 520 additions and 129 deletions

View File

@@ -65,14 +65,36 @@ jobs:
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
cuda-version: '12.8'
flags: ''
runner_dir: 'cuda_v12'
- os: windows
arch: amd64
preset: 'CUDA 13'
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
flags: ''
runner_dir: 'cuda_v13'
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: ''
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -96,7 +118,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
@@ -138,7 +160,7 @@ jobs:
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
-cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -232,7 +254,7 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;

View File

@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
-container: nvidia/cuda:12.8.1-devel-ubuntu22.04
container: nvidia/cuda:13.0.0-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,8 +78,17 @@ jobs:
include:
- preset: CPU
- preset: CUDA
-install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@@ -102,7 +111,8 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path

View File

@@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
endif()
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
-set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
@@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
-DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA

View File

@@ -18,6 +18,14 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
@@ -26,6 +34,14 @@
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 2"
}
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
@@ -72,11 +88,21 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 12"
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13"
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],

View File

@@ -39,15 +39,35 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
FROM base AS cuda-11
ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
-cmake --preset 'CUDA 12' \
cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
&& cmake --build --parallel --preset 'CUDA 12' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-13
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
&& cmake --build --parallel --preset 'CUDA 13' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
@@ -92,10 +112,14 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

View File

@@ -435,6 +435,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
### Cloud

View File

@@ -388,8 +388,12 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Truncate truncates the input to fit the model's max sequence length.
Truncate *bool `json:"truncate,omitempty"`
// Dimensions truncates the output embedding to the specified dimension.
Dimensions int `json:"dimensions,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
}
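As an aside from the diff itself, here is a minimal sketch of how the new `Dimensions` field might be exercised through the Go client. The model name and dimension value are illustrative only; it assumes a local server reachable via `api.ClientFromEnvironment` and the existing `Client.Embed` method (the server-side truncation appears in the routes.go hunk near the end of this diff).

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	// Ask the server to truncate the embedding to 256 dimensions; the
	// server re-normalizes the truncated vector before returning it.
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model:      "all-minilm", // illustrative model name
		Input:      "why is the sky blue?",
		Dimensions: 256,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("embedding length:", len(resp.Embeddings[0]))
}
```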

View File

@@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
if err != nil {
return
}
-for _, cap := range resp.Capabilities {
-if cap == model.CapabilityThinking {
-return
-}
if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
return
}
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
}

View File

@@ -43,14 +43,15 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
-return "sbsa"
}
-// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
-if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
-// The detected driver is older than Feb 2023
-slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
-return "v11"
if gpuInfo.DriverMajor < 13 {
// The detected driver is older than 580 (Aug 2025)
// Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance
if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) {
slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
}
return "v12"
}
-return "v12"
return "v13"
}

View File

@@ -1708,6 +1708,7 @@ Advanced parameters:
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `dimensions`: number of dimensions for the embedding
### Examples

View File

@@ -11,12 +11,13 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
> [!NOTE]
-> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
Download and extract the package:
```shell
curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
sudo rm -rf /usr/lib/ollama
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```

View File

@@ -92,6 +92,9 @@ If none of those resolve the problem, gather additional information and file an
- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
You may get more details for initialization failures by enabling debug prints in the uvm driver. You should only use this temporarily while troubleshooting
- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
## AMD GPU Discovery

View File

@@ -185,8 +185,6 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
-// Enable the new memory estimation logic
-NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
)
func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

View File

@@ -57,10 +57,28 @@ func (kv KV) EmbeddingLength() uint64 {
return uint64(kv.Uint("embedding_length"))
}
func (kv KV) HeadCount() []uint64 {
headCountDefault := uint32(1)
headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
if len(headCount) == 1 {
headCountDefault = headCount[0]
}
nLayers := int(kv.BlockCount())
if len(headCount) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCount) {
out[i] = uint64(headCountDefault)
} else {
out[i] = uint64(headCount[i])
}
}
return out
}
func (kv KV) HeadCountMax() uint64 {
// TODO(drifkin): using the max value can cause an overestimation. In the
// future if array values become more popular, we can adapt the more invasive
// <https://github.com/ollama/ollama/pull/10225>
return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}
@@ -68,6 +86,27 @@ func (kv KV) HeadCountMin() uint64 {
return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}
func (kv KV) HeadCountKV() []uint64 {
headCountKVDefault := uint32(1)
headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
if len(headCountKV) == 1 {
headCountKVDefault = headCountKV[0]
}
nLayers := int(kv.BlockCount())
if len(headCountKV) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCountKV) {
out[i] = uint64(headCountKVDefault)
} else {
out[i] = uint64(headCountKV[i])
}
}
return out
}
func (kv KV) HeadCountKVMax() uint64 {
return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}
@@ -100,6 +139,26 @@ func (kv KV) ChatTemplate() string {
return kv.String("tokenizer.chat_template")
}
// ssm architecture parameters
func (kv KV) SSMConvKernel() uint64 {
return uint64(kv.Uint("ssm.conv_kernel"))
}
func (kv KV) SSMInnerSize() uint64 {
return uint64(kv.Uint("ssm.inner_size"))
}
func (kv KV) SSMStateSize() uint64 {
return uint64(kv.Uint("ssm.state_size"))
}
func (kv KV) SSMGroupCount() uint64 {
return uint64(kv.Uint("ssm.group_count"))
}
// general types
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
@@ -131,22 +190,27 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
}
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
return slices.Min(arrVal), slices.Max(arrVal)
}
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
if u32, ok := keyValue(kv, key, uint32(0)); ok {
-return u32, u32
return []uint32{u32}
} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
-min := slices.Min(u32s.values)
-max := slices.Max(u32s.values)
-return min, max
return u32s.values
} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
-min := slices.Min(i32s.values)
-max := slices.Max(i32s.values)
-if min < 0 || max < 0 {
-slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
dst := make([]uint32, len(i32s.values))
for i, v := range i32s.values {
if v < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
}
dst[i] = uint32(v)
}
-return uint32(min), uint32(max)
return dst
}
-return defaultValue, defaultValue
return []uint32{defaultValue}
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
@@ -486,7 +550,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsArr := f.KV().HeadCount()
headsKV := f.KV().HeadCountKVMax()
headsKVArr := f.KV().HeadCountKV()
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
embeddingHeads := f.KV().EmbeddingHeadCountMax()
@@ -496,12 +562,51 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
layers := f.Tensors().GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
// Default for models unless special-cased below. These defaults mirror the
// cache usage in llama.cpp under the assumption that models without special
// cases below will use the llamarunner and caching will be handled by the
// llama.cpp layer.
//
// This also assumes that a layer without heads or headsKV set is recurrent
// which is usually the case. Some models (eg nemotronh) use "blocks" in
// place of layers where some are MLP blocks that don't have any cache.
// Models like this will need a special case below to be accurately
// estimated.
var kvTotal uint64
kv = make([]uint64, f.KV().BlockCount())
kvSizeAttn := uint64(0)
kvSizeRecurrent := uint64(0)
for i := range kv {
-kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
headsL := headsArr[i]
headsKVL := headsKVArr[i]
if headsL > 0 && headsKVL > 0 {
// full attention layer
// NOTE: Assumes uniform values for all attn layers
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
kvSizeAttn += kv[i]
} else {
// recurrent layer
ssmDConv := f.KV().SSMConvKernel()
ssmDState := f.KV().SSMStateSize()
ssmDInner := f.KV().SSMInnerSize()
ssmNGroups := f.KV().SSMGroupCount()
nEmbdR := uint64(0)
if ssmDConv > 0 {
nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
}
nEmbdS := ssmDState * ssmDInner
// recurrent always uses F32 in llama.cpp backend
// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
bytesPerElementRecurrent := kvCacheBytesPerElement("f32")
kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
kvSizeRecurrent += kv[i]
}
kvTotal += kv[i]
}
slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)
switch f.KV().Architecture() {
case "llama", "llama4":
@@ -759,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
-slog.Warn("model only supports non-quantized cache types ", "mode", arch)
slog.Warn("model only supports non-quantized cache types", "model", arch)
-return cacheType == "f16"
return false
}
-return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention // SupportsFlashAttention checks if the model supports flash attention
@@ -774,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
return false return false
} }
if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
return false
}
// Check head counts match and are non-zero
headCountK := f.KV().EmbeddingHeadCountK()
headCountV := f.KV().EmbeddingHeadCountV()
@@ -794,6 +907,8 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
case "f32":
return 4 // f32 (default for recurrent)
default:
return 2 // f16 (default)
}
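To make the per-layer cache estimate introduced in this file concrete, here is a small self-contained sketch of the same arithmetic. Every constant below is invented for illustration and not taken from any real model; it simply contrasts an attention layer (f16 cache) with a recurrent layer (f32 cache), following the formulas in the diff above.

```go
package main

import "fmt"

func main() {
	// Hypothetical numbers: 4K context, 8 KV heads, 128-dim K/V heads,
	// and a Mamba-style recurrent layer (conv kernel 4, state 128,
	// inner size 4096, 8 groups).
	const (
		context         = 4096
		headsKV         = 8
		embeddingHeadsK = 128
		embeddingHeadsV = 128
		bytesPerElement = 2.0 // f16 cache for attention layers

		ssmDConv   = 4
		ssmDState  = 128
		ssmDInner  = 4096
		ssmNGroups = 8
	)

	// Full attention layer: context * (headDimK + headDimV) * kv heads * bytes.
	attn := uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)

	// Recurrent layer: conv state plus SSM state, always cached in f32.
	nEmbdR := (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
	nEmbdS := ssmDState * ssmDInner
	recurrent := uint64((nEmbdR + nEmbdS) * 4)

	fmt.Printf("attention layer: %.1f MiB\n", float64(attn)/(1024*1024))
	fmt.Printf("recurrent layer: %.2f MiB\n", float64(recurrent)/(1024*1024))
}
```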

View File

@@ -410,3 +410,99 @@ func TestAPIEmbeddings(t *testing.T) {
t.Errorf("zero length embedding response")
}
}
func TestAPIToolCalling(t *testing.T) {
initialTimeout := 60 * time.Second
streamTimeout := 30 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
modelName := "qwen3:0.6b"
if err := PullIfMissing(ctx, client, modelName); err != nil {
t.Fatalf("pull failed %s", err)
}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather in a given location",
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"location"},
Properties: map[string]api.ToolProperty{
"location": {
Type: api.PropertyType{"string"},
Description: "The city and state, e.g. San Francisco, CA",
},
},
},
},
},
}
req := api.ChatRequest{
Model: modelName,
Messages: []api.Message{
{
Role: "user",
Content: "Call get_weather with location set to San Francisco.",
},
},
Tools: tools,
Options: map[string]any{
"temperature": 0,
},
}
stallTimer := time.NewTimer(initialTimeout)
var gotToolCall bool
var lastToolCall api.ToolCall
fn := func(response api.ChatResponse) error {
if len(response.Message.ToolCalls) > 0 {
gotToolCall = true
lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1]
}
if !stallTimer.Reset(streamTimeout) {
return fmt.Errorf("stall was detected while streaming response, aborting")
}
return nil
}
stream := true
req.Stream = &stream
done := make(chan int)
var genErr error
go func() {
genErr = client.Chat(ctx, &req, fn)
done <- 0
}()
select {
case <-stallTimer.C:
t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String())
case <-done:
if genErr != nil {
t.Fatalf("chat failed: %v", genErr)
}
if !gotToolCall {
t.Fatalf("expected at least one tool call, got none")
}
if lastToolCall.Function.Name != "get_weather" {
t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
}
if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
}
case <-ctx.Done():
t.Error("outer test context done while waiting for tool-calling chat")
}
}

View File

@@ -121,6 +121,7 @@ func TestMultiModelStress(t *testing.T) {
// The intent is to go 1 over what can fit so we force the scheduler to thrash
targetLoadCount := 0
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
chooseModels:
for i, model := range chosenModels {
req := &api.GenerateRequest{Model: model}
slog.Info("loading", "model", model)
@@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) {
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
break
}
// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
for _, m := range models.Models {
if m.SizeVRAM == 0 {
slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
break chooseModels
}
}
}
}
if targetLoadCount == len(chosenModels) {

View File

@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
-DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {
@@ -50,7 +50,7 @@ func TestContextExhaustion(t *testing.T) {
// Set up the test data
req := api.GenerateRequest{
Model: smol,
-Prompt: "Write me a story with a ton of emojis?",
Prompt: "Write me a story in english with a lot of emojis",
Stream: &stream,
Options: map[string]any{
"temperature": 0,

View File

@@ -38,8 +38,9 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
KeepAlive: &api.Duration{Duration: 10 * time.Second},
}
res, err := embeddingTestHelper(ctx, client, t, req)

View File

@@ -502,6 +502,22 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
@@ -517,21 +533,14 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt)
}
-// Verify the response contains the expected data
-response := buf.String()
-atLeastOne := false
-for _, resp := range anyResp {
-if strings.Contains(strings.ToLower(response), resp) {
-atLeastOne = true
-break
-}
-}
-if !atLeastOne {
-t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
-}
verify()
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
case <-ctx.Done():
-t.Error("outer test context done while waiting for generate")
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for generate")
verify()
}
return context
}
@@ -552,7 +561,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
KeepAlive: &api.Duration{Duration: 10 * time.Second},
}, {
Model: smol,
-Prompt: "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply",
Prompt: "how do rainbows form? Be brief but factual in your reply",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
}, {
@@ -570,9 +579,9 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
[][]string{
{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
-{"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states", "cultural", "hardship", "autumn", "festival"},
{"water", "droplet", "refracted", "reflect", "color", "spectrum"},
{"fourth", "july", "declaration", "independence"},
-{"nitrogen", "oxygen", "carbon", "dioxide"},
{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor"},
}
}
@@ -599,6 +608,22 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
@@ -614,23 +639,14 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
}
verify()
-// Verify the response contains the expected data
-response := buf.String()
-atLeastOne := false
-for _, resp := range anyResp {
-if strings.Contains(strings.ToLower(response), resp) {
-atLeastOne = true
-break
-}
-}
-if !atLeastOne {
-t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
-}
slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
case <-ctx.Done():
-t.Error("outer test context done while waiting for generate")
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for chat")
verify()
}
return &api.Message{Role: role, Content: buf.String()}
}

View File

@@ -515,33 +515,34 @@ func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
}
nChunks := C.mtmd_input_chunks_size(ic)
numEmbed := llamaContext.Model().NEmbd()
-lastChunkSize := 0
embed := make([][]float32, 0)
for i := range int(nChunks) {
chunk := C.mtmd_input_chunks_get(ic, C.size_t(i))
numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk))
-lastChunkSize = numTokens
slog.Debug("chunk tokens", "index", i, "numTokens", numTokens)
// Encode the chunk
if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
return nil, errors.New("unable to encode mtmd image chunk")
}
-}
-// Get the embeddings
-embed := make([][]float32, lastChunkSize)
-embd := C.mtmd_get_output_embd(c.c)
-if nil == embd {
-return nil, errors.New("failed to get image embedding")
// Get the embeddings for this chunk
chunkEmbed := make([][]float32, numTokens)
chunkEmbd := C.mtmd_get_output_embd(c.c)
if nil == chunkEmbd {
continue
}
// Extend the embedding array for each token
-s := unsafe.Slice((*float32)(embd), numEmbed*lastChunkSize)
s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed)
rows := make([]float32, len(s))
copy(rows, s)
-for i := range lastChunkSize {
-embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
for i := range numTokens {
chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed]
}
embed = append(embed, chunkEmbed...)
}
slog.Debug("image embeddings", "totalEmbeddings", len(embed))
return embed, nil
}

View File

@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var kvct string
if useFlashAttention {
requested := strings.ToLower(envconfig.KvCacheType())
-if requested != "" && f.SupportsKVCacheType(requested) {
if f.SupportsKVCacheType(requested) {
kvct = requested
}
}

View File

@@ -148,7 +148,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
var textProcessor model.TextProcessor
var err error
if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
-textProcessor, err = model.NewTextProcessor(modelPath)
if len(projectors) == 0 {
textProcessor, err = model.NewTextProcessor(modelPath)
} else {
err = errors.New("split vision models aren't supported")
}
if err != nil {
// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
@@ -161,11 +165,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}
-newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
-if newEstimates {
-slog.Info("enabling new memory estimates")
-}
// Verify the requested context size is <= the model training size
trainCtx := f.KV().ContextLength()
if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -173,6 +172,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
opts.NumCtx = int(trainCtx)
}
opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
@@ -218,7 +219,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
-if kvct != "" && f.SupportsKVCacheType(kvct) {
if f.SupportsKVCacheType(kvct) {
loadRequest.KvCacheType = kvct
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -431,7 +432,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}()
-if newEstimates {
if textProcessor != nil {
return &ollamaServer{llmServer: s}, nil
} else {
return &llamaServer{llmServer: s, ggml: f}, nil

View File

@@ -76,8 +76,9 @@ type JsonSchema struct {
}
type EmbedRequest struct {
Input any `json:"input"`
Model string `json:"model"`
Dimensions int `json:"dimensions,omitempty"`
}
type StreamOptions struct {
@@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
}
var b bytes.Buffer
-if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}

View File

@@ -246,7 +246,7 @@ func filesForModel(path string) ([]string, error) {
for _, match := range matches {
if ct, err := detectContentType(match); err != nil {
return nil, err
-} else if ct != contentType {
} else if len(contentType) > 0 && ct != contentType {
return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match)
}
}
@@ -255,7 +255,8 @@ func filesForModel(path string) ([]string, error) {
}
var files []string
-if st, _ := glob(filepath.Join(path, "*.safetensors"), "application/octet-stream"); len(st) > 0 {
// some safetensors files do not properly match "application/octet-stream", so skip checking their contentType
if st, _ := glob(filepath.Join(path, "*.safetensors"), ""); len(st) > 0 {
// safetensors files might be unresolved git lfs references; skip if they are
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
files = append(files, st...)

View File

@@ -34,8 +34,8 @@ type InputCache struct {
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
numCtx := kvSize / int32(numSlots)
-if numCtx < 1 {
-return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
if int(numCtx) < batchSize {
return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots)
}
slots := make([]InputCacheSlot, numSlots)
@@ -70,11 +70,9 @@ func kvCacheTypeFromStr(s string) ml.DType {
}
func (c *InputCache) Close() {
-if c == nil {
-return
if c != nil && c.cache != nil {
c.cache.Close()
}
-c.cache.Close()
}
// Locking: Operations on InputCacheSlot (including finding one

View File

@@ -18,7 +18,6 @@ import (
"reflect"
"regexp"
"runtime"
-"runtime/debug"
"strconv"
"strings"
"sync"
@@ -1086,9 +1085,13 @@ func (s *Server) allocModel(
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
-debug.PrintStack()
if err, ok := r.(error); ok {
-panicErr = err
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
panicErr = noMem
} else {
panic(r)
}
} else {
panic(r)
}

View File

@@ -78,7 +78,7 @@ function checkEnv() {
}
-function buildOllama() {
function buildCPU() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
@@ -90,20 +90,72 @@ function buildOllama() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component CPU --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
function buildCUDA11() {
# CUDA v11 claims to be compatible with MSVC 2022, but the latest updates are no longer compatible
# 19.40 is the last compiler version that works, but recent updates are 19.43
# So this pins to MSVC 2019 for best compatibility
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-if ("$script:CUDA_DIRS".Contains("v12")) {
-$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
-$env:CUDAToolkit_ROOT=$hashEnv[$v12]
-write-host "Building CUDA v12 backend libraries"
-& cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR
if ("$script:CUDA_DIRS".Contains("v11")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v11 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA12() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v12.8")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v12 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA13() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v13")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
$env:CUDAToolkit_ROOT=$cuda
write-host "Building CUDA v13 backend libraries $cuda"
& cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildROCm() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
if ($env:HIP_PATH) {
write-host "Building ROCm backend libraries"
if (-Not (get-command -ErrorAction silent ninja)) {
@@ -129,6 +181,10 @@ function buildOllama() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildOllama() {
mkdir -Force -path "${script:DIST_DIR}\"
write-host "Building ollama CLI"
& go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -236,6 +292,10 @@ function distZip() {
checkEnv
try {
if ($($args.count) -eq 0) {
buildCPU
buildCUDA12
buildCUDA13
buildROCm
buildOllama
buildApp
gatherDependencies

View File

@@ -576,7 +576,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
if err != nil {
return err
}
-embeddings[i] = normalize(embedding)
// TODO: this first normalization should be done by the model
embedding = normalize(embedding)
if req.Dimensions > 0 && req.Dimensions < len(embedding) {
embedding = normalize(embedding[:req.Dimensions])
}
embeddings[i] = embedding
return nil
})
}
@@ -602,11 +607,7 @@ func normalize(vec []float32) []float32 {
sum += v * v
}
-norm := float32(0.0)
-if sum > 0 {
-norm = float32(1.0 / math.Sqrt(float64(sum)))
-}
norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
for i := range vec {
vec[i] *= norm
}
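As a closing note on this last change: truncating an embedding to fewer dimensions breaks its unit norm, which is why the handler normalizes once before and once after the cut, and the epsilon in the denominator replaces the old zero-vector branch. A tiny standalone sketch of that behavior follows; it is not the server code itself, just an illustration of the same idea.

```go
package main

import (
	"fmt"
	"math"
)

// normalize scales a vector to unit length, guarding against a zero
// vector with a small epsilon in the denominator.
func normalize(vec []float32) []float32 {
	var sum float32
	for _, v := range vec {
		sum += v * v
	}
	norm := float32(1.0 / math.Max(math.Sqrt(float64(sum)), 1e-12))
	for i := range vec {
		vec[i] *= norm
	}
	return vec
}

func main() {
	full := normalize([]float32{0.3, 0.4, 1.2, 0.0})
	// Cutting the unit vector to 2 dimensions leaves a norm below 1,
	// so it has to be normalized a second time.
	reduced := normalize(append([]float32(nil), full[:2]...))
	fmt.Println(reduced)
}
```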