diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40871e64..fc3cde9c 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -65,14 +65,36 @@ jobs: arch: amd64 preset: 'CUDA 12' install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' cuda-version: '12.8' flags: '' + runner_dir: 'cuda_v12' + - os: windows + arch: amd64 + preset: 'CUDA 13' + install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' + cuda-version: '13.0' + flags: '' + runner_dir: 'cuda_v13' - os: windows arch: amd64 preset: 'ROCm 6' install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe rocm-version: '6.2' flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"' + runner_dir: '' runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release env: @@ -96,7 +118,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } @@ -138,7 +160,7 @@ jobs: run: | Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' - cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} + cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}" cmake --build --parallel --preset "${{ matrix.preset }}" cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8 env: @@ -232,7 +254,7 @@ jobs: case "$COMPONENT" in bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4d8cf773..e470540a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,7 +46,7 @@ jobs: include: - preset: CPU - preset: CUDA - 
container: nvidia/cuda:12.8.1-devel-ubuntu22.04 + container: nvidia/cuda:13.0.0-devel-ubuntu22.04 flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm container: rocm/dev-ubuntu-22.04:6.1.2 @@ -78,8 +78,17 @@ jobs: include: - preset: CPU - preset: CUDA - install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' + cuda-version: '13.0' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"' @@ -102,7 +111,8 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path diff --git a/CMakeLists.txt b/CMakeLists.txt index 10e4c42f..2679de90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") endif() set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama) -set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama) +set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR}) @@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) install(TARGETS ggml-cuda RUNTIME_DEPENDENCIES - DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR} + DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR} PRE_INCLUDE_REGEXES cublas cublasLt cudart PRE_EXCLUDE_REGEXES ".*" RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA diff --git a/CMakePresets.json b/CMakePresets.json index 85e9af4d..bbeab76f 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -18,6 +18,14 @@ "name": "CUDA", "inherits": [ "Default" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual", + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" + } + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], @@ -26,6 +34,14 @@ "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, + { + "name": "CUDA 13", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual", + "CMAKE_CUDA_FLAGS": "-t 2" + } + }, { "name": "JetPack 5", "inherits": [ 
"CUDA" ], @@ -72,11 +88,21 @@ "configurePreset": "CUDA", "targets": [ "ggml-cuda" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 11" + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], "configurePreset": "CUDA 12" }, + { + "name": "CUDA 13", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 13" + }, { "name": "JetPack 5", "inherits": [ "CUDA" ], diff --git a/Dockerfile b/Dockerfile index 0dc3c126..c84b5239 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,15 +39,35 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --build --parallel --preset 'CPU' \ && cmake --install build --component CPU --strip --parallel 8 +FROM base AS cuda-11 +ARG CUDA11VERSION=11.8 +RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-} +ENV PATH=/usr/local/cuda-11/bin:$PATH +RUN --mount=type=cache,target=/root/.ccache \ + cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \ + && cmake --build --parallel --preset 'CUDA 11' \ + && cmake --install build --component CUDA --strip --parallel 8 + FROM base AS cuda-12 ARG CUDA12VERSION=12.8 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-} ENV PATH=/usr/local/cuda-12/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ - cmake --preset 'CUDA 12' \ + cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\ && cmake --build --parallel --preset 'CUDA 12' \ && cmake --install build --component CUDA --strip --parallel 8 + +FROM base AS cuda-13 +ARG CUDA13VERSION=13.0 +RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} +ENV PATH=/usr/local/cuda-13/bin:$PATH +RUN --mount=type=cache,target=/root/.ccache \ + cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \ + && cmake --build --parallel --preset 'CUDA 13' \ + && cmake --install build --component CUDA --strip --parallel 8 + + FROM base AS rocm-6 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ @@ -92,10 +112,14 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go build -trimpath -buildmode=pie -o /bin/ollama . FROM --platform=linux/amd64 scratch AS amd64 -COPY --from=cuda-12 dist/lib/ollama /lib/ollama +# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/ +COPY --from=cuda-12 dist/lib/ollama /lib/ollama/ +COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/ FROM --platform=linux/arm64 scratch AS arm64 -COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa +# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/ +COPY --from=cuda-12 dist/lib/ollama /lib/ollama/ +COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/ COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5 COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6 diff --git a/README.md b/README.md index 8a815215..0c79970a 100644 --- a/README.md +++ b/README.md @@ -435,6 +435,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.) - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.) 
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models) +- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare) +- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads) ### Cloud diff --git a/api/types.go b/api/types.go index d3f6fc5a..a7ddbc37 100644 --- a/api/types.go +++ b/api/types.go @@ -388,8 +388,12 @@ type EmbedRequest struct { // this request. KeepAlive *Duration `json:"keep_alive,omitempty"` + // Truncate truncates the input to fit the model's max sequence length. Truncate *bool `json:"truncate,omitempty"` + // Dimensions truncates the output embedding to the specified dimension. + Dimensions int `json:"dimensions,omitempty"` + // Options lists model-specific options. Options map[string]any `json:"options"` } diff --git a/cmd/cmd.go b/cmd/cmd.go index 8fe06865..19f1e192 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) if err != nil { return } - for _, cap := range resp.Capabilities { - if cap == model.CapabilityThinking { - return - } + if slices.Contains(resp.Capabilities, model.CapabilityThinking) { + return } fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name) } diff --git a/discover/cuda_common.go b/discover/cuda_common.go index b539f6b3..ca008af6 100644 --- a/discover/cuda_common.go +++ b/discover/cuda_common.go @@ -43,14 +43,15 @@ func cudaVariant(gpuInfo CudaGPUInfo) string { } } } - return "sbsa" } - // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers - if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) { - // The detected driver is older than Feb 2023 - slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) - return "v11" + if gpuInfo.DriverMajor < 13 { + // The detected driver is older than 580 (Aug 2025) + // Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance + if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) { + slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) + } + return "v12" } - return "v12" + return "v13" } diff --git a/docs/api.md b/docs/api.md index f11d59ed..f47af63c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1708,6 +1708,7 @@ Advanced parameters: - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. 
Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) +- `dimensions`: number of dimensions for the embedding ### Examples diff --git a/docs/linux.md b/docs/linux.md index 9a156d1d..ce5ed860 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -11,12 +11,13 @@ curl -fsSL https://ollama.com/install.sh | sh ## Manual install > [!NOTE] -> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first. +> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first. Download and extract the package: ```shell curl -LO https://ollama.com/download/ollama-linux-amd64.tgz +sudo rm -rf /usr/lib/ollama sudo tar -C /usr -xzf ollama-linux-amd64.tgz ``` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6fdd3e85..7647b12f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -92,6 +92,9 @@ If none of those resolve the problem, gather additional information and file an - Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` +You may get more details for initialization failures by enabling debug prints in the uvm driver. You should only use this temporarily while troubleshooting +- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1` + ## AMD GPU Discovery diff --git a/envconfig/config.go b/envconfig/config.go index 868813ae..7fc01887 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -185,8 +185,6 @@ var ( ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) // Auth enables authentication between the Ollama client and server UseAuth = Bool("OLLAMA_AUTH") - // Enable the new memory estimation logic - NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES") ) func String(s string) func() string { @@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, - "OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"}, // Informational "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 3f4374cd..6b582b49 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -57,10 +57,28 @@ func (kv KV) EmbeddingLength() uint64 { return uint64(kv.Uint("embedding_length")) } +func (kv KV) HeadCount() []uint64 { + headCountDefault := uint32(1) + headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault) + if len(headCount) == 1 { + headCountDefault = headCount[0] + } + nLayers := int(kv.BlockCount()) + if len(headCount) > nLayers { + slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers) + } + out := make([]uint64, nLayers) + for i := range nLayers { + if i >= len(headCount) { + out[i] = uint64(headCountDefault) + } else { + out[i] = uint64(headCount[i]) + } + } + return out +} + 
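
A minimal standalone sketch of the scalar-or-array expansion pattern that the new `HeadCount` accessor above (and its `HeadCountKV` counterpart below) follows. The helper name and example values here are illustrative only, not part of the patch:

```go
package main

import "fmt"

// expandPerLayer returns one value per layer for a metadata field that may be
// stored either as a single scalar or as a per-layer array. When the array is
// shorter than the layer count, the remaining layers fall back to the scalar
// (or the lone array element), matching the padding behavior above.
func expandPerLayer(values []uint32, defaultVal uint32, nLayers int) []uint64 {
	if len(values) == 1 {
		defaultVal = values[0]
	}
	out := make([]uint64, nLayers)
	for i := range out {
		if i < len(values) {
			out[i] = uint64(values[i])
		} else {
			out[i] = uint64(defaultVal)
		}
	}
	return out
}

func main() {
	fmt.Println(expandPerLayer([]uint32{8}, 1, 4))          // [8 8 8 8]
	fmt.Println(expandPerLayer([]uint32{4, 0, 4, 0}, 1, 4)) // [4 0 4 0] (a zero head count marks a recurrent block)
}
```
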
func (kv KV) HeadCountMax() uint64 { - // TODO(drifkin): using the max value can cause an overestimation. In the - // future if array values become more popular, we can adapt the more invasive - // return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1)) } @@ -68,6 +86,27 @@ func (kv KV) HeadCountMin() uint64 { return uint64(kv.UintOrMinArrayValue("attention.head_count", 1)) } +func (kv KV) HeadCountKV() []uint64 { + headCountKVDefault := uint32(1) + headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault) + if len(headCountKV) == 1 { + headCountKVDefault = headCountKV[0] + } + nLayers := int(kv.BlockCount()) + if len(headCountKV) > nLayers { + slog.Warn("got more elements of attention.head_count than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers) + } + out := make([]uint64, nLayers) + for i := range nLayers { + if i >= len(headCountKV) { + out[i] = uint64(headCountKVDefault) + } else { + out[i] = uint64(headCountKV[i]) + } + } + return out +} + func (kv KV) HeadCountKVMax() uint64 { return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1)) } @@ -100,6 +139,26 @@ func (kv KV) ChatTemplate() string { return kv.String("tokenizer.chat_template") } +// ssm architecture parameters + +func (kv KV) SSMConvKernel() uint64 { + return uint64(kv.Uint("ssm.conv_kernel")) +} + +func (kv KV) SSMInnerSize() uint64 { + return uint64(kv.Uint("ssm.inner_size")) +} + +func (kv KV) SSMStateSize() uint64 { + return uint64(kv.Uint("ssm.state_size")) +} + +func (kv KV) SSMGroupCount() uint64 { + return uint64(kv.Uint("ssm.group_count")) +} + +// general types + func (kv KV) String(key string, defaultValue ...string) string { val, _ := keyValue(kv, key, append(defaultValue, "")...) return val @@ -131,22 +190,27 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 { } func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) { + arrVal := kv.UintOrArrayValueAsArray(key, defaultValue) + return slices.Min(arrVal), slices.Max(arrVal) +} + +func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 { if u32, ok := keyValue(kv, key, uint32(0)); ok { - return u32, u32 + return []uint32{u32} } else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok { - min := slices.Min(u32s.values) - max := slices.Max(u32s.values) - return min, max + return u32s.values } else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok { - min := slices.Min(i32s.values) - max := slices.Max(i32s.values) - if min < 0 || max < 0 { - slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max) + dst := make([]uint32, len(i32s.values)) + for i, v := range i32s.values { + if v < 0 { + slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v) + } + dst[i] = uint32(v) } - return uint32(min), uint32(max) + return dst } - return defaultValue, defaultValue + return []uint32{defaultValue} } func (kv KV) Strings(key string, defaultValue ...[]string) []string { @@ -486,7 +550,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri embedding := f.KV().EmbeddingLength() heads := f.KV().HeadCountMax() + headsArr := f.KV().HeadCount() headsKV := f.KV().HeadCountKVMax() + headsKVArr := f.KV().HeadCountKV() vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size) embeddingHeads := f.KV().EmbeddingHeadCountMax() @@ -496,12 +562,51 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri layers := 
f.Tensors().GroupLayers() bytesPerElement := kvCacheBytesPerElement(kvCacheType) + + // Default for models unless special-cased below. These defaults mirror the + // cache usage in llama.cpp under the assumption that models without special + // cases below will use the llamarunner and caching will be handled by the + // llama.cpp layer. + // + // This also assumes that a layer without heads or headsKV set is recurrent + // which is usually the case. Some models (eg nemotronh) use "blocks" in + // place of layers where some are MLP blocks that don't have any cache. + // Models like this will need a special case below to be accurately + // estimated. var kvTotal uint64 kv = make([]uint64, f.KV().BlockCount()) + kvSizeAttn := uint64(0) + kvSizeRecurrent := uint64(0) for i := range kv { - kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + headsL := headsArr[i] + headsKVL := headsKVArr[i] + if headsL > 0 && headsKVL > 0 { + // full attention layer + // NOTE: Assumes uniform values for all attn layers + kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement) + kvSizeAttn += kv[i] + } else { + // recurrent layer + ssmDConv := f.KV().SSMConvKernel() + ssmDState := f.KV().SSMStateSize() + ssmDInner := f.KV().SSMInnerSize() + ssmNGroups := f.KV().SSMGroupCount() + nEmbdR := uint64(0) + if ssmDConv > 0 { + nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState) + } + nEmbdS := ssmDState * ssmDInner + + // recurrent always uses F32 in llama.cpp backend + // https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644 + bytesPerElementRecurrent := kvCacheBytesPerElement("f32") + + kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent) + kvSizeRecurrent += kv[i] + } kvTotal += kv[i] } + slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent) switch f.KV().Architecture() { case "llama", "llama4": @@ -759,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { + if cacheType == "" || cacheType == "f16" { + return true + } + if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) { // gpt-oss uses attention with sinks which does not support quantized cache types - slog.Warn("model only supports non-quantized cache types ", "mode", arch) - return cacheType == "f16" + slog.Warn("model only supports non-quantized cache types", "model", arch) + return false } - return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) + return slices.Contains([]string{"q8_0", "q4_0"}, cacheType) } // SupportsFlashAttention checks if the model supports flash attention @@ -774,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool { return false } + if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) { + return false + } + // Check head counts match and are non-zero headCountK := f.KV().EmbeddingHeadCountK() headCountV := f.KV().EmbeddingHeadCountV() @@ -794,6 +907,8 @@ func kvCacheBytesPerElement(cacheType string) float64 { return 1 // 1/2 of fp16 case "q4_0": return 0.5 // 1/4 of fp16 + case "f32": + return 4 // f32 (default for recurrent) default: return 2 // f16 (default) } diff --git a/integration/api_test.go b/integration/api_test.go index 
0baba882..c39192c9 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -410,3 +410,99 @@ func TestAPIEmbeddings(t *testing.T) { t.Errorf("zero length embedding response") } } + +func TestAPIToolCalling(t *testing.T) { + initialTimeout := 60 * time.Second + streamTimeout := 30 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + modelName := "qwen3:0.6b" + if err := PullIfMissing(ctx, client, modelName); err != nil { + t.Fatalf("pull failed %s", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather in a given location", + Parameters: api.ToolFunctionParameters{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]api.ToolProperty{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The city and state, e.g. San Francisco, CA", + }, + }, + }, + }, + }, + } + + req := api.ChatRequest{ + Model: modelName, + Messages: []api.Message{ + { + Role: "user", + Content: "Call get_weather with location set to San Francisco.", + }, + }, + Tools: tools, + Options: map[string]any{ + "temperature": 0, + }, + } + + stallTimer := time.NewTimer(initialTimeout) + var gotToolCall bool + var lastToolCall api.ToolCall + + fn := func(response api.ChatResponse) error { + if len(response.Message.ToolCalls) > 0 { + gotToolCall = true + lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] + } + if !stallTimer.Reset(streamTimeout) { + return fmt.Errorf("stall was detected while streaming response, aborting") + } + return nil + } + + stream := true + req.Stream = &stream + done := make(chan int) + var genErr error + go func() { + genErr = client.Chat(ctx, &req, fn) + done <- 0 + }() + + select { + case <-stallTimer.C: + t.Errorf("tool-calling chat never started. 
Timed out after: %s", initialTimeout.String()) + case <-done: + if genErr != nil { + t.Fatalf("chat failed: %v", genErr) + } + + if !gotToolCall { + t.Fatalf("expected at least one tool call, got none") + } + + if lastToolCall.Function.Name != "get_weather" { + t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") + } + + if _, ok := lastToolCall.Function.Arguments["location"]; !ok { + t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for tool-calling chat") + } +} diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 331bb6e7..3104eacc 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -121,6 +121,7 @@ func TestMultiModelStress(t *testing.T) { // The intent is to go 1 over what can fit so we force the scheduler to thrash targetLoadCount := 0 slog.Info("Loading models to find how many can fit in VRAM before overflowing") +chooseModels: for i, model := range chosenModels { req := &api.GenerateRequest{Model: model} slog.Info("loading", "model", model) @@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) { slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount]) break } + // Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts + for _, m := range models.Models { + if m.SizeVRAM == 0 { + slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount]) + break chooseModels + } + } } } if targetLoadCount == len(chosenModels) { diff --git a/integration/context_test.go b/integration/context_test.go index 24c57dcf..15c15785 100644 --- a/integration/context_test.go +++ b/integration/context_test.go @@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) { if err := PullIfMissing(ctx, client, req.Model); err != nil { t.Fatalf("PullIfMissing failed: %v", err) } - DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second) + DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second) } func TestContextExhaustion(t *testing.T) { @@ -50,7 +50,7 @@ func TestContextExhaustion(t *testing.T) { // Set up the test data req := api.GenerateRequest{ Model: smol, - Prompt: "Write me a story with a ton of emojis?", + Prompt: "Write me a story in english with a lot of emojis", Stream: &stream, Options: map[string]any{ "temperature": 0, diff --git a/integration/embed_test.go b/integration/embed_test.go index 09369dbb..eb00f4ba 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -38,8 +38,9 @@ func TestAllMiniLMEmbeddings(t *testing.T) { defer cleanup() req := api.EmbeddingRequest{ - Model: "all-minilm", - Prompt: "why is the sky blue?", + Model: "all-minilm", + Prompt: "why is the sky blue?", + KeepAlive: &api.Duration{Duration: 10 * time.Second}, } res, err := embeddingTestHelper(ctx, client, t, req) diff --git a/integration/utils_test.go b/integration/utils_test.go index 2bb6a157..7901fed3 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -502,6 +502,22 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, 
genReq ap done <- 0 }() + var response string + verify := func() { + // Verify the response contains the expected data + response = buf.String() + atLeastOne := false + for _, resp := range anyResp { + if strings.Contains(strings.ToLower(response), resp) { + atLeastOne = true + break + } + } + if !atLeastOne { + t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response) + } + } + select { case <-stallTimer.C: if buf.Len() == 0 { @@ -517,21 +533,14 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap if genErr != nil { t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt) } - // Verify the response contains the expected data - response := buf.String() - atLeastOne := false - for _, resp := range anyResp { - if strings.Contains(strings.ToLower(response), resp) { - atLeastOne = true - break - } - } - if !atLeastOne { - t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response) - } + verify() slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response) case <-ctx.Done(): - t.Error("outer test context done while waiting for generate") + // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see + // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass + // if they are still generating valid responses + slog.Warn("outer test context done while waiting for generate") + verify() } return context } @@ -552,7 +561,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { KeepAlive: &api.Duration{Duration: 10 * time.Second}, }, { Model: smol, - Prompt: "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply", + Prompt: "how do rainbows form? 
Be brief but factual in your reply", Stream: &stream, KeepAlive: &api.Duration{Duration: 10 * time.Second}, }, { @@ -570,9 +579,9 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { [][]string{ {"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"}, {"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"}, - {"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states", "cultural", "hardship", "autumn", "festival"}, + {"water", "droplet", "refracted", "reflect", "color", "spectrum"}, {"fourth", "july", "declaration", "independence"}, - {"nitrogen", "oxygen", "carbon", "dioxide"}, + {"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor"}, } } @@ -599,6 +608,22 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR done <- 0 }() + var response string + verify := func() { + // Verify the response contains the expected data + response = buf.String() + atLeastOne := false + for _, resp := range anyResp { + if strings.Contains(strings.ToLower(response), resp) { + atLeastOne = true + break + } + } + if !atLeastOne { + t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages) + } + } + select { case <-stallTimer.C: if buf.Len() == 0 { @@ -614,23 +639,14 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR if genErr != nil { t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages) } - - // Verify the response contains the expected data - response := buf.String() - atLeastOne := false - for _, resp := range anyResp { - if strings.Contains(strings.ToLower(response), resp) { - atLeastOne = true - break - } - } - if !atLeastOne { - t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages) - } - + verify() slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response) case <-ctx.Done(): - t.Error("outer test context done while waiting for generate") + // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see + // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass + // if they are still generating valid responses + slog.Warn("outer test context done while waiting for chat") + verify() } return &api.Message{Role: role, Content: buf.String()} } diff --git a/llama/llama.go b/llama/llama.go index ac2c112c..88672a03 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -515,33 +515,34 @@ func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, } nChunks := C.mtmd_input_chunks_size(ic) numEmbed := llamaContext.Model().NEmbd() - lastChunkSize := 0 + embed := make([][]float32, 0) for i := range int(nChunks) { chunk := C.mtmd_input_chunks_get(ic, C.size_t(i)) numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk)) - lastChunkSize = numTokens + slog.Debug("chunk tokens", "index", i, "numTokens", numTokens) // Encode the chunk if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) { return nil, errors.New("unable to encode mtmd image 
chunk") } - } - // Get the embeddings - embed := make([][]float32, lastChunkSize) - embd := C.mtmd_get_output_embd(c.c) - if nil == embd { - return nil, errors.New("failed to get image embedding") - } + // Get the embeddings for this chunk + chunkEmbed := make([][]float32, numTokens) + chunkEmbd := C.mtmd_get_output_embd(c.c) + if nil == chunkEmbd { + continue + } - // Extend the embedding array for each token - s := unsafe.Slice((*float32)(embd), numEmbed*lastChunkSize) - rows := make([]float32, len(s)) - copy(rows, s) - for i := range lastChunkSize { - embed[i] = rows[i*numEmbed : (i+1)*numEmbed] + // Extend the embedding array for each token + s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed) + rows := make([]float32, len(s)) + copy(rows, s) + for i := range numTokens { + chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed] + } + embed = append(embed, chunkEmbed...) } - + slog.Debug("image embeddings", "totalEmbeddings", len(embed)) return embed, nil } diff --git a/llm/memory.go b/llm/memory.go index ce128eb5..7a87b28f 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin var kvct string if useFlashAttention { requested := strings.ToLower(envconfig.KvCacheType()) - if requested != "" && f.SupportsKVCacheType(requested) { + if f.SupportsKVCacheType(requested) { kvct = requested } } diff --git a/llm/server.go b/llm/server.go index 664a69fb..75f049bc 100644 --- a/llm/server.go +++ b/llm/server.go @@ -148,7 +148,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a var textProcessor model.TextProcessor var err error if envconfig.NewEngine() || f.KV().OllamaEngineRequired() { - textProcessor, err = model.NewTextProcessor(modelPath) + if len(projectors) == 0 { + textProcessor, err = model.NewTextProcessor(modelPath) + } else { + err = errors.New("split vision models aren't supported") + } if err != nil { // To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) @@ -161,11 +165,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } } - newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates() - if newEstimates { - slog.Info("enabling new memory estimates") - } - // Verify the requested context size is <= the model training size trainCtx := f.KV().ContextLength() if opts.NumCtx > int(trainCtx) && trainCtx > 0 { @@ -173,6 +172,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a opts.NumCtx = int(trainCtx) } + opts.NumBatch = min(opts.NumBatch, opts.NumCtx) + loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()} defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount() @@ -218,7 +219,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a // Flash Attention also supports kv cache quantization // Enable if the requested and kv cache type is supported by the model - if kvct != "" && f.SupportsKVCacheType(kvct) { + if f.SupportsKVCacheType(kvct) { loadRequest.KvCacheType = kvct } else { slog.Warn("kv cache type not supported by model", "type", kvct) @@ -431,7 +432,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } }() - if 
newEstimates { + if textProcessor != nil { return &ollamaServer{llmServer: s}, nil } else { return &llamaServer{llmServer: s, ggml: f}, nil diff --git a/openai/openai.go b/openai/openai.go index 9c7c41cb..b6a8a95e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -76,8 +76,9 @@ type JsonSchema struct { } type EmbedRequest struct { - Input any `json:"input"` - Model string `json:"model"` + Input any `json:"input"` + Model string `json:"model"` + Dimensions int `json:"dimensions,omitempty"` } type StreamOptions struct { @@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc { } var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil { + if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) return } diff --git a/parser/parser.go b/parser/parser.go index d40a79c2..e080f1bb 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -246,7 +246,7 @@ func filesForModel(path string) ([]string, error) { for _, match := range matches { if ct, err := detectContentType(match); err != nil { return nil, err - } else if ct != contentType { + } else if len(contentType) > 0 && ct != contentType { return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match) } } @@ -255,7 +255,8 @@ func filesForModel(path string) ([]string, error) { } var files []string - if st, _ := glob(filepath.Join(path, "*.safetensors"), "application/octet-stream"); len(st) > 0 { + // some safetensors files do not properly match "application/octet-stream", so skip checking their contentType + if st, _ := glob(filepath.Join(path, "*.safetensors"), ""); len(st) > 0 { // safetensors files might be unresolved git lfs references; skip if they are // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors files = append(files, st...) 
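
For readers tracing the cache-type plumbing changed above (`SupportsKVCacheType` in fs/ggml/ggml.go plus its call sites in llm/memory.go and llm/server.go), here is a rough distillation of how the checks now compose. `resolveKVCacheType` is an illustrative helper, not a function in the patch, and the gpt-oss note reflects the warning kept in ggml.go:

```go
package main

import (
	"fmt"
	"strings"
)

// resolveKVCacheType sketches the combined effect of the checks: an empty or
// "f16" request always resolves to f16, while quantized cache types are only
// honored when flash attention is enabled and the architecture allows
// quantized caches (gpt-oss, for example, does not).
func resolveKVCacheType(requested string, flashAttention, archSupportsQuantized bool) string {
	switch strings.ToLower(requested) {
	case "", "f16":
		return "f16"
	case "q8_0", "q4_0":
		if flashAttention && archSupportsQuantized {
			return strings.ToLower(requested)
		}
	}
	return "f16" // unsupported type, or quantization not possible: fall back
}

func main() {
	fmt.Println(resolveKVCacheType("q8_0", true, true))  // q8_0
	fmt.Println(resolveKVCacheType("q8_0", false, true)) // f16
	fmt.Println(resolveKVCacheType("Q4_0", true, false)) // f16
}
```
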
diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 955ef9b3..f558f7b8 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -34,8 +34,8 @@ type InputCache struct { func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) { numCtx := kvSize / int32(numSlots) - if numCtx < 1 { - return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots) + if int(numCtx) < batchSize { + return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots) } slots := make([]InputCacheSlot, numSlots) @@ -70,11 +70,9 @@ func kvCacheTypeFromStr(s string) ml.DType { } func (c *InputCache) Close() { - if c == nil { - return + if c != nil && c.cache != nil { + c.cache.Close() } - - c.cache.Close() } // Locking: Operations on InputCacheSlot (including finding one diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index df3ce1d9..1081a1f5 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -18,7 +18,6 @@ import ( "reflect" "regexp" "runtime" - "runtime/debug" "strconv" "strings" "sync" @@ -1086,9 +1085,13 @@ func (s *Server) allocModel( // Convert memory allocation panics to errors defer func() { if r := recover(); r != nil { - debug.PrintStack() if err, ok := r.(error); ok { - panicErr = err + var noMem ml.ErrNoMem + if errors.As(err, &noMem) { + panicErr = noMem + } else { + panic(r) + } } else { panic(r) } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 27f3eb9d..37fe8796 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -78,7 +78,7 @@ function checkEnv() { } -function buildOllama() { +function buildCPU() { mkdir -Force -path "${script:DIST_DIR}\" if ($script:ARCH -ne "arm64") { Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" @@ -90,20 +90,72 @@ function buildOllama() { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build --component CPU --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } +} +function buildCUDA11() { + # CUDA v11 claims to be compatible with MSVC 2022, but the latest updates are no longer compatible + # 19.40 is the last compiler version that works, but recent udpates are 19.43 + # So this pins to MSVC 2019 for best compatibility + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { $hashEnv = @{} Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } - if ("$script:CUDA_DIRS".Contains("v12")) { - $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }} - $env:CUDAToolkit_ROOT=$hashEnv[$v12] - write-host "Building CUDA v12 backend libraries" - & cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR + if ("$script:CUDA_DIRS".Contains("v11")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + write-host "Building CUDA v11 backend libraries $cuda" + $env:CUDAToolkit_ROOT=$cuda + & cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11" + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS + if 
($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build --component "CUDA" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } + } +} + +function buildCUDA12() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { + $hashEnv = @{} + Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } + if ("$script:CUDA_DIRS".Contains("v12.8")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + write-host "Building CUDA v12 backend libraries $cuda" + $env:CUDAToolkit_ROOT=$cuda + & cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build --component "CUDA" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } + } +} + +function buildCUDA13() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { + $hashEnv = @{} + Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } + if ("$script:CUDA_DIRS".Contains("v13")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + $env:CUDAToolkit_ROOT=$cuda + write-host "Building CUDA v13 backend libraries $cuda" + & cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13" + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build --component "CUDA" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } + } +} + +function buildROCm() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { if ($env:HIP_PATH) { write-host "Building ROCm backend libraries" if (-Not (get-command -ErrorAction silent ninja)) { @@ -129,6 +181,10 @@ function buildOllama() { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } +} + +function buildOllama() { + mkdir -Force -path "${script:DIST_DIR}\" write-host "Building ollama CLI" & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" . if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} @@ -236,6 +292,10 @@ function distZip() { checkEnv try { if ($($args.count) -eq 0) { + buildCPU + buildCUDA12 + buildCUDA13 + buildROCm buildOllama buildApp gatherDependencies diff --git a/server/routes.go b/server/routes.go index e6e4e2c4..5114cb74 100644 --- a/server/routes.go +++ b/server/routes.go @@ -576,7 +576,12 @@ func (s *Server) EmbedHandler(c *gin.Context) { if err != nil { return err } - embeddings[i] = normalize(embedding) + // TODO: this first normalization should be done by the model + embedding = normalize(embedding) + if req.Dimensions > 0 && req.Dimensions < len(embedding) { + embedding = normalize(embedding[:req.Dimensions]) + } + embeddings[i] = embedding return nil }) } @@ -602,11 +607,7 @@ func normalize(vec []float32) []float32 { sum += v * v } - norm := float32(0.0) - if sum > 0 { - norm = float32(1.0 / math.Sqrt(float64(sum))) - } - + norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12)) for i := range vec { vec[i] *= norm }
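
As a closing illustration of the embedding changes (the `Dimensions` field on `EmbedRequest`, the `dimensions` API parameter, and the `EmbedHandler` change directly above), here is a hedged sketch of the truncate-and-renormalize behavior. `truncateEmbedding` is an illustrative name; the `normalize` body mirrors the updated routes.go logic:

```go
package main

import (
	"fmt"
	"math"
)

// normalize scales the vector to unit length. The 1e-12 floor mirrors the
// updated normalize() above and avoids a division by zero for all-zero input.
func normalize(vec []float32) []float32 {
	var sum float64
	for _, v := range vec {
		sum += float64(v) * float64(v)
	}
	norm := float32(1.0 / math.Max(math.Sqrt(sum), 1e-12))
	for i := range vec {
		vec[i] *= norm
	}
	return vec
}

// truncateEmbedding normalizes the embedding, optionally keeps only the first
// req.Dimensions values, and normalizes again so the shorter vector is still
// unit length -- the same sequence EmbedHandler now applies per input.
func truncateEmbedding(embedding []float32, dimensions int) []float32 {
	embedding = normalize(embedding)
	if dimensions > 0 && dimensions < len(embedding) {
		embedding = normalize(embedding[:dimensions])
	}
	return embedding
}

func main() {
	e := truncateEmbedding([]float32{3, 4, 12}, 2)
	fmt.Println(e) // approximately [0.6 0.8]
}
```
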