Compare commits

204 commits (abbreviated SHAs only, 51e1480751 … 5994e8e8fd).
.github/workflows/release.yaml (82 changes, vendored)

@@ -65,14 +65,36 @@ jobs:
          arch: amd64
          preset: 'CUDA 12'
          install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+         cuda-components:
+           - '"cudart"'
+           - '"nvcc"'
+           - '"cublas"'
+           - '"cublas_dev"'
          cuda-version: '12.8'
          flags: ''
+         runner_dir: 'cuda_v12'
+       - os: windows
+         arch: amd64
+         preset: 'CUDA 13'
+         install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+         cuda-components:
+           - '"cudart"'
+           - '"nvcc"'
+           - '"cublas"'
+           - '"cublas_dev"'
+           - '"crt"'
+           - '"nvvm"'
+           - '"nvptxcompiler"'
+         cuda-version: '13.0'
+         flags: ''
+         runner_dir: 'cuda_v13'
        - os: windows
          arch: amd64
          preset: 'ROCm 6'
          install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
          rocm-version: '6.2'
          flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
+         runner_dir: 'rocm'
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:

@@ -96,7 +118,7 @@ jobs:
        $ErrorActionPreference = "Stop"
        if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
          Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-         $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+         $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
          Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
        }

@@ -138,9 +160,10 @@ jobs:
      run: |
        Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
        Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
-       cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+       cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
        cmake --build --parallel --preset "${{ matrix.preset }}"
        cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
+       Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
      env:
        CMAKE_GENERATOR: Ninja
    - uses: actions/upload-artifact@v4

@@ -153,19 +176,19 @@ jobs:
      matrix:
        os: [windows]
        arch: [amd64, arm64]
+       include:
+         - os: windows
+           arch: amd64
+           llvmarch: x86_64
+         - os: windows
+           arch: arm64
+           llvmarch: aarch64
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: [setup-environment]
    env:
      GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
    steps:
-   - name: Install AMD64 system dependencies
-     if: matrix.arch == 'amd64'
-     run: |
-       $ErrorActionPreference = "Stop"
-       Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
-       echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-       echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
    - name: Install ARM64 system dependencies
      if: matrix.arch == 'arm64'
      run: |

@@ -177,15 +200,29 @@ jobs:

        choco install -y --no-progress git gzip
        echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-       Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip"
-       Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\"
-       $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path
-       echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+   - name: Install clang and gcc-compat
+     run: |
+       $ErrorActionPreference = "Stop"
+       Set-ExecutionPolicy Bypass -Scope Process -Force
+       Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-${{ matrix.llvmarch }}.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt.zip"
+       Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt.zip -DestinationPath "C:\Program Files\"
+       $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt*").path
+       echo "$installPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
    - uses: actions/checkout@v4
    - uses: actions/setup-go@v5
      with:
        go-version-file: go.mod
+   - name: Verify gcc is actually clang
+     run: |
+       $ErrorActionPreference='Continue'
+       $version=& gcc -v 2>&1
+       $version=$version -join "`n"
+       echo "gcc is $version"
+       if ($version -notmatch 'clang') {
+         echo "ERROR: GCC must be clang for proper utf16 handling"
+         exit 1
+       }
+       $ErrorActionPreference='Stop'
    - run: |
        go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
    - uses: actions/upload-artifact@v4

@@ -200,13 +237,13 @@ jobs:
      include:
        - os: linux
          arch: amd64
-         target: archive
+         target: archive_novulkan
        - os: linux
          arch: amd64
          target: rocm
        - os: linux
          arch: arm64
-         target: archive
+         target: archive_novulkan
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: setup-environment

@@ -232,7 +269,7 @@ jobs:
        case "$COMPONENT" in
          bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
          lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-         lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+         lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
          lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
          lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
          lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;

@@ -262,12 +299,14 @@ jobs:
      include:
        - os: linux
          arch: arm64
+         target: novulkan
          build-args: |
            CGO_CFLAGS
            CGO_CXXFLAGS
            GOFLAGS
        - os: linux
          arch: amd64
+         target: novulkan
          build-args: |
            CGO_CFLAGS
            CGO_CXXFLAGS

@@ -280,6 +319,14 @@ jobs:
            CGO_CXXFLAGS
            GOFLAGS
            FLAVOR=rocm
+       - os: linux
+         arch: amd64
+         suffix: '-vulkan'
+         target: default
+         build-args: |
+           CGO_CFLAGS
+           CGO_CXXFLAGS
+           GOFLAGS
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: setup-environment

@@ -297,6 +344,7 @@ jobs:
      with:
        context: .
        platforms: ${{ matrix.os }}/${{ matrix.arch }}
+       target: ${{ matrix.target }}
        build-args: ${{ matrix.build-args }}
        outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
        cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
.github/workflows/test.yaml (51 changes, vendored)

@@ -46,12 +46,18 @@ jobs:
      include:
        - preset: CPU
        - preset: CUDA
-         container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+         container: nvidia/cuda:13.0.0-devel-ubuntu22.04
          flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
        - preset: ROCm
          container: rocm/dev-ubuntu-22.04:6.1.2
          extra-packages: rocm-libs
          flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
+       - preset: Vulkan
+         container: ubuntu:22.04
+         extra-packages: >
+           mesa-vulkan-drivers vulkan-tools
+           libvulkan1 libvulkan-dev
+           vulkan-sdk cmake ccache g++ make
    runs-on: linux
    container: ${{ matrix.container }}
    steps:

@@ -59,7 +65,19 @@ jobs:
    - run: |
        [ -n "${{ matrix.container }}" ] || sudo=sudo
        $sudo apt-get update
+       # Add LunarG Vulkan SDK apt repo for Ubuntu 22.04
+       if [ "${{ matrix.preset }}" = "Vulkan" ]; then
+         $sudo apt-get install -y --no-install-recommends wget gnupg ca-certificates software-properties-common
+         wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | $sudo gpg --dearmor -o /usr/share/keyrings/lunarg-archive-keyring.gpg
+         # Use signed-by to bind the repo to the installed keyring to avoid NO_PUBKEY
+         echo "deb [signed-by=/usr/share/keyrings/lunarg-archive-keyring.gpg] https://packages.lunarg.com/vulkan/1.4.313 jammy main" | $sudo tee /etc/apt/sources.list.d/lunarg-vulkan-1.4.313-jammy.list > /dev/null
+         $sudo apt-get update
+       fi
        $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
+       # Export VULKAN_SDK if provided by LunarG package (defensive)
+       if [ -d "/usr/lib/x86_64-linux-gnu/vulkan" ] && [ "${{ matrix.preset }}" = "Vulkan" ]; then
+         echo "VULKAN_SDK=/usr" >> $GITHUB_ENV
+       fi
      env:
        DEBIAN_FRONTEND: noninteractive
    - uses: actions/cache@v4

@@ -78,23 +96,35 @@ jobs:
      include:
        - preset: CPU
        - preset: CUDA
-         install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+         install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
          flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+         cuda-components:
+           - '"cudart"'
+           - '"nvcc"'
+           - '"cublas"'
+           - '"cublas_dev"'
+           - '"crt"'
+           - '"nvvm"'
+           - '"nvptxcompiler"'
+         cuda-version: '13.0'
        - preset: ROCm
          install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
          flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
+       - preset: Vulkan
+         install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
    runs-on: windows
    steps:
    - run: |
        choco install -y --no-progress ccache ninja
        ccache -o cache_dir=${{ github.workspace }}\.ccache
-   - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm'
+   - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm' || matrix.preset == 'Vulkan'
      id: cache-install
      uses: actions/cache/restore@v4
      with:
        path: |
          C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
          C:\Program Files\AMD\ROCm
+         C:\VulkanSDK
        key: ${{ matrix.install }}
    - if: matrix.preset == 'CUDA'
      name: Install CUDA ${{ matrix.cuda-version }}

@@ -102,7 +132,8 @@ jobs:
        $ErrorActionPreference = "Stop"
        if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
          Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-         Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+         $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+         Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
        }

        $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path

@@ -123,6 +154,18 @@ jobs:
        echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
        echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
        echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
+   - if: matrix.preset == 'Vulkan'
+     name: Install Vulkan ${{ matrix.rocm-version }}
+     run: |
+       $ErrorActionPreference = "Stop"
+       if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
+         Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
+         Start-Process -FilePath .\install.exe -ArgumentList "-c","--am","--al","in" -NoNewWindow -Wait
+       }
+
+       $vulkanPath = (Resolve-Path "C:\VulkanSDK\*").path
+       echo "$vulkanPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+       echo "VULKAN_SDK=$vulkanPath" >> $env:GITHUB_ENV
    - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
      uses: actions/cache/save@v4
      with:
.gitignore (1 change, vendored)

@@ -8,6 +8,7 @@
 dist
 build
 .cache
+.gocache
 *.exe
 .idea
 test_data
@@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
-set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
+set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})

@@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
-           DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
+           DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA

@@ -99,14 +99,17 @@ check_language(HIP)
 if(CMAKE_HIP_COMPILER)
    set(HIP_PLATFORM "amd")

-   find_package(hip REQUIRED)
    if(NOT AMDGPU_TARGETS)
-       list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(803|902|906(:xnack-)|90c(:xnack-)|1010(:xnack-)|1011(:xnack-)|1012(:xnack-)|103[0-6]|110[0-3]|115[01]|120[01])$")
-   elseif(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
+       find_package(hip REQUIRED)
+       list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(803|90[012]|906(:xnack-)|90c(:xnack-)|1010(:xnack-)|1011(:xnack-)|1012(:xnack-)|103[0-6]|110[0-3]|115[0123]|120[01])$")
+   endif()
+
+   if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
        list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX})
    endif()

    if(AMDGPU_TARGETS)
+       find_package(hip REQUIRED)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)

        if (WIN32)

@@ -115,7 +118,6 @@ if(CMAKE_HIP_COMPILER)

        target_compile_definitions(ggml-hip PRIVATE GGML_HIP_NO_VMM)

-       set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
            RUNTIME_DEPENDENCY_SET rocm
            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP

@@ -126,15 +128,27 @@ if(CMAKE_HIP_COMPILER)
            PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
            PRE_EXCLUDE_REGEXES ".*"
            POST_EXCLUDE_REGEXES "system32"
-           RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
-           LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+           RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+           LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
        )

        foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
            if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
-               install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
+               install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP)
                break()
            endif()
        endforeach()
    endif()
 endif()
+
+find_package(Vulkan)
+if(Vulkan_FOUND)
+   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
+   install(TARGETS ggml-vulkan
+       RUNTIME_DEPENDENCIES
+           PRE_INCLUDE_REGEXES vulkan
+           PRE_EXCLUDE_REGEXES ".*"
+       RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
+       LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
+   )
+endif()
@@ -18,14 +18,30 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
+   {
+     "name": "CUDA 11",
+     "inherits": [ "CUDA" ],
+     "cacheVariables": {
+       "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
+       "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
+     }
+   },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-       "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
+       "CMAKE_CUDA_ARCHITECTURES": "50;52;60;61;70;75;80;86;89;90;90a;120",
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
+   {
+     "name": "CUDA 13",
+     "inherits": [ "CUDA" ],
+     "cacheVariables": {
+       "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
+       "CMAKE_CUDA_FLAGS": "-t 2"
+     }
+   },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],

@@ -52,8 +68,12 @@
      "inherits": [ "ROCm" ],
      "cacheVariables": {
        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
-       "AMDGPU_TARGETS": "gfx803;gfx902;gfx1030;gfx1031;gfx1032;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1200;gfx1201;gfx900:xnack-;gfx906:xnack-;gfx90c:xnack-;gfx1010:xnack-;gfx1011:xnack-;gfx1012:xnack-;"
+       "AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
+   },
+   {
+     "name": "Vulkan",
+     "inherits": [ "Default" ]
    }
  ],
  "buildPresets": [

@@ -72,11 +92,21 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
+   {
+     "name": "CUDA 11",
+     "inherits": [ "CUDA" ],
+     "configurePreset": "CUDA 11"
+   },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 12"
    },
+   {
+     "name": "CUDA 13",
+     "inherits": [ "CUDA" ],
+     "configurePreset": "CUDA 13"
+   },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],

@@ -96,6 +126,11 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "configurePreset": "ROCm 6"
+   },
+   {
+     "name": "Vulkan",
+     "targets": [ "ggml-vulkan" ],
+     "configurePreset": "Vulkan"
    }
  ]
}
Dockerfile (116 changes)

@@ -1,11 +1,13 @@
 # vim: filetype=dockerfile

 ARG FLAVOR=${TARGETARCH}
+ARG PARALLEL=8

 ARG ROCMVERSION=6.3.3
 ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
+ARG VULKANVERSION=1.4.321.1

 # We require gcc v10 minimum. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64

@@ -16,6 +18,16 @@ RUN yum install -y yum-utils \
     && dnf install -y ccache \
     && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ARG VULKANVERSION
+RUN wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
+    && tar xvf /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
+    && dnf -y install ninja-build \
+    && ln -s /usr/bin/python3 /usr/bin/python \
+    && /${VULKANVERSION}/vulkansdk -j 8 vulkan-headers \
+    && /${VULKANVERSION}/vulkansdk -j 8 shaderc
+RUN cp -r /${VULKANVERSION}/x86_64/include/* /usr/local/include/ \
+    && cp -r /${VULKANVERSION}/x86_64/lib/* /usr/local/lib
+ENV PATH=/${VULKANVERSION}/x86_64/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache

@@ -34,26 +46,52 @@ ENV LDFLAGS=-s
 FROM base AS cpu
 RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'CPU' \
-        && cmake --build --parallel --preset 'CPU' \
-        && cmake --install build --component CPU --strip --parallel 8
+        && cmake --build --parallel ${PARALLEL} --preset 'CPU' \
+        && cmake --install build --component CPU --strip --parallel ${PARALLEL}

+FROM base AS cuda-11
+ARG CUDA11VERSION=11.8
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
+ARG PARALLEL
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 12' \
-        && cmake --build --parallel --preset 'CUDA 12' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

+FROM base AS cuda-13
+ARG CUDA13VERSION=13.0
+RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
+ENV PATH=/usr/local/cuda-13/bin:$PATH
+ARG PARALLEL
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+
+
 FROM base AS rocm-6
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'ROCm 6' \
-        && cmake --build --parallel --preset 'ROCm 6' \
-        && cmake --install build --component HIP --strip --parallel 8
+    cmake --preset 'ROCm 6' -DOLLAMA_RUNNER_DIR="rocm" \
+        && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
+        && cmake --install build --component HIP --strip --parallel ${PARALLEL}
+RUN rm -f dist/lib/ollama/rocm/rocblas/library/*gfx90[06]*

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
 ARG CMAKEVERSION

@@ -61,10 +99,11 @@ RUN apt-get update && apt-get install -y curl ccache \
     && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 5' \
-        && cmake --build --parallel --preset 'JetPack 5' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'JetPack 5' -DOLLAMA_RUNNER_DIR="cuda_jetpack5" \
+        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 5' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
 ARG CMAKEVERSION

@@ -72,10 +111,18 @@ RUN apt-get update && apt-get install -y curl ccache \
     && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 6' \
-        && cmake --build --parallel --preset 'JetPack 6' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'JetPack 6' -DOLLAMA_RUNNER_DIR="cuda_jetpack6" \
+        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 6' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+
+FROM base AS vulkan
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'Vulkan' -DOLLAMA_RUNNER_DIR="vulkan" \
+        && cmake --build --parallel --preset 'Vulkan' \
+        && cmake --install build --component Vulkan --strip --parallel 8

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama

@@ -92,25 +139,56 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
     go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama
+# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
+COPY --from=vulkan dist/lib/ollama /lib/ollama/

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
-COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
+# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-5 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-6 dist/lib/ollama/ /lib/ollama/

 FROM scratch AS rocm
 COPY --from=rocm-6 dist/lib/ollama /lib/ollama

 FROM ${FLAVOR} AS archive
+ARG VULKANVERSION
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama

-FROM ubuntu:24.04
+# Temporary opt-out stages for Vulkan
+FROM --platform=linux/amd64 scratch AS amd64_novulkan
+# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
+FROM arm64 AS arm64_novulkan
+FROM ${FLAVOR}_novulkan AS archive_novulkan
+COPY --from=cpu dist/lib/ollama /lib/ollama
+COPY --from=build /bin/ollama /bin/ollama
+FROM ubuntu:24.04 AS novulkan
 RUN apt-get update \
     && apt-get install -y ca-certificates \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
+COPY --from=archive_novulkan /bin /usr/bin
+ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+COPY --from=archive_novulkan /lib/ollama /usr/lib/ollama
+ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV OLLAMA_HOST=0.0.0.0:11434
+EXPOSE 11434
+ENTRYPOINT ["/bin/ollama"]
+CMD ["serve"]
+
+FROM ubuntu:24.04 AS default
+RUN apt-get update \
+    && apt-get install -y ca-certificates libvulkan1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 COPY --from=archive /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 COPY --from=archive /lib/ollama /usr/lib/ollama
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e54d41befcc1575f4c898c5ff4ef43970cead75f
+FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9

 .PHONY: help
 help:
@@ -435,6 +435,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
 - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
 - [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
+- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
+- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)

 ### Cloud

@@ -481,6 +483,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
 - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
 - [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.
+- [VT Code](https://github.com/vinhnx/vtcode) - VT Code is a Rust-based terminal coding agent with semantic code intelligence via Tree-sitter. Ollama integration for running local/cloud models with configurable endpoints.

 ### Apple Vision Pro

@@ -564,6 +567,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
 - [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
 - [Neuro SAN](https://github.com/cognizant-ai-lab/neuro-san-studio) (Data-driven multi-agent orchestration framework) with [example](https://github.com/cognizant-ai-lab/neuro-san-studio/blob/main/docs/user_guide.md#ollama)
+- [achatbot-go](https://github.com/ai-bot-pro/achatbot-go) a multimodal(text/audio/image) chatbot.

 ### Mobile
@@ -45,6 +45,12 @@ func checkError(resp *http.Response, body []byte) error {
        return nil
    }

+   if resp.StatusCode == http.StatusUnauthorized {
+       authError := AuthorizationError{StatusCode: resp.StatusCode}
+       json.Unmarshal(body, &authError)
+       return authError
+   }
+
    apiError := StatusError{StatusCode: resp.StatusCode}

    err := json.Unmarshal(body, &apiError)

@@ -215,6 +221,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
    for scanner.Scan() {
        var errorResponse struct {
            Error string `json:"error,omitempty"`
+           SigninURL string `json:"signin_url,omitempty"`
        }

        bts := scanner.Bytes()

@@ -222,7 +229,13 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
            return fmt.Errorf("unmarshal: %w", err)
        }

-       if response.StatusCode >= http.StatusBadRequest {
+       if response.StatusCode == http.StatusUnauthorized {
+           return AuthorizationError{
+               StatusCode: response.StatusCode,
+               Status:     response.Status,
+               SigninURL:  errorResponse.SigninURL,
+           }
+       } else if response.StatusCode >= http.StatusBadRequest {
            return StatusError{
                StatusCode: response.StatusCode,
                Status:     response.Status,

@@ -428,3 +441,21 @@ func (c *Client) Version(ctx context.Context) (string, error) {

    return version.Version, nil
}
+
+// Signout will signout a client for a local ollama server.
+func (c *Client) Signout(ctx context.Context) error {
+   return c.do(ctx, http.MethodPost, "/api/signout", nil, nil)
+}
+
+// Disconnect will disconnect an ollama instance from ollama.com.
+func (c *Client) Disconnect(ctx context.Context, encodedKey string) error {
+   return c.do(ctx, http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey), nil, nil)
+}
+
+func (c *Client) Whoami(ctx context.Context) (*UserResponse, error) {
+   var resp UserResponse
+   if err := c.do(ctx, http.MethodPost, "/api/me", nil, &resp); err != nil {
+       return nil, err
+   }
+   return &resp, nil
+}
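For reference, a minimal usage sketch of the client additions above (Signout, Disconnect, Whoami, and the AuthorizationError path for 401 responses). This is illustrative only and not part of the diff; it assumes the api package from this branch and that the server returns a signin_url on unauthorized responses.

package main

import (
    "context"
    "errors"
    "fmt"
    "log"

    "github.com/ollama/ollama/api"
)

func main() {
    ctx := context.Background()

    // ClientFromEnvironment builds a client from OLLAMA_HOST, as elsewhere in the project.
    client, err := api.ClientFromEnvironment()
    if err != nil {
        log.Fatal(err)
    }

    // Whoami (added in this diff) asks the local server who it is signed in as.
    user, err := client.Whoami(ctx)
    if err != nil {
        // An unauthorized response now surfaces as api.AuthorizationError,
        // carrying the signin_url reported by the server (assumption: the
        // server populates this field on 401s).
        var authErr api.AuthorizationError
        if errors.As(err, &authErr) {
            fmt.Println("not signed in; visit:", authErr.SigninURL)
            return
        }
        log.Fatal(err)
    }
    fmt.Printf("signed in as %+v\n", user)

    // Signout (also added in this diff) clears the local server's session.
    if err := client.Signout(ctx); err != nil {
        log.Fatal(err)
    }
}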
125
api/types.go
@@ -11,6 +11,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/types/model"
|
"github.com/ollama/ollama/types/model"
|
||||||
)
|
)
|
||||||
@@ -36,6 +38,19 @@ func (e StatusError) Error() string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type AuthorizationError struct {
|
||||||
|
StatusCode int
|
||||||
|
Status string
|
||||||
|
SigninURL string `json:"signin_url"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e AuthorizationError) Error() string {
|
||||||
|
if e.Status != "" {
|
||||||
|
return e.Status
|
||||||
|
}
|
||||||
|
return "something went wrong, please see the ollama server logs for details"
|
||||||
|
}
|
||||||
|
|
||||||
// ImageData represents the raw binary data of an image file.
|
// ImageData represents the raw binary data of an image file.
|
||||||
type ImageData []byte
|
type ImageData []byte
|
||||||
|
|
||||||
@@ -91,6 +106,14 @@ type GenerateRequest struct {
|
|||||||
// before this option was introduced)
|
// before this option was introduced)
|
||||||
Think *ThinkValue `json:"think,omitempty"`
|
Think *ThinkValue `json:"think,omitempty"`
|
||||||
|
|
||||||
|
// Truncate is a boolean that, when set to true, truncates the chat history messages
|
||||||
|
// if the rendered prompt exceeds the context length limit.
|
||||||
|
Truncate *bool `json:"truncate,omitempty"`
|
||||||
|
|
||||||
|
// Shift is a boolean that, when set to true, shifts the chat history
|
||||||
|
// when hitting the context length limit instead of erroring.
|
||||||
|
Shift *bool `json:"shift,omitempty"`
|
||||||
|
|
||||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||||
// template instead of calling the model.
|
// template instead of calling the model.
|
||||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||||
@@ -125,6 +148,14 @@ type ChatRequest struct {
|
|||||||
// for supported models.
|
// for supported models.
|
||||||
Think *ThinkValue `json:"think,omitempty"`
|
Think *ThinkValue `json:"think,omitempty"`
|
||||||
|
|
||||||
|
// Truncate is a boolean that, when set to true, truncates the chat history messages
|
||||||
|
// if the rendered prompt exceeds the context length limit.
|
||||||
|
Truncate *bool `json:"truncate,omitempty"`
|
||||||
|
|
||||||
|
// Shift is a boolean that, when set to true, shifts the chat history
|
||||||
|
// when hitting the context length limit instead of erroring.
|
||||||
|
Shift *bool `json:"shift,omitempty"`
|
||||||
|
|
||||||
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
|
||||||
// template instead of calling the model.
|
// template instead of calling the model.
|
||||||
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
|
||||||
@@ -173,7 +204,7 @@ type ToolCall struct {
 }
 
 type ToolCallFunction struct {
-	Index     int                       `json:"index,omitempty"`
+	Index     int                       `json:"index"`
 	Name      string                    `json:"name"`
 	Arguments ToolCallFunctionArguments `json:"arguments"`
 }
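Why the omitempty removal on Index matters: with omitempty a zero index is dropped from the JSON, so the first tool call in a sequence is indistinguishable from a call with no index at all. A minimal, standalone illustration (not Ollama code):

package main

import (
	"encoding/json"
	"fmt"
)

type withOmitempty struct {
	Index int `json:"index,omitempty"`
}

type withoutOmitempty struct {
	Index int `json:"index"`
}

func main() {
	a, _ := json.Marshal(withOmitempty{Index: 0})
	b, _ := json.Marshal(withoutOmitempty{Index: 0})
	fmt.Println(string(a)) // {}           (index 0 disappears)
	fmt.Println(string(b)) // {"index":0}  (index 0 survives, as the diff intends)
}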
@@ -235,9 +266,9 @@ func (pt PropertyType) String() string {
 
 type ToolProperty struct {
 	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
-	Type        PropertyType   `json:"type"`
+	Type        PropertyType   `json:"type,omitempty"`
 	Items       any            `json:"items,omitempty"`
-	Description string         `json:"description"`
+	Description string         `json:"description,omitempty"`
 	Enum        []any          `json:"enum,omitempty"`
 }
 
@@ -301,7 +332,7 @@ func (t *ToolFunctionParameters) String() string {
 
 type ToolFunction struct {
 	Name        string                 `json:"name"`
-	Description string                 `json:"description"`
+	Description string                 `json:"description,omitempty"`
 	Parameters  ToolFunctionParameters `json:"parameters"`
 }
 
@@ -313,12 +344,28 @@ func (t *ToolFunction) String() string {
 // ChatResponse is the response returned by [Client.Chat]. Its fields are
 // similar to [GenerateResponse].
 type ChatResponse struct {
+	// Model is the model name that generated the response.
 	Model string `json:"model"`
+
+	// RemoteModel is the name of the upstream model that generated the response.
+	RemoteModel string `json:"remote_model,omitempty"`
+
+	// RemoteHost is the URL of the upstream Ollama host that generated the response.
+	RemoteHost string `json:"remote_host,omitempty"`
+
+	// CreatedAt is the timestamp of the response.
 	CreatedAt time.Time `json:"created_at"`
+
+	// Message contains the message or part of a message from the model.
 	Message Message `json:"message"`
+
+	// Done specifies if the response is complete.
+	Done bool `json:"done"`
+
+	// DoneReason is the reason the model stopped generating text.
 	DoneReason string `json:"done_reason,omitempty"`
 
-	Done bool `json:"done"`
+	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
 
 	Metrics
 }
@@ -329,13 +376,6 @@ type DebugInfo struct {
 	ImageCount int `json:"image_count,omitempty"`
 }
 
-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	DebugInfo DebugInfo `json:"_debug_info"`
-}
-
 type Metrics struct {
 	TotalDuration time.Duration `json:"total_duration,omitempty"`
 	LoadDuration  time.Duration `json:"load_duration,omitempty"`
@@ -388,8 +428,12 @@ type EmbedRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 
+	// Truncate truncates the input to fit the model's max sequence length.
 	Truncate *bool `json:"truncate,omitempty"`
 
+	// Dimensions truncates the output embedding to the specified dimension.
+	Dimensions int `json:"dimensions,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
 }
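A small sketch (not from the diff) of what the new Dimensions field looks like on the wire; the model name is a placeholder, and the Model and Input fields are assumed to exist on EmbedRequest as in the current api package.

package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	truncate := true
	req := api.EmbedRequest{
		Model:      "example-embedding-model",
		Input:      "hello world",
		Truncate:   &truncate,
		Dimensions: 256, // ask the server to cut the embedding down to 256 dimensions
	}
	b, _ := json.Marshal(req)
	fmt.Println(string(b))
}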
@@ -427,19 +471,48 @@ type EmbeddingResponse struct {
 
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
+	// Model is the model name to create.
 	Model string `json:"model"`
+
+	// Stream specifies whether the response is streaming; it is true by default.
 	Stream *bool `json:"stream,omitempty"`
+
+	// Quantize is the quantization format for the model; leave blank to not change the quantization level.
 	Quantize string `json:"quantize,omitempty"`
 
+	// From is the name of the model or file to use as the source.
 	From string `json:"from,omitempty"`
+
+	// RemoteHost is the URL of the upstream ollama API for the model (if any).
+	RemoteHost string `json:"remote_host,omitempty"`
+
+	// Files is a map of files include when creating the model.
 	Files map[string]string `json:"files,omitempty"`
+
+	// Adapters is a map of LoRA adapters to include when creating the model.
 	Adapters map[string]string `json:"adapters,omitempty"`
+
+	// Template is the template used when constructing a request to the model.
 	Template string `json:"template,omitempty"`
+
+	// License is a string or list of strings for licenses.
 	License any `json:"license,omitempty"`
+
+	// System is the system prompt for the model.
 	System string `json:"system,omitempty"`
+
+	// Parameters is a map of hyper-parameters which are applied to the model.
 	Parameters map[string]any `json:"parameters,omitempty"`
+
+	// Messages is a list of messages added to the model before chat and generation requests.
 	Messages []Message `json:"messages,omitempty"`
 
+	Renderer string `json:"renderer,omitempty"`
+	Parser   string `json:"parser,omitempty"`
+
+	// Info is a map of additional information for the model
+	Info map[string]any `json:"info,omitempty"`
+
 	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 	// Deprecated: use Quantize instead
@@ -476,8 +549,12 @@ type ShowResponse struct {
 	Parameters    string         `json:"parameters,omitempty"`
 	Template      string         `json:"template,omitempty"`
 	System        string         `json:"system,omitempty"`
+	Renderer      string         `json:"renderer,omitempty"`
+	Parser        string         `json:"parser,omitempty"`
 	Details       ModelDetails   `json:"details,omitempty"`
 	Messages      []Message      `json:"messages,omitempty"`
+	RemoteModel   string         `json:"remote_model,omitempty"`
+	RemoteHost    string         `json:"remote_host,omitempty"`
 	ModelInfo     map[string]any `json:"model_info,omitempty"`
 	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
 	Tensors       []Tensor       `json:"tensors,omitempty"`
@@ -538,6 +615,8 @@ type ProcessResponse struct {
 type ListModelResponse struct {
 	Name        string    `json:"name"`
 	Model       string    `json:"model"`
+	RemoteModel string    `json:"remote_model,omitempty"`
+	RemoteHost  string    `json:"remote_host,omitempty"`
 	ModifiedAt  time.Time `json:"modified_at"`
 	Size        int64     `json:"size"`
 	Digest      string    `json:"digest"`
@@ -565,6 +644,12 @@ type GenerateResponse struct {
 	// Model is the model name that generated the response.
 	Model string `json:"model"`
 
+	// RemoteModel is the name of the upstream model that generated the response.
+	RemoteModel string `json:"remote_model,omitempty"`
+
+	// RemoteHost is the URL of the upstream Ollama host that generated the response.
+	RemoteHost string `json:"remote_host,omitempty"`
+
 	// CreatedAt is the timestamp of the response.
 	CreatedAt time.Time `json:"created_at"`
 
@@ -588,6 +673,8 @@ type GenerateResponse struct {
 	Metrics
 
 	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+
+	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
 }
 
 // ModelDetails provides details about a model.
@@ -600,6 +687,18 @@ type ModelDetails struct {
 	QuantizationLevel string `json:"quantization_level"`
 }
 
+// UserResponse provides information about a user.
+type UserResponse struct {
+	ID        uuid.UUID `json:"id"`
+	Email     string    `json:"email"`
+	Name      string    `json:"name"`
+	Bio       string    `json:"bio,omitempty"`
+	AvatarURL string    `json:"avatarurl,omitempty"`
+	FirstName string    `json:"firstname,omitempty"`
+	LastName  string    `json:"lastname,omitempty"`
+	Plan      string    `json:"plan,omitempty"`
+}
+
 // Tensor describes the metadata for a given tensor.
 type Tensor struct {
 	Name string `json:"name"`
@@ -853,7 +952,7 @@ func (t *ThinkValue) UnmarshalJSON(data []byte) error {
 		return nil
 	}
 
-	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\")")
+	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", true, or false)")
 }
 
 // MarshalJSON implements json.Marshaler
@@ -298,6 +298,30 @@ func TestToolFunction_UnmarshalJSON(t *testing.T) {
 	}
 }
 
+func TestToolCallFunction_IndexAlwaysMarshals(t *testing.T) {
+	fn := ToolCallFunction{
+		Name:      "echo",
+		Arguments: ToolCallFunctionArguments{"message": "hi"},
+	}
+
+	data, err := json.Marshal(fn)
+	require.NoError(t, err)
+
+	raw := map[string]any{}
+	require.NoError(t, json.Unmarshal(data, &raw))
+	require.Contains(t, raw, "index")
+	assert.Equal(t, float64(0), raw["index"])
+
+	fn.Index = 3
+	data, err = json.Marshal(fn)
+	require.NoError(t, err)
+
+	raw = map[string]any{}
+	require.NoError(t, json.Unmarshal(data, &raw))
+	require.Contains(t, raw, "index")
+	assert.Equal(t, float64(3), raw["index"])
+}
+
 func TestPropertyType_UnmarshalJSON(t *testing.T) {
 	tests := []struct {
 		name string
15  auth/auth.go

@@ -18,21 +18,13 @@ import (
 
 const defaultPrivateKey = "id_ed25519"
 
-func keyPath() (string, error) {
+func GetPublicKey() (string, error) {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}
 
-	return filepath.Join(home, ".ollama", defaultPrivateKey), nil
-}
-
-func GetPublicKey() (string, error) {
-	keyPath, err := keyPath()
-	if err != nil {
-		return "", err
-	}
-
+	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
@@ -59,11 +51,12 @@ func NewNonce(r io.Reader, length int) (string, error) {
 }
 
 func Sign(ctx context.Context, bts []byte) (string, error) {
-	keyPath, err := keyPath()
+	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}
 
+	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
212  cmd/cmd.go

@@ -47,6 +47,8 @@ import (
 	"github.com/ollama/ollama/version"
 )
 
+const ConnectInstructions = "To sign in, navigate to:\n %s\n\n"
+
 // ensureThinkingSupport emits a warning if the model does not advertise thinking support
 func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
 	if name == "" {
@@ -56,11 +58,9 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
 	if err != nil {
 		return
 	}
-	for _, cap := range resp.Capabilities {
-		if cap == model.CapabilityThinking {
-			return
-		}
-	}
+	if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
+		return
+	}
 	fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
 }
 
@@ -288,7 +288,17 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 		Think: opts.Think,
 	}
 
-	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
+	return client.Generate(cmd.Context(), req, func(r api.GenerateResponse) error {
+		if r.RemoteModel != "" && opts.ShowConnect {
+			p.StopAndClear()
+			if strings.HasPrefix(r.RemoteHost, "https://ollama.com") {
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on 'ollama.com' ⚡\n", r.RemoteModel)
+			} else {
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on '%s'\n", r.RemoteModel, r.RemoteHost)
+			}
+		}
+		return nil
+	})
 }
 
 func StopHandler(cmd *cobra.Command, args []string) error {
@@ -312,6 +322,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]any{},
+		ShowConnect: true,
 	}
 
 	format, err := cmd.Flags().GetString("format")
@@ -369,6 +380,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		}
 
 		prompts = append([]string{string(in)}, prompts...)
+		opts.ShowConnect = false
 		opts.WordWrap = false
 		interactive = false
 	}
@@ -435,6 +447,15 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 
 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
+			var sErr api.AuthorizationError
+			if errors.As(err, &sErr) && sErr.StatusCode == http.StatusUnauthorized {
+				fmt.Printf("You need to be signed in to Ollama to run Cloud models.\n\n")
+
+				if sErr.SigninURL != "" {
+					fmt.Printf(ConnectInstructions, sErr.SigninURL)
+				}
+				return nil
+			}
 			return err
 		}
 
@@ -455,6 +476,59 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generate(cmd, opts)
 }
 
+func SigninHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	user, err := client.Whoami(cmd.Context())
+	if err != nil {
+		var aErr api.AuthorizationError
+		if errors.As(err, &aErr) && aErr.StatusCode == http.StatusUnauthorized {
+			fmt.Println("You need to be signed in to Ollama to run Cloud models.")
+			fmt.Println()
+
+			if aErr.SigninURL != "" {
+				fmt.Printf(ConnectInstructions, aErr.SigninURL)
+			}
+			return nil
+		}
+		return err
+	}
+
+	if user != nil && user.Name != "" {
+		fmt.Printf("You are already signed in as user '%s'\n", user.Name)
+		fmt.Println()
+		return nil
+	}
+
+	return nil
+}
+
+func SignoutHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	err = client.Signout(cmd.Context())
+	if err != nil {
+		var aErr api.AuthorizationError
+		if errors.As(err, &aErr) && aErr.StatusCode == http.StatusUnauthorized {
+			fmt.Println("You are not signed in to ollama.com")
+			fmt.Println()
+			return nil
+		} else {
+			return err
+		}
+	}
+
+	fmt.Println("You have signed out of ollama.com")
+	fmt.Println()
+	return nil
+}
+
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -466,6 +540,25 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
+	n := model.ParseName(args[0])
+	if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") {
+		_, err := client.Whoami(cmd.Context())
+		if err != nil {
+			var aErr api.AuthorizationError
+			if errors.As(err, &aErr) && aErr.StatusCode == http.StatusUnauthorized {
+				fmt.Println("You need to be signed in to push models to ollama.com.")
+				fmt.Println()
+
+				if aErr.SigninURL != "" {
+					fmt.Printf(ConnectInstructions, aErr.SigninURL)
+				}
+				return nil
+			}
+
+			return err
+		}
+	}
+
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()
 
@@ -502,12 +595,12 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 
 	request := api.PushRequest{Name: args[0], Insecure: insecure}
 
-	n := model.ParseName(args[0])
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		if spinner != nil {
 			spinner.Stop()
 		}
-		if strings.Contains(err.Error(), "access denied") {
+		errStr := strings.ToLower(err.Error())
+		if strings.Contains(errStr, "access denied") || strings.Contains(errStr, "unauthorized") {
 			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
 		}
 		return err
@@ -541,7 +634,14 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 
 	for _, m := range models.Models {
 		if len(args) == 0 || strings.HasPrefix(strings.ToLower(m.Name), strings.ToLower(args[0])) {
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
+			var size string
+			if m.RemoteModel != "" {
+				size = "-"
+			} else {
+				size = format.HumanBytes(m.Size)
+			}
+
+			data = append(data, []string{m.Name, m.Digest[:12], size, format.HumanTime(m.ModifiedAt, "Never")})
 		}
 	}
 
@@ -626,8 +726,8 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 		KeepAlive: &api.Duration{Duration: 0},
 	}
 	if err := loadOrUnloadModel(cmd, opts); err != nil {
-		if !strings.Contains(err.Error(), "not found") {
-			return fmt.Errorf("unable to stop existing running model \"%s\": %s", args[0], err)
+		if !strings.Contains(strings.ToLower(err.Error()), "not found") {
+			fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0])
 		}
 	}
 
@@ -738,12 +838,36 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 	}
 
 	tableRender("Model", func() (rows [][]string) {
+		if resp.RemoteHost != "" {
+			rows = append(rows, []string{"", "Remote model", resp.RemoteModel})
+			rows = append(rows, []string{"", "Remote URL", resp.RemoteHost})
+		}
+
 		if resp.ModelInfo != nil {
 			arch := resp.ModelInfo["general.architecture"].(string)
 			rows = append(rows, []string{"", "architecture", arch})
-			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
-			rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
-			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+
+			var paramStr string
+			if resp.Details.ParameterSize != "" {
+				paramStr = resp.Details.ParameterSize
+			} else if v, ok := resp.ModelInfo["general.parameter_count"]; ok {
+				if f, ok := v.(float64); ok {
+					paramStr = format.HumanNumber(uint64(f))
+				}
+			}
+			rows = append(rows, []string{"", "parameters", paramStr})
+
+			if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
+				if f, ok := v.(float64); ok {
+					rows = append(rows, []string{"", "context length", strconv.FormatFloat(f, 'f', -1, 64)})
+				}
+			}
+
+			if v, ok := resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)]; ok {
+				if f, ok := v.(float64); ok {
+					rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(f, 'f', -1, 64)})
+				}
+			}
 		} else {
 			rows = append(rows, []string{"", "architecture", resp.Details.Family})
 			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
@@ -991,6 +1115,52 @@ type runOptions struct {
 	KeepAlive    *api.Duration
 	Think        *api.ThinkValue
 	HideThinking bool
+	ShowConnect  bool
+}
+
+func (r runOptions) Copy() runOptions {
+	var messages []api.Message
+	if r.Messages != nil {
+		messages = make([]api.Message, len(r.Messages))
+		copy(messages, r.Messages)
+	}
+
+	var images []api.ImageData
+	if r.Images != nil {
+		images = make([]api.ImageData, len(r.Images))
+		copy(images, r.Images)
+	}
+
+	var opts map[string]any
+	if r.Options != nil {
+		opts = make(map[string]any, len(r.Options))
+		for k, v := range r.Options {
+			opts[k] = v
+		}
+	}
+
+	var think *api.ThinkValue
+	if r.Think != nil {
+		cThink := *r.Think
+		think = &cThink
+	}
+
+	return runOptions{
+		Model:        r.Model,
+		ParentModel:  r.ParentModel,
+		Prompt:       r.Prompt,
+		Messages:     messages,
+		WordWrap:     r.WordWrap,
+		Format:       r.Format,
+		System:       r.System,
+		Images:       images,
+		Options:      opts,
+		MultiModal:   r.MultiModal,
+		KeepAlive:    r.KeepAlive,
+		Think:        think,
+		HideThinking: r.HideThinking,
+		ShowConnect:  r.ShowConnect,
+	}
 }
 
 type displayResponseState struct {
@@ -1546,6 +1716,22 @@ func NewCLI() *cobra.Command {
 
 	pushCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 
+	signinCmd := &cobra.Command{
+		Use:     "signin",
+		Short:   "Sign in to ollama.com",
+		Args:    cobra.ExactArgs(0),
+		PreRunE: checkServerHeartbeat,
+		RunE:    SigninHandler,
+	}
+
+	signoutCmd := &cobra.Command{
+		Use:     "signout",
+		Short:   "Sign out from ollama.com",
+		Args:    cobra.ExactArgs(0),
+		PreRunE: checkServerHeartbeat,
+		RunE:    SignoutHandler,
+	}
+
 	listCmd := &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
@@ -1640,6 +1826,8 @@ func NewCLI() *cobra.Command {
 		stopCmd,
 		pullCmd,
 		pushCmd,
+		signinCmd,
+		signoutCmd,
 		listCmd,
 		psCmd,
 		copyCmd,
328  cmd/cmd_test.go

@@ -3,10 +3,12 @@ package cmd
 import (
 	"bytes"
 	"encoding/json"
+	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"reflect"
 	"strings"
 	"testing"
 	"time"
@@ -304,6 +306,8 @@ func TestDeleteHandler(t *testing.T) {
 			w.WriteHeader(http.StatusOK)
 		} else {
 			w.WriteHeader(http.StatusNotFound)
+			errPayload := `{"error":"model '%s' not found"}`
+			w.Write([]byte(fmt.Sprintf(errPayload, req.Name)))
 		}
 		return
 	}
@@ -346,7 +350,7 @@
 	}
 
 	err := DeleteHandler(cmd, []string{"test-model-not-found"})
-	if err == nil || !strings.Contains(err.Error(), "unable to stop existing running model \"test-model-not-found\"") {
+	if err == nil || !strings.Contains(err.Error(), "model 'test-model-not-found' not found") {
 		t.Fatalf("DeleteHandler failed: expected error about stopping non-existent model, got %v", err)
 	}
 }
@@ -488,9 +492,35 @@ func TestPushHandler(t *testing.T) {
 					w.(http.Flusher).Flush()
 				}
 			},
+			"/api/me": func(w http.ResponseWriter, r *http.Request) {
+				if r.Method != http.MethodPost {
+					t.Errorf("expected POST request, got %s", r.Method)
+				}
+			},
 		},
 		expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
 	},
+	{
+		name:      "not signed in push",
+		modelName: "notsignedin-model",
+		serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
+			"/api/me": func(w http.ResponseWriter, r *http.Request) {
+				if r.Method != http.MethodPost {
+					t.Errorf("expected POST request, got %s", r.Method)
+				}
+				w.Header().Set("Content-Type", "application/json")
+				w.WriteHeader(http.StatusUnauthorized)
+				err := json.NewEncoder(w).Encode(map[string]string{
+					"error":      "unauthorized",
+					"signin_url": "https://somethingsomething",
+				})
+				if err != nil {
+					t.Fatal(err)
+				}
+			},
+		},
+		expectedOutput: "You need to be signed in to push",
+	},
 	{
 		name:      "unauthorized push",
 		modelName: "unauthorized-model",
@@ -499,12 +529,17 @@
 				w.Header().Set("Content-Type", "application/json")
 				w.WriteHeader(http.StatusUnauthorized)
 				err := json.NewEncoder(w).Encode(map[string]string{
-					"error": "access denied",
+					"error": "403: {\"errors\":[{\"code\":\"ACCESS DENIED\", \"message\":\"access denied\"}]}",
 				})
 				if err != nil {
 					t.Fatal(err)
 				}
 			},
+			"/api/me": func(w http.ResponseWriter, r *http.Request) {
+				if r.Method != http.MethodPost {
+					t.Errorf("expected POST request, got %s", r.Method)
+				}
+			},
 		},
 		expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
 	},
@@ -522,6 +557,10 @@
 			defer mockServer.Close()
 
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
+			tmpDir := t.TempDir()
+			t.Setenv("HOME", tmpDir)
+			t.Setenv("USERPROFILE", tmpDir)
+			initializeKeypair()
 
 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
@@ -557,7 +596,7 @@
 				t.Errorf("expected no error, got %v", err)
 			}
 			if tt.expectedOutput != "" {
-				if got := string(stdout); got != tt.expectedOutput {
+				if got := string(stdout); !strings.Contains(got, tt.expectedOutput) {
 					t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
 				}
 			}
@@ -915,3 +954,286 @@ func TestNewCreateRequest(t *testing.T) {
 		})
 	}
 }
+
+func TestRunOptions_Copy(t *testing.T) {
+	// Setup test data
+	originalKeepAlive := &api.Duration{Duration: 5 * time.Minute}
+	originalThink := &api.ThinkValue{Value: "test reasoning"}
+
+	original := runOptions{
+		Model:       "test-model",
+		ParentModel: "parent-model",
+		Prompt:      "test prompt",
+		Messages: []api.Message{
+			{Role: "user", Content: "hello"},
+			{Role: "assistant", Content: "hi there"},
+		},
+		WordWrap: true,
+		Format:   "json",
+		System:   "system prompt",
+		Images: []api.ImageData{
+			[]byte("image1"),
+			[]byte("image2"),
+		},
+		Options: map[string]any{
+			"temperature": 0.7,
+			"max_tokens":  1000,
+			"top_p":       0.9,
+		},
+		MultiModal:   true,
+		KeepAlive:    originalKeepAlive,
+		Think:        originalThink,
+		HideThinking: false,
+		ShowConnect:  true,
+	}
+
+	// Test the copy
+	copied := original.Copy()
+
+	// Test 1: Verify the copy is not the same instance
+	if &copied == &original {
+		t.Error("Copy should return a different instance")
+	}
+
+	// Test 2: Verify all fields are copied correctly
+	tests := []struct {
+		name string
+		got  interface{}
+		want interface{}
+	}{
+		{"Model", copied.Model, original.Model},
+		{"ParentModel", copied.ParentModel, original.ParentModel},
+		{"Prompt", copied.Prompt, original.Prompt},
+		{"WordWrap", copied.WordWrap, original.WordWrap},
+		{"Format", copied.Format, original.Format},
+		{"System", copied.System, original.System},
+		{"MultiModal", copied.MultiModal, original.MultiModal},
+		{"HideThinking", copied.HideThinking, original.HideThinking},
+		{"ShowConnect", copied.ShowConnect, original.ShowConnect},
+	}
+
+	for _, tt := range tests {
+		if !reflect.DeepEqual(tt.got, tt.want) {
+			t.Errorf("%s mismatch: got %v, want %v", tt.name, tt.got, tt.want)
+		}
+	}
+
+	// Test 3: Verify Messages slice is deeply copied
+	if len(copied.Messages) != len(original.Messages) {
+		t.Errorf("Messages length mismatch: got %d, want %d", len(copied.Messages), len(original.Messages))
+	}
+
+	if len(copied.Messages) > 0 && &copied.Messages[0] == &original.Messages[0] {
+		t.Error("Messages should be different instances")
+	}
+
+	// Modify original to verify independence
+	if len(original.Messages) > 0 {
+		originalContent := original.Messages[0].Content
+		original.Messages[0].Content = "modified"
+		if len(copied.Messages) > 0 && copied.Messages[0].Content == "modified" {
+			t.Error("Messages should be independent after copy")
+		}
+		// Restore for other tests
+		original.Messages[0].Content = originalContent
+	}
+
+	// Test 4: Verify Images slice is deeply copied
+	if len(copied.Images) != len(original.Images) {
+		t.Errorf("Images length mismatch: got %d, want %d", len(copied.Images), len(original.Images))
+	}
+
+	if len(copied.Images) > 0 && &copied.Images[0] == &original.Images[0] {
+		t.Error("Images should be different instances")
+	}
+
+	// Modify original to verify independence
+	if len(original.Images) > 0 {
+		originalImage := original.Images[0]
+		original.Images[0] = []byte("modified")
+		if len(copied.Images) > 0 && string(copied.Images[0]) == "modified" {
+			t.Error("Images should be independent after copy")
+		}
+		// Restore for other tests
+		original.Images[0] = originalImage
+	}
+
+	// Test 5: Verify Options map is deeply copied
+	if len(copied.Options) != len(original.Options) {
+		t.Errorf("Options length mismatch: got %d, want %d", len(copied.Options), len(original.Options))
+	}
+
+	if len(copied.Options) > 0 && &copied.Options == &original.Options {
+		t.Error("Options map should be different instances")
+	}
+
+	// Modify original to verify independence
+	if len(original.Options) > 0 {
+		originalTemp := original.Options["temperature"]
+		original.Options["temperature"] = 0.9
+		if copied.Options["temperature"] == 0.9 {
+			t.Error("Options should be independent after copy")
+		}
+		// Restore for other tests
+		original.Options["temperature"] = originalTemp
+	}
+
+	// Test 6: Verify KeepAlive pointer is copied (shallow copy)
+	if copied.KeepAlive != original.KeepAlive {
+		t.Error("KeepAlive pointer should be the same (shallow copy)")
+	}
+
+	// Test 7: Verify Think pointer creates a new instance
+	if original.Think != nil && copied.Think == original.Think {
+		t.Error("Think should be a different instance")
+	}
+
+	if original.Think != nil && copied.Think != nil {
+		if !reflect.DeepEqual(copied.Think.Value, original.Think.Value) {
+			t.Errorf("Think.Value mismatch: got %v, want %v", copied.Think.Value, original.Think.Value)
+		}
+	}
+
+	// Test 8: Test with zero values
+	zeroOriginal := runOptions{}
+	zeroCopy := zeroOriginal.Copy()
+
+	if !reflect.DeepEqual(zeroCopy, zeroOriginal) {
+		fmt.Printf("orig: %#v\ncopy: %#v\n", zeroOriginal, zeroCopy)
+		t.Error("Copy of zero value should equal original zero value")
+	}
+}
+
+func TestRunOptions_Copy_EmptySlicesAndMaps(t *testing.T) {
+	// Test with empty slices and maps
+	original := runOptions{
+		Messages: []api.Message{},
+		Images:   []api.ImageData{},
+		Options:  map[string]any{},
+	}
+
+	copied := original.Copy()
+
+	if copied.Messages == nil {
+		t.Error("Empty Messages slice should remain empty, not nil")
+	}
+
+	if copied.Images == nil {
+		t.Error("Empty Images slice should remain empty, not nil")
+	}
+
+	if copied.Options == nil {
+		t.Error("Empty Options map should remain empty, not nil")
+	}
+
+	if len(copied.Messages) != 0 {
+		t.Error("Empty Messages slice should remain empty")
+	}
+
+	if len(copied.Images) != 0 {
+		t.Error("Empty Images slice should remain empty")
+	}
+
+	if len(copied.Options) != 0 {
+		t.Error("Empty Options map should remain empty")
+	}
+}
+
+func TestRunOptions_Copy_NilPointers(t *testing.T) {
+	// Test with nil pointers
+	original := runOptions{
+		KeepAlive: nil,
+		Think:     nil,
+	}
+
+	copied := original.Copy()
+
+	if copied.KeepAlive != nil {
+		t.Error("Nil KeepAlive should remain nil")
+	}
+
+	if copied.Think != nil {
+		t.Error("Nil Think should remain nil")
+	}
+}
+
+func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
+	tests := []struct {
+		name  string
+		think *api.ThinkValue
+	}{
+		{"nil Think", nil},
+		{"bool true", &api.ThinkValue{Value: true}},
+		{"bool false", &api.ThinkValue{Value: false}},
+		{"string value", &api.ThinkValue{Value: "reasoning text"}},
+		{"int value", &api.ThinkValue{Value: 42}},
+		{"nil value", &api.ThinkValue{Value: nil}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			original := runOptions{Think: tt.think}
+			copied := original.Copy()
+
+			if tt.think == nil {
+				if copied.Think != nil {
+					t.Error("Nil Think should remain nil")
+				}
+				return
+			}
+
+			if copied.Think == nil {
+				t.Error("Non-nil Think should not become nil")
+				return
+			}
+
+			if copied.Think == original.Think {
+				t.Error("Think should be a different instance")
+			}
+
+			if !reflect.DeepEqual(copied.Think.Value, original.Think.Value) {
+				t.Errorf("Think.Value mismatch: got %v, want %v", copied.Think.Value, original.Think.Value)
+			}
+		})
+	}
+}
+
+func TestRunOptions_Copy_Independence(t *testing.T) {
+	// Test that modifications to original don't affect copy
+	originalThink := &api.ThinkValue{Value: "original"}
+	original := runOptions{
+		Model:    "original-model",
+		Messages: []api.Message{{Role: "user", Content: "original"}},
+		Options:  map[string]any{"key": "value"},
+		Think:    originalThink,
+	}
+
+	copied := original.Copy()
+
+	// Modify original
+	original.Model = "modified-model"
+	if len(original.Messages) > 0 {
+		original.Messages[0].Content = "modified"
+	}
+	original.Options["key"] = "modified"
+	if original.Think != nil {
+		original.Think.Value = "modified"
+	}
+
+	// Verify copy is unchanged
+	if copied.Model == "modified-model" {
+		t.Error("Copy Model should not be affected by original modification")
+	}
+
+	if len(copied.Messages) > 0 && copied.Messages[0].Content == "modified" {
+		t.Error("Copy Messages should not be affected by original modification")
+	}
+
+	if copied.Options["key"] == "modified" {
+		t.Error("Copy Options should not be affected by original modification")
+	}
+
+	if copied.Think != nil && copied.Think.Value == "modified" {
+		t.Error("Copy Think should not be affected by original modification")
+	}
+}
@@ -195,16 +195,24 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				fmt.Println("Usage:\n /load <modelname>")
 				continue
 			}
+			origOpts := opts.Copy()
+
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
 			if err != nil {
+				if strings.Contains(err.Error(), "not found") {
+					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
+					opts = origOpts.Copy()
+					continue
+				}
 				return err
 			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
-					fmt.Printf("error: %v\n", err)
+					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
+					opts = origOpts.Copy()
 					continue
 				}
 				if strings.Contains(err.Error(), "does not support thinking") {
@@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &qwen2Model{}
 	case "Qwen2_5_VLForConditionalGeneration":
 		conv = &qwen25VLModel{}
+	case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
+		conv = &qwen3VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
@@ -28,6 +28,7 @@ type bertModel struct {
 	LayerNormEPS     float32 `json:"layer_norm_eps"`
 	LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
 	NormEpsilon      float32 `json:"norm_epsilon"`
+	normalizeEmbeddings bool
 
 	PoolingType uint32
 }

@@ -54,9 +55,11 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 
 	var pooling string
 	for _, m := range modules {
-		if m.Type == "sentence_transformers.models.Pooling" {
+		switch m.Type {
+		case "sentence_transformers.models.Pooling":
 			pooling = m.Path
-			break
+		case "sentence_transformers.models.Normalize":
+			p.normalizeEmbeddings = true
 		}
 	}
 

@@ -90,6 +93,7 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
 	kv["bert.pooling_type"] = p.PoolingType
+	kv["bert.normalize_embeddings"] = p.normalizeEmbeddings
 
 	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
 
@@ -85,6 +85,19 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 			case "scales":
 				mxfp4s[name].scales = t
 			}
+		} else if strings.HasSuffix(t.Name(), "gate_up_exps.bias") {
+			// gate_up_exps is interleaved, need to split into gate_exps and up_exps
+			// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
+			out = append(out, slices.Collect(splitDim(t, 1,
+				split{
+					Replacer: strings.NewReplacer("gate_up_exps", "gate_exps"),
+					slices:   []tensor.Slice{nil, tensor.S(0, int(t.Shape()[1]), 2)},
+				},
+				split{
+					Replacer: strings.NewReplacer("gate_up_exps", "up_exps"),
+					slices:   []tensor.Slice{nil, tensor.S(1, int(t.Shape()[1]), 2)},
+				},
+			))...)
 		} else {
 			out = append(out, &ggml.Tensor{
 				Name: t.Name(),
@@ -97,17 +110,28 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 
 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
-		if !strings.HasSuffix(name, ".weight") {
-			name += ".weight"
-		}
-
+		if strings.Contains(name, "ffn_down_exps") {
 			out = append(out, &ggml.Tensor{
-				Name:     name,
+				Name:     name + ".weight",
 				Kind:     uint32(ggml.TensorTypeMXFP4),
 				Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
 				WriterTo: mxfp4,
 			})
+		} else if strings.Contains(name, "ffn_gate_up_exps") {
+			// gate_up_exps is interleaved, need to split into gate_exps and up_exps
+			// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
+			out = append(out, &ggml.Tensor{
+				Name:     strings.Replace(name, "gate_up", "gate", 1) + ".weight",
+				Kind:     uint32(ggml.TensorTypeMXFP4),
+				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
+				WriterTo: mxfp4.slice(1, 0, int(dims[1]), 2),
+			}, &ggml.Tensor{
+				Name:     strings.Replace(name, "gate_up", "up", 1) + ".weight",
+				Kind:     uint32(ggml.TensorTypeMXFP4),
+				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
+				WriterTo: mxfp4.slice(1, 1, int(dims[1]), 2),
+			})
+		}
 	}
 
 	return out
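The two hunks above both implement the interleaved gate/up split described in the comments: the gate tensor takes positions 0, 2, 4, ... along the split dimension and the up tensor takes positions 1, 3, 5, .... For intuition only, a dependency-free sketch of that even/odd de-interleave on plain Go slices (not the converter code):

package main

import "fmt"

// deinterleave splits interleaved rows into gate (even indices) and up (odd indices).
func deinterleave(rows [][]float32) (gate, up [][]float32) {
	for i, r := range rows {
		if i%2 == 0 {
			gate = append(gate, r)
		} else {
			up = append(up, r)
		}
	}
	return gate, up
}

func main() {
	fused := [][]float32{{1, 1}, {2, 2}, {3, 3}, {4, 4}}
	gate, up := deinterleave(fused)
	fmt.Println(gate) // [[1 1] [3 3]]
	fmt.Println(up)   // [[2 2] [4 4]]
}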
@@ -158,9 +182,21 @@ func (m *gptossModel) Replacements() []string {
 }
 
 type mxfp4 struct {
+	slices []tensor.Slice
+
 	blocks, scales Tensor
 }
 
+func (m *mxfp4) slice(dim, start, end, step int) *mxfp4 {
+	slice := slices.Repeat([]tensor.Slice{nil}, len(m.blocks.Shape()))
+	slice[dim] = tensor.S(start, end, step)
+	return &mxfp4{
+		slices: slice,
+		blocks: m.blocks,
+		scales: m.scales,
+	}
+}
+
 func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
 	var b bytes.Buffer
 	if _, err := m.blocks.WriteTo(&b); err != nil {
@@ -204,6 +240,13 @@ func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
 		return 0, err
 	}
 
+	if len(m.slices) > 0 {
+		out, err = out.Slice(m.slices...)
+		if err != nil {
+			return 0, err
+		}
+	}
+
 	out = tensor.Materialize(out)
 
 	if err := out.Reshape(out.Shape().TotalSize()); err != nil {
|||||||
convert/convert_qwen3.go (new file, 157 lines)
@@ -0,0 +1,157 @@

```go
package convert

import (
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/ggml"
	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
)

type qwen3Model struct {
	ModelParameters
	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
	HiddenSize            uint32  `json:"hidden_size"`
	HiddenLayers          uint32  `json:"num_hidden_layers"`
	IntermediateSize      uint32  `json:"intermediate_size"`
	NumAttentionHeads     uint32  `json:"num_attention_heads"`
	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
	HeadDim               uint32  `json:"head_dim"`
	NumExperts            uint32  `json:"num_experts"`
	NumExpertsPerToken    uint32  `json:"num_experts_per_tok"`
	NormTopkProb          bool    `json:"norm_topk_prob"`
	RopeTheta             float32 `json:"rope_theta"`
	RopeScaling           struct {
		Type                          string     `json:"type"`
		Factor                        ropeFactor `json:"factor"`
		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
		MropeSection                  []int32    `json:"mrope_section"`
	} `json:"rope_scaling"`
	RMSNormEPS float32 `json:"rms_norm_eps"`
}

// KV implements ModelConverter.
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
	arch := "qwen3"
	if q.NumExperts > 0 {
		arch += "moe"
	}

	kv := q.ModelParameters.KV(t)
	kv["general.architecture"] = arch
	kv["block_count"] = q.HiddenLayers
	kv["context_length"] = q.MaxPositionEmbeddings
	kv["embedding_length"] = q.HiddenSize
	kv["feed_forward_length"] = q.IntermediateSize
	kv["attention.head_count"] = q.NumAttentionHeads
	kv["attention.head_count_kv"] = q.NumKeyValueHeads
	kv["attention.key_length"] = q.HeadDim
	kv["attention.value_length"] = q.HeadDim

	if q.NumExperts > 0 {
		kv["expert_count"] = q.NumExperts
		kv["expert_used_count"] = q.NumExpertsPerToken
		kv["norm_top_k_prob"] = q.NormTopkProb
	}

	kv["rope.freq_base"] = q.RopeTheta
	kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS

	switch q.RopeScaling.Type {
	case "":
		// no scaling
	case "yarn":
		kv["rope.scaling.type"] = q.RopeScaling.Type
		kv["rope.scaling.factor"] = q.RopeScaling.Factor
	case "mrope", "default":
		kv["rope.mrope_section"] = q.RopeScaling.MropeSection
	default:
		panic("unknown rope scaling type")
	}

	return kv
}

// Tensors implements ModelConverter.
func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor

	// TODO: handle split experts

	for _, t := range ts {
		switch {
		case strings.Contains(t.Name(), "ffn_gate_up_exps"):
			afterFunc := func(t tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(t, 0, 2, 1) }
			for t := range splitDim(t, 2,
				split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: afterFunc},
				split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: afterFunc},
			) {
				t.Shape[1], t.Shape[2] = t.Shape[2], t.Shape[1]
				out = append(out, t)
			}
		case strings.Contains(t.Name(), "ffn_down_exps"):
			shape := slices.Clone(t.Shape())
			shape[1], shape[2] = shape[2], shape[1]
			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
				dims := make([]int, len(shape))
				for i := range shape {
					dims[i] = int(shape[i])
				}

				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
				tt, err := tensor.Transpose(tt, 0, 2, 1)
				if err != nil {
					return nil, err
				}

				// flatten tensor so it can be written as a vector
				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
					return nil, err
				}

				return native.VectorF32(tt.(*tensor.Dense))
			})
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    shape,
				WriterTo: t,
			})
		default:
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		}
	}

	return out
}

// Replacements implements ModelConverter.
func (q *qwen3Model) Replacements() []string {
	return []string{
		"lm_head", "output",
		"model.embed_tokens", "token_embd",
		"model.layers", "blk",
		"input_layernorm", "attn_norm",
		"self_attn.k_proj", "attn_k",
		"self_attn.k_norm", "attn_k_norm",
		"self_attn.v_proj", "attn_v",
		"self_attn.q_proj", "attn_q",
		"self_attn.q_norm", "attn_q_norm",
		"self_attn.o_proj", "attn_output",
		"mlp.down_proj", "ffn_down",
		"mlp.gate_proj", "ffn_gate",
		"mlp.up_proj", "ffn_up",
		"mlp.gate.weight", "ffn_gate_inp.weight",
		"mlp.experts.down_proj", "ffn_down_exps.weight",
		"mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight",
		"post_attention_layernorm", "ffn_norm",
		"model.norm", "output_norm",
	}
}

var _ ModelConverter = (*qwen3Model)(nil)
```
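The `ffn_down_exps` repacker above transposes each experts tensor and flattens it before writing. The sketch below replays that same repack step in isolation with the `pdevine/tensor` calls the converter uses; the 2x3x4 shape and the `main` wrapper are invented for illustration and are not part of the converter.

```go
package main

import (
	"fmt"

	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
)

func main() {
	// A toy tensor standing in for an experts weight laid out as
	// [experts, rows, cols]; the converter wants [experts, cols, rows].
	data := make([]float32, 2*3*4)
	for i := range data {
		data[i] = float32(i)
	}

	var tt tensor.Tensor = tensor.New(tensor.WithShape(2, 3, 4), tensor.WithBacking(data))

	// Same axis permutation the repacker applies: swap the last two dims.
	tt, err := tensor.Transpose(tt, 0, 2, 1)
	if err != nil {
		panic(err)
	}

	// Flatten so the result can be written out as a plain vector,
	// mirroring the Reshape(TotalSize()) step in the converter.
	if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
		panic(err)
	}

	v, err := native.VectorF32(tt.(*tensor.Dense))
	if err != nil {
		panic(err)
	}
	fmt.Println(len(v)) // 24
}
```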
convert/convert_qwen3vl.go (new file, 116 lines)
@@ -0,0 +1,116 @@

```go
package convert

import (
	"cmp"
	"encoding/json"
	"io/fs"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/ggml"
)

type qwen3VLModel struct {
	qwen3Model `json:"text_config"`

	VisionModel struct {
		Depth                  uint32  `json:"depth"`
		HiddenSize             uint32  `json:"hidden_size"`
		NumHeads               uint32  `json:"num_heads"`
		InChannels             uint32  `json:"in_channels"`
		PatchSize              uint32  `json:"patch_size"`
		SpatialMergeSize       uint32  `json:"spatial_merge_size"`
		WindowSize             uint32  `json:"window_size"`
		RMSNormEps             float32 `json:"layer_norm_epsilon"`
		RopeTheta              float32 `json:"rope_theta"`
		TemporalPatchSize      uint32  `json:"temporal_patch_size"`
		DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`

		Size struct {
			ShortestEdge uint32 `json:"shortest_edge"`
			LongestEdge  uint32 `json:"longest_edge"`
		} `json:"size"`

		ImageMean []float32 `json:"image_mean"`
		ImageStd  []float32 `json:"image_std"`
	} `json:"vision_config"`
}

func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
	bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
	if err != nil {
		return err
	}

	return json.Unmarshal(bts, &m.VisionModel)
}

func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
	kv := m.qwen3Model.KV(t)

	arch := "qwen3vl"
	if m.NumExperts > 0 {
		arch += "moe"
	}
	// override architecture
	kv["general.architecture"] = arch

	kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
	kv["vision.embedding_length"] = m.VisionModel.HiddenSize
	kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
	kv["vision.num_channels"] = m.VisionModel.InChannels
	kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
	kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
	kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
	kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
	kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
	kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes

	kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
	kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge

	kv["vision.image_mean"] = m.VisionModel.ImageMean
	kv["vision.image_std"] = m.VisionModel.ImageStd

	return kv
}

func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var rest []Tensor
	var out []*ggml.Tensor
	for _, t := range ts {
		switch {
		case strings.Contains(t.Name(), "attn_qkv"):
			out = append(out, slices.Collect(splitDim(t, 0,
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
			))...)
		case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
			shape := t.Shape()
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    append([]uint64{shape[0] * shape[1]}, shape[2:]...),
				WriterTo: t,
			})
		default:
			rest = append(rest, t)
		}
	}

	return append(m.qwen3Model.Tensors(rest), out...)
}

func (m *qwen3VLModel) Replacements() []string {
	return append(
		m.qwen3Model.Replacements(),
		"model.language_", "",
		"model.visual", "v",
		"patch_embed.proj", "patch_embed",
		"blocks", "blk",
		"attn.qkv", "attn_qkv",
		"attn.proj", "attn_out",
		"deepstack_merger_list", "deepstack_merger",
	)
}
```
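The vision KV entries above lean on `cmp.Or` to fall back to a default whenever a `vision_config` field is missing (and therefore zero after JSON decoding). A minimal illustration of that behavior, with made-up field values:

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	// cmp.Or returns the first non-zero argument, so an unset (zero)
	// config field falls back to the default, e.g. patch_size 14.
	var patchSize uint32 // missing from vision_config
	fmt.Println(cmp.Or(patchSize, 14)) // 14

	depth := uint32(24) // present in vision_config
	fmt.Println(cmp.Or(depth, 32)) // 24
}
```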
```diff
@@ -18,6 +18,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -339,13 +340,8 @@ func TestConvertAdapter(t *testing.T) {
 			}
 
 			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
-			for _, k := range slices.Sorted(maps.Keys(c.Expected)) {
-				if v, ok := actual[k]; !ok {
-					t.Errorf("missing %s", k)
-				} else if v != c.Expected[k] {
-					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
-				}
-			}
+			if diff := cmp.Diff(c.Expected, actual); diff != "" {
+				t.Errorf("mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
```
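The hunk above swaps a per-key comparison loop for a single `cmp.Diff` call from google/go-cmp, which reports all differing keys in one `(-want +got)` listing. A self-contained sketch of the same pattern, with invented map contents:

```go
package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

func main() {
	want := map[string]string{"general.architecture": "qwen3", "block_count": "40"}
	got := map[string]string{"general.architecture": "qwen3moe", "block_count": "40"}

	// cmp.Diff returns "" when the values are equal; otherwise the diff
	// covers every mismatch at once instead of one error per key.
	if diff := cmp.Diff(want, got); diff != "" {
		fmt.Printf("mismatch (-want +got):\n%s", diff)
	}
}
```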
```diff
@@ -96,7 +96,7 @@ type safetensor struct {
 
 func (st safetensor) Kind() uint32 {
 	kind := st.tensorBase.Kind()
-	if st.dtype == "BF16" && kind != tensorKindFP32 {
+	if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}
 
```
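The change above keeps BF16 safetensors as BF16 except for vision-tower tensors (names prefixed with "v."), which stay at the base kind. A standalone restatement of that rule; the constant values here are hypothetical stand-ins for the converter's `tensorKind*` constants, and the tensor names are made up:

```go
package main

import (
	"fmt"
	"strings"
)

const (
	kindFP32 uint32 = 0 // stand-in values, not the converter's real constants
	kindFP16 uint32 = 1
	kindBF16 uint32 = 30
)

// kindFor mirrors safetensor.Kind(): a BF16 source keeps BF16 unless the base
// kind is FP32 or the tensor belongs to the vision tower ("v." prefix).
func kindFor(name, dtype string, base uint32) uint32 {
	if !strings.HasPrefix(name, "v.") && dtype == "BF16" && base != kindFP32 {
		return kindBF16
	}
	return base
}

func main() {
	fmt.Println(kindFor("blk.0.ffn_up.weight", "BF16", kindFP16))   // 30: promoted to BF16
	fmt.Println(kindFor("v.blk.0.attn_q.weight", "BF16", kindFP16)) // 1: vision tensor keeps FP16
}
```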
```diff
@@ -230,3 +230,65 @@ func TestSafetensors(t *testing.T) {
 		})
 	}
 }
+
+func TestSafetensorKind(t *testing.T) {
+	tests := []struct {
+		name     string
+		st       safetensor
+		expected uint32
+	}{
+		{
+			name: "BF16 dtype with non-v. prefix and non-FP32 base kind should return BF16",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindBF16,
+		},
+		{
+			name: "BF16 dtype with v. prefix should return base kind",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "v.weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindFP16,
+		},
+		{
+			name: "BF16 dtype with FP32 base kind should return FP32",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10}, // will default to FP32
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindFP32,
+		},
+		{
+			name: "Non-BF16 dtype should return base kind",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "FP16",
+			},
+			expected: tensorKindFP16,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tt.st.Kind()
+			if result != tt.expected {
+				t.Errorf("Kind() = %d, expected %d", result, tt.expected)
+			}
+		})
+	}
+}
```
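The new table-driven test exercises exactly the branch added to `Kind()` above, and should run in isolation with something like `go test -run TestSafetensorKind ./convert/` (the package path is assumed from the file locations in this change).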
```diff
@@ -17,9 +17,10 @@ import (
 type split struct {
 	*strings.Replacer
 	dim int
+	slices []tensor.Slice
 
-	// fn is an optional function to apply to the tensor after slicing
-	fn func(tensor.Tensor) (tensor.Tensor, error)
+	// afterFunc is an optional function to apply to the tensor after slicing
+	afterFunc func(tensor.Tensor) (tensor.Tensor, error)
 }
 
 // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
@@ -32,9 +33,12 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 		shape := slices.Clone(t.Shape())
 		shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))
 
-		slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
-		slice[dim] = tensor.S(offset, offset+int(shape[dim]))
-		offset += int(shape[dim])
+		slice := split.slices
+		if len(slice) == 0 {
+			slice = slices.Repeat([]tensor.Slice{nil}, len(shape))
+			slice[dim] = tensor.S(offset, offset+int(shape[dim]))
+			offset += int(shape[dim])
+		}
 
 		t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
 			dims := make([]int, len(shape))
@@ -50,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 
 			tt = tensor.Materialize(tt)
 
-			if split.fn != nil {
-				tt, err = split.fn(tt)
+			if split.afterFunc != nil {
+				tt, err = split.afterFunc(tt)
 				if err != nil {
 					return nil, err
 				}
```
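The new `slices` field lets a caller hand `splitDim` an explicit `[]tensor.Slice` instead of the default contiguous, equal-width slices it builds with `tensor.S`. The runnable sketch below shows the slice semantics involved (a `tensor.S(start, end)` range plus `nil` for "keep this dimension whole"); the 4x3 data is invented and `splitDim` itself is not called since it is package-internal:

```go
package main

import (
	"fmt"

	"github.com/pdevine/tensor"
)

func main() {
	// tensor.S(start, end) is the half-open range selector splitDim builds by
	// default; a nil entry keeps a dimension whole. A caller can now pass a
	// slice set like this directly via split.slices to override that default.
	data := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
	t := tensor.New(tensor.WithShape(4, 3), tensor.WithBacking(data))

	// Take rows 0..2 of dim 0, keep all of dim 1.
	view, err := t.Slice(tensor.S(0, 2), nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(view.Shape()) // (2, 3)
}
```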
```diff
@@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) {
 	t.Run("split with transpose", func(t *testing.T) {
 		next, stop := iter.Pull(splitDim(&r, 1,
 			split{Replacer: strings.NewReplacer("a", "x")},
-			split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
+			split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) {
 				return tensor.Transpose(tt, 1, 0)
 			}},
 		))
```
@@ -1,83 +0,0 @@ (file deleted; former contents below)

```go
//go:build linux || windows

package discover

import (
	"errors"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strings"
)

// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
func rocmLibUsable(libDir string) bool {
	slog.Debug("evaluating potential rocm lib dir " + libDir)
	for _, g := range ROCmLibGlobs {
		res, _ := filepath.Glob(filepath.Join(libDir, g))
		if len(res) == 0 {
			return false
		}
	}
	return true
}

func GetSupportedGFX(libDir string) ([]string, error) {
	var ret []string
	files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
	if err != nil {
		return nil, err
	}
	for _, file := range files {
		ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
	}
	return ret, nil
}

func commonAMDValidateLibDir() (string, error) {
	// Favor our bundled version

	// Installer payload location if we're running the installed binary
	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
	if rocmLibUsable(rocmTargetDir) {
		slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
		return rocmTargetDir, nil
	}

	// Prefer explicit HIP env var
	hipPath := os.Getenv("HIP_PATH")
	if hipPath != "" {
		hipLibDir := filepath.Join(hipPath, "bin")
		if rocmLibUsable(hipLibDir) {
			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
			return hipLibDir, nil
		}
	}

	// Scan the LD_LIBRARY_PATH or PATH
	pathEnv := "LD_LIBRARY_PATH"
	if runtime.GOOS == "windows" {
		pathEnv = "PATH"
	}

	paths := os.Getenv(pathEnv)
	for _, path := range filepath.SplitList(paths) {
		d, err := filepath.Abs(path)
		if err != nil {
			continue
		}
		if rocmLibUsable(d) {
			return d, nil
		}
	}

	// Well known location(s)
	for _, path := range RocmStandardLocations {
		if rocmLibUsable(path) {
			return path, nil
		}
	}

	return "", errors.New("no suitable rocm found, falling back to CPU")
}
@@ -1,147 +0,0 @@ (file deleted)

(Removed: the Windows HIP wrapper in package discover. It loaded amdhip64_6.dll via windows.LoadLibrary, resolved hipGetDeviceCount, hipGetDeviceProperties, hipMemGetInfo, hipSetDevice, and hipDriverGetVersion with GetProcAddress, and exposed them through the HipLib type (NewHipLib, Release, AMDDriverVersion, HipGetDeviceCount, HipSetDevice, HipGetDeviceProperties, HipMemGetInfo) along with the hipDevicePropMinimal struct and the hipSuccess/hipErrorNoDevice status constants. All 147 lines are deleted by this change.)
@@ -1,549 +0,0 @@ (file deleted)

(Removed: the Linux AMD/ROCm discovery logic in package discover. AMDGetGPUInfo walked /sys/class/kfd/kfd/topology/nodes/*/properties, mapped each amdgpu node to its DRM sysfs directory to read mem_info_vram_total and mem_info_vram_used, honored HIP_VISIBLE_DEVICES, ROCR_VISIBLE_DEVICES, and GPU_DEVICE_ORDINAL filtering, validated the ROCm library directory and supported gfx targets, and skipped iGPUs and gfx000 devices. The file also provided AMDDetected, AMDValidateLibDir, AMDDriverVersion, RocmGPUInfoList.RefreshFreeMemory, getFreeMemory, verifyKFDDriverAccess, and rocmGetVisibleDevicesEnv (which emitted ROCR_VISIBLE_DEVICES). All 549 lines are deleted by this change.)
@@ -1,226 +0,0 @@ (file deleted)

(Removed: the Windows AMD/ROCm discovery logic in package discover. AMDGetGPUInfo enumerated devices through HipLib (device count, properties, and memory info), filtered iGPUs by the "AMD 2099 Graphics" name and IGPUMemLimit, skipped unsupported gfx targets after stripping target features, marked free-memory reporting as unreliable pending ROCm v6.2, and recorded DependencyPath and driver versions. The file also provided AMDValidateLibDir, RocmGPUInfoList.RefreshFreeMemory, and rocmGetVisibleDevicesEnv (which emitted HIP_VISIBLE_DEVICES). All 226 lines are deleted by this change.)
@@ -1,24 +0,0 @@ (file deleted; former contents below)

```go
package discover

import (
	"os"
	"path/filepath"
	"runtime"
	"strings"
)

func IsNUMA() bool {
	if runtime.GOOS != "linux" {
		// numa support in llama.cpp is linux only
		return false
	}
	ids := map[string]any{}
	packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
	for _, packageId := range packageIds {
		id, err := os.ReadFile(packageId)
		if err == nil {
			ids[strings.TrimSpace(string(id))] = struct{}{}
		}
	}
	return len(ids) > 1
}
```
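Note that `IsNUMA` is not dropped by this deletion: the `@@ -194,5 +156,17 @@` hunk below re-adds it, without the `runtime.GOOS` guard, next to `linuxCPUDetails` in the Linux-only CPU source.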
@@ -4,7 +4,9 @@ import (
 	"bufio"
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
+	"path/filepath"
 	"reflect"
 	"regexp"
 	"sort"
@@ -13,47 +15,6 @@ import (
 	"github.com/ollama/ollama/format"
 )
 
-var CudartGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var NvmlGlobs = []string{}
-
-var NvcudaGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var OneapiGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
-var (
-	CudartMgmtName = "libcudart.so*"
-	NvcudaMgmtName = "libcuda.so*"
-	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so*"
-)
-
 func GetCPUMem() (memInfo, error) {
 	var mem memInfo
 	var total, available, free, buffers, cached, freeSwap uint64
@@ -106,16 +67,17 @@ type linuxCpuInfo struct {
 	CoreID string `cpuinfo:"core id"`
 }
 
-func GetCPUDetails() ([]CPU, error) {
+func GetCPUDetails() []CPU {
 	file, err := os.Open(CpuInfoFilename)
 	if err != nil {
-		return nil, err
+		slog.Warn("failed to get CPU details", "error", err)
+		return nil
 	}
 	defer file.Close()
 	return linuxCPUDetails(file)
 }
 
-func linuxCPUDetails(file io.Reader) ([]CPU, error) {
+func linuxCPUDetails(file io.Reader) []CPU {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
@@ -194,5 +156,17 @@ func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	for _, k := range keys {
 		result = append(result, *socketByID[k])
 	}
-	return result, nil
+	return result
+}
+
+func IsNUMA() bool {
+	ids := map[string]any{}
+	packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
+	for _, packageId := range packageIds {
+		id, err := os.ReadFile(packageId)
+		if err == nil {
+			ids[strings.TrimSpace(string(id))] = struct{}{}
+		}
+	}
+	return len(ids) > 1
 }
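The hunks above drop the error return from GetCPUDetails and linuxCPUDetails; failures are now logged with slog.Warn and surface to callers only as a nil slice. A minimal sketch of how a caller adapts, assuming the import path github.com/ollama/ollama/discover and the CoreCount/EfficiencyCoreCount fields that appear elsewhere in this change set; the program is illustrative and not part of the diff.

package main

import (
	"fmt"
	"runtime"

	"github.com/ollama/ollama/discover"
)

func main() {
	// No error to check any more: a nil slice means detection failed and
	// the failure was already logged inside GetCPUDetails.
	threads := 0
	for _, c := range discover.GetCPUDetails() {
		// Count only performance cores, mirroring the thread-count logic
		// introduced in GetSystemInfo by this change set.
		threads += c.CoreCount - c.EfficiencyCoreCount
	}
	if threads == 0 {
		threads = runtime.NumCPU() // same fallback the new code uses
	}
	fmt.Println("optimal thread count:", threads)
}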
@@ -2062,18 +2062,9 @@ power management:
 	for k, v := range testCases {
 		t.Run(k, func(t *testing.T) {
 			buf := bytes.NewBufferString(v.input)
-			cpus, err := linuxCPUDetails(buf)
-			if err != nil {
-				t.Fatal(err)
-			}
+			cpus := linuxCPUDetails(buf)
 
 			slog.Info("example", "scenario", k, "cpus", cpus)
-			si := SystemInfo{
-				System: CPUInfo{
-					CPUs: cpus,
-				},
-			}
-			threadCount := si.GetOptimalThreadCount()
 			if len(v.expCPUs) != len(cpus) {
 				t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
 			}
@@ -2088,10 +2079,6 @@ power management:
 					t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
 				}
 			}
-
-			if threadCount != v.expThreadCount {
-				t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
-			}
 		})
 	}
 }
@@ -26,29 +26,6 @@ var (
 	GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
)
 
-var CudartGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var (
-	CudartMgmtName = "cudart64_*.dll"
-	NvcudaMgmtName = "nvcuda.dll"
-	NvmlMgmtName   = "nvml.dll"
-	OneapiMgmtName = "ze_intel_gpu64.dll"
-)
-
 func GetCPUMem() (memInfo, error) {
 	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
 	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
@@ -122,28 +99,23 @@ func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
 }
 
 func getLogicalProcessorInformationEx() ([]byte, error) {
-	buf := make([]byte, 1)
+	buf := make([]byte, 1024)
 	bufSize := len(buf)
-	ret, _, err := GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret != 0 {
-		return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
-	}
-
-	buf = make([]byte, bufSize)
-	ret, _, err = GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret == 0 {
-		return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
+	var err error
+	for range 3 {
+		var ret uintptr
+		ret, _, err = GetLogicalProcessorInformationEx.Call(
+			uintptr(RelationAll),
+			uintptr(unsafe.Pointer(&buf[0])),
+			uintptr(unsafe.Pointer(&bufSize)),
+		)
+		if ret == 1 && bufSize <= len(buf) {
+			return buf, nil
+		}
+		buf = make([]byte, bufSize)
 	}
-	return buf, nil
+	return nil, fmt.Errorf("unable to determine CPU details: %w", err)
 }
 
 func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 	var slpi *SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
@@ -217,10 +189,11 @@ func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 	return packages
 }
 
-func GetCPUDetails() ([]CPU, error) {
+func GetCPUDetails() []CPU {
 	buf, err := getLogicalProcessorInformationEx()
 	if err != nil {
-		return nil, err
+		slog.Warn("failed to get CPU details", "error", err)
+		return nil
 	}
 	packages := processSystemLogicalProcessorInforationList(buf)
 	cpus := make([]CPU, len(packages))
@@ -230,5 +203,10 @@ func GetCPUDetails() ([]CPU, error) {
 		cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
 		cpus[i].ThreadCount = pkg.threadCount
 	}
-	return cpus, nil
+	return cpus
+}
+
+func IsNUMA() bool {
+	// numa support in ggml is linux only
+	return false
 }
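The rewritten getLogicalProcessorInformationEx above replaces the old two-step "query the required size, then fetch" sequence with a single loop that starts from a 1024-byte buffer, regrows it to whatever size Windows reports, and gives up after three attempts. The shape of that retry pattern in isolation, with queryInfo standing in for the Win32 call (the stand-in and its 4096-byte answer are made up purely for illustration):

package main

import (
	"errors"
	"fmt"
)

// queryInfo is a stand-in for a Win32-style call: it fills buf when the buffer
// is large enough, and otherwise reports the size it needs via *needed.
func queryInfo(buf []byte, needed *int) error {
	const actual = 4096
	*needed = actual
	if len(buf) < actual {
		return errors.New("insufficient buffer")
	}
	return nil
}

// fetch grows the buffer and retries a bounded number of times, mirroring the
// `for range 3` loop in the Windows CPU discovery change.
func fetch() ([]byte, error) {
	buf := make([]byte, 1024)
	var err error
	for range 3 {
		needed := len(buf)
		if err = queryInfo(buf, &needed); err == nil && needed <= len(buf) {
			return buf[:needed], nil
		}
		buf = make([]byte, needed)
	}
	return nil, fmt.Errorf("unable to determine CPU details: %w", err)
}

func main() {
	b, err := fetch()
	fmt.Println(len(b), err)
}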
@@ -1,56 +0,0 @@
-//go:build linux || windows
-
-package discover
-
-import (
-	"fmt"
-	"log/slog"
-	"os"
-	"regexp"
-	"runtime"
-	"strconv"
-	"strings"
-)
-
-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
-func cudaVariant(gpuInfo CudaGPUInfo) string {
-	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-		if CudaTegra != "" {
-			ver := strings.Split(CudaTegra, ".")
-			if len(ver) > 0 {
-				return "jetpack" + ver[0]
-			}
-		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-			r := regexp.MustCompile(` R(\d+) `)
-			m := r.FindSubmatch(data)
-			if len(m) != 2 {
-				slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
-			} else {
-				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-					// https://developer.nvidia.com/embedded/jetpack-archive
-					switch l4t {
-					case 35:
-						return "jetpack5"
-					case 36:
-						return "jetpack6"
-					default:
-						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-					}
-				}
-			}
-		}
-		return "sbsa"
-	}
-
-	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
-	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
-		// The detected driver is older than Feb 2023
-		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
-		return "v11"
-	}
-	return "v12"
-}
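The file deleted above carried the Jetson detection that picks a CUDA variant from the L4T release number. For reference, the mapping it encoded can be written as a small table; the helper below is only a sketch of that one piece of logic (names are illustrative) and is not how the replacement code is organized.

package main

import "fmt"

// l4tToJetpack mirrors the switch in the removed cudaVariant helper:
// L4T R35 corresponds to JetPack 5 and R36 to JetPack 6; anything else was
// logged as unsupported. The table form is illustrative only.
var l4tToJetpack = map[int]string{
	35: "jetpack5",
	36: "jetpack6",
}

func variantForL4T(l4t int) string {
	if v, ok := l4tToJetpack[l4t]; ok {
		return v
	}
	// The removed code fell through to "sbsa" on arm64 Linux when the
	// release was unrecognized.
	return "sbsa"
}

func main() {
	fmt.Println(variantForL4T(36)) // jetpack6
}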
discover/gpu.go
@@ -1,722 +1,73 @@
|
|||||||
//go:build linux || windows
|
|
||||||
|
|
||||||
package discover
|
package discover
|
||||||
|
|
||||||
/*
|
|
||||||
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
|
||||||
#cgo windows LDFLAGS: -lpthread
|
|
||||||
|
|
||||||
#include "gpu_info.h"
|
|
||||||
*/
|
|
||||||
import "C"
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"regexp"
|
||||||
"runtime"
|
"runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"unsafe"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/format"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type cudaHandles struct {
|
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||||
deviceCount int
|
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||||
cudart *C.cudart_handle_t
|
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||||
nvcuda *C.nvcuda_handle_t
|
|
||||||
nvml *C.nvml_handle_t
|
|
||||||
}
|
|
||||||
|
|
||||||
type oneapiHandles struct {
|
// GetSystemInfo returns the last cached state of the GPUs on the system
|
||||||
oneapi *C.oneapi_handle_t
|
func GetSystemInfo() ml.SystemInfo {
|
||||||
deviceCount int
|
memInfo, err := GetCPUMem()
|
||||||
}
|
|
||||||
|
|
||||||
const (
|
|
||||||
cudaMinimumMemory = 457 * format.MebiByte
|
|
||||||
rocmMinimumMemory = 457 * format.MebiByte
|
|
||||||
// TODO OneAPI minimum memory
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
gpuMutex sync.Mutex
|
|
||||||
bootstrapped bool
|
|
||||||
cpus []CPUInfo
|
|
||||||
cudaGPUs []CudaGPUInfo
|
|
||||||
nvcudaLibPath string
|
|
||||||
cudartLibPath string
|
|
||||||
oneapiLibPath string
|
|
||||||
nvmlLibPath string
|
|
||||||
rocmGPUs []RocmGPUInfo
|
|
||||||
oneapiGPUs []OneapiGPUInfo
|
|
||||||
|
|
||||||
// If any discovered GPUs are incompatible, report why
|
|
||||||
unsupportedGPUs []UnsupportedGPUInfo
|
|
||||||
|
|
||||||
// Keep track of errors during bootstrapping so that if GPUs are missing
|
|
||||||
// they expected to be present this may explain why
|
|
||||||
bootstrapErrors []error
|
|
||||||
)
|
|
||||||
|
|
||||||
// With our current CUDA compile flags, older than 5.0 will not work properly
|
|
||||||
// (string values used to allow ldflags overrides at build time)
|
|
||||||
var (
|
|
||||||
CudaComputeMajorMin = "5"
|
|
||||||
CudaComputeMinorMin = "0"
|
|
||||||
)
|
|
||||||
//change valute from 9 to 8 would release the gfx version limits ,refer to https://github.com/likelovewant/ollama-for-amd/issues/51
|
|
||||||
var RocmComputeMajorMin = "8"
|
|
||||||
|
|
||||||
// TODO find a better way to detect iGPU instead of minimum memory
|
|
||||||
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
|
||||||
|
|
||||||
// Note: gpuMutex must already be held
|
|
||||||
func initCudaHandles() *cudaHandles {
|
|
||||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
|
||||||
|
|
||||||
cHandles := &cudaHandles{}
|
|
||||||
// Short Circuit if we already know which library to use
|
|
||||||
// ignore bootstrap errors in this case since we already recorded them
|
|
||||||
if nvmlLibPath != "" {
|
|
||||||
cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
if nvcudaLibPath != "" {
|
|
||||||
cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
if cudartLibPath != "" {
|
|
||||||
cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("searching for GPU discovery libraries for NVIDIA")
|
|
||||||
var cudartMgmtPatterns []string
|
|
||||||
|
|
||||||
// Aligned with driver, we can't carry as payloads
|
|
||||||
nvcudaMgmtPatterns := NvcudaGlobs
|
|
||||||
cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
|
|
||||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
|
|
||||||
|
|
||||||
if len(NvmlGlobs) > 0 {
|
|
||||||
nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
|
|
||||||
if len(nvmlLibPaths) > 0 {
|
|
||||||
nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
|
|
||||||
if nvml != nil {
|
|
||||||
slog.Debug("nvidia-ml loaded", "library", libPath)
|
|
||||||
cHandles.nvml = nvml
|
|
||||||
nvmlLibPath = libPath
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
bootstrapErrors = append(bootstrapErrors, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
|
|
||||||
if len(nvcudaLibPaths) > 0 {
|
|
||||||
deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
|
|
||||||
if nvcuda != nil {
|
|
||||||
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
|
|
||||||
cHandles.nvcuda = nvcuda
|
|
||||||
cHandles.deviceCount = deviceCount
|
|
||||||
nvcudaLibPath = libPath
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
bootstrapErrors = append(bootstrapErrors, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
|
|
||||||
if len(cudartLibPaths) > 0 {
|
|
||||||
deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
|
|
||||||
if cudart != nil {
|
|
||||||
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
|
|
||||||
cHandles.cudart = cudart
|
|
||||||
cHandles.deviceCount = deviceCount
|
|
||||||
cudartLibPath = libPath
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
bootstrapErrors = append(bootstrapErrors, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return cHandles
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: gpuMutex must already be held
|
|
||||||
func initOneAPIHandles() *oneapiHandles {
|
|
||||||
oHandles := &oneapiHandles{}
|
|
||||||
|
|
||||||
// Short Circuit if we already know which library to use
|
|
||||||
// ignore bootstrap errors in this case since we already recorded them
|
|
||||||
if oneapiLibPath != "" {
|
|
||||||
oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
|
|
||||||
return oHandles
|
|
||||||
}
|
|
||||||
|
|
||||||
oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
|
|
||||||
if len(oneapiLibPaths) > 0 {
|
|
||||||
var err error
|
|
||||||
oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
|
|
||||||
if err != nil {
|
|
||||||
bootstrapErrors = append(bootstrapErrors, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return oHandles
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetCPUInfo() GpuInfoList {
|
|
||||||
gpuMutex.Lock()
|
|
||||||
if !bootstrapped {
|
|
||||||
gpuMutex.Unlock()
|
|
||||||
GetGPUInfo()
|
|
||||||
} else {
|
|
||||||
gpuMutex.Unlock()
|
|
||||||
}
|
|
||||||
return GpuInfoList{cpus[0].GpuInfo}
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetGPUInfo() GpuInfoList {
|
|
||||||
// TODO - consider exploring lspci (and equivalent on windows) to check for
|
|
||||||
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
|
|
||||||
gpuMutex.Lock()
|
|
||||||
defer gpuMutex.Unlock()
|
|
||||||
needRefresh := true
|
|
||||||
var cHandles *cudaHandles
|
|
||||||
var oHandles *oneapiHandles
|
|
||||||
defer func() {
|
|
||||||
if cHandles != nil {
|
|
||||||
if cHandles.cudart != nil {
|
|
||||||
C.cudart_release(*cHandles.cudart)
|
|
||||||
}
|
|
||||||
if cHandles.nvcuda != nil {
|
|
||||||
C.nvcuda_release(*cHandles.nvcuda)
|
|
||||||
}
|
|
||||||
if cHandles.nvml != nil {
|
|
||||||
C.nvml_release(*cHandles.nvml)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if oHandles != nil {
|
|
||||||
if oHandles.oneapi != nil {
|
|
||||||
// TODO - is this needed?
|
|
||||||
C.oneapi_release(*oHandles.oneapi)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
if !bootstrapped {
|
|
||||||
slog.Info("looking for compatible GPUs")
|
|
||||||
cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
|
|
||||||
}
|
|
||||||
cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
|
|
||||||
}
|
|
||||||
bootstrapErrors = []error{}
|
|
||||||
needRefresh = false
|
|
||||||
var memInfo C.mem_info_t
|
|
||||||
|
|
||||||
mem, err := GetCPUMem()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("error looking up system memory", "error", err)
|
slog.Warn("error looking up system memory", "error", err)
|
||||||
}
|
}
|
||||||
|
var threadCount int
|
||||||
details, err := GetCPUDetails()
|
cpus := GetCPUDetails()
|
||||||
if err != nil {
|
for _, c := range cpus {
|
||||||
slog.Warn("failed to lookup CPU details", "error", err)
|
threadCount += c.CoreCount - c.EfficiencyCoreCount
|
||||||
}
|
|
||||||
cpus = []CPUInfo{
|
|
||||||
{
|
|
||||||
GpuInfo: GpuInfo{
|
|
||||||
memInfo: mem,
|
|
||||||
Library: "cpu",
|
|
||||||
ID: "0",
|
|
||||||
},
|
|
||||||
CPUs: details,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load ALL libraries
|
if threadCount == 0 {
|
||||||
cHandles = initCudaHandles()
|
// Fall back to Go's num CPU
|
||||||
|
threadCount = runtime.NumCPU()
|
||||||
// NVIDIA
|
|
||||||
for i := range cHandles.deviceCount {
|
|
||||||
if cHandles.cudart != nil || cHandles.nvcuda != nil {
|
|
||||||
gpuInfo := CudaGPUInfo{
|
|
||||||
GpuInfo: GpuInfo{
|
|
||||||
Library: "cuda",
|
|
||||||
},
|
|
||||||
index: i,
|
|
||||||
}
|
}
|
||||||
var driverMajor int
|
|
||||||
var driverMinor int
|
return ml.SystemInfo{
|
||||||
if cHandles.cudart != nil {
|
ThreadCount: threadCount,
|
||||||
C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
|
TotalMemory: memInfo.TotalMemory,
|
||||||
driverMajor = int(cHandles.cudart.driver_major)
|
FreeMemory: memInfo.FreeMemory,
|
||||||
driverMinor = int(cHandles.cudart.driver_minor)
|
FreeSwap: memInfo.FreeSwap,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func cudaJetpack() string {
|
||||||
|
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
|
||||||
|
if CudaTegra != "" {
|
||||||
|
ver := strings.Split(CudaTegra, ".")
|
||||||
|
if len(ver) > 0 {
|
||||||
|
return "jetpack" + ver[0]
|
||||||
|
}
|
||||||
|
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
|
||||||
|
r := regexp.MustCompile(` R(\d+) `)
|
||||||
|
m := r.FindSubmatch(data)
|
||||||
|
if len(m) != 2 {
|
||||||
|
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
|
||||||
} else {
|
} else {
|
||||||
C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
|
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
|
||||||
driverMajor = int(cHandles.nvcuda.driver_major)
|
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
|
||||||
driverMinor = int(cHandles.nvcuda.driver_minor)
|
// https://developer.nvidia.com/embedded/jetpack-archive
|
||||||
}
|
switch l4t {
|
||||||
if memInfo.err != nil {
|
case 35:
|
||||||
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
return "jetpack5"
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
case 36:
|
||||||
continue
|
return "jetpack6"
|
||||||
}
|
|
||||||
gpuInfo.TotalMemory = uint64(memInfo.total)
|
|
||||||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
|
||||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
|
||||||
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
|
|
||||||
gpuInfo.computeMajor = int(memInfo.major)
|
|
||||||
gpuInfo.computeMinor = int(memInfo.minor)
|
|
||||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
|
||||||
gpuInfo.DriverMajor = driverMajor
|
|
||||||
gpuInfo.DriverMinor = driverMinor
|
|
||||||
variant := cudaVariant(gpuInfo)
|
|
||||||
|
|
||||||
// Start with our bundled libraries
|
|
||||||
if variant != "" {
|
|
||||||
variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
|
|
||||||
if _, err := os.Stat(variantPath); err == nil {
|
|
||||||
// Put the variant directory first in the search path to avoid runtime linking to the wrong library
|
|
||||||
gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
|
||||||
gpuInfo.Variant = variant
|
|
||||||
|
|
||||||
if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
|
|
||||||
unsupportedGPUs = append(unsupportedGPUs,
|
|
||||||
UnsupportedGPUInfo{
|
|
||||||
GpuInfo: gpuInfo.GpuInfo,
|
|
||||||
})
|
|
||||||
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// query the management library as well so we can record any skew between the two
|
|
||||||
// which represents overhead on the GPU we must set aside on subsequent updates
|
|
||||||
if cHandles.nvml != nil {
|
|
||||||
uuid := C.CString(gpuInfo.ID)
|
|
||||||
defer C.free(unsafe.Pointer(uuid))
|
|
||||||
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
|
||||||
if memInfo.err != nil {
|
|
||||||
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
|
||||||
} else {
|
|
||||||
if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
|
|
||||||
gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
|
|
||||||
slog.Info("detected OS VRAM overhead",
|
|
||||||
"id", gpuInfo.ID,
|
|
||||||
"library", gpuInfo.Library,
|
|
||||||
"compute", gpuInfo.Compute,
|
|
||||||
"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
|
|
||||||
"name", gpuInfo.Name,
|
|
||||||
"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
|
||||||
cudaGPUs = append(cudaGPUs, gpuInfo)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Intel
|
|
||||||
if envconfig.IntelGPU() {
|
|
||||||
oHandles = initOneAPIHandles()
|
|
||||||
if oHandles != nil && oHandles.oneapi != nil {
|
|
||||||
for d := range oHandles.oneapi.num_drivers {
|
|
||||||
if oHandles.oneapi == nil {
|
|
||||||
// shouldn't happen
|
|
||||||
slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
|
|
||||||
for i := range devCount {
|
|
||||||
gpuInfo := OneapiGPUInfo{
|
|
||||||
GpuInfo: GpuInfo{
|
|
||||||
Library: "oneapi",
|
|
||||||
},
|
|
||||||
driverIndex: int(d),
|
|
||||||
gpuIndex: int(i),
|
|
||||||
}
|
|
||||||
// TODO - split bootstrapping from updating free memory
|
|
||||||
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
|
|
||||||
// TODO - convert this to MinimumMemory based on testing...
|
|
||||||
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
|
|
||||||
memInfo.free = C.uint64_t(totalFreeMem)
|
|
||||||
gpuInfo.TotalMemory = uint64(memInfo.total)
|
|
||||||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
|
||||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
|
||||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
|
||||||
gpuInfo.DependencyPath = []string{LibOllamaPath}
|
|
||||||
oneapiGPUs = append(oneapiGPUs, gpuInfo)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
rocmGPUs, err = AMDGetGPUInfo()
|
|
||||||
|
|
||||||
// The ID field is used in context of the filtered set of GPUS
|
|
||||||
// so we have to replace any of these numeric IDs with their
|
|
||||||
// placement in this set of GPUs
|
|
||||||
for i := range rocmGPUs {
|
|
||||||
if _, err := strconv.Atoi(rocmGPUs[i].ID); err == nil {
|
|
||||||
rocmGPUs[i].ID = strconv.Itoa(i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
bootstrapErrors = append(bootstrapErrors, err)
|
|
||||||
}
|
|
||||||
bootstrapped = true
|
|
||||||
if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
|
|
||||||
slog.Info("no compatible GPUs were discovered")
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
|
|
||||||
}
|
|
||||||
|
|
||||||
// For detected GPUs, load library if not loaded
|
|
||||||
|
|
||||||
// Refresh free memory usage
|
|
||||||
if needRefresh {
|
|
||||||
mem, err := GetCPUMem()
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("error looking up system memory", "error", err)
|
|
||||||
} else {
|
|
||||||
slog.Debug("updating system memory data",
|
|
||||||
slog.Group(
|
|
||||||
"before",
|
|
||||||
"total", format.HumanBytes2(cpus[0].TotalMemory),
|
|
||||||
"free", format.HumanBytes2(cpus[0].FreeMemory),
|
|
||||||
"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
|
|
||||||
),
|
|
||||||
slog.Group(
|
|
||||||
"now",
|
|
||||||
"total", format.HumanBytes2(mem.TotalMemory),
|
|
||||||
"free", format.HumanBytes2(mem.FreeMemory),
|
|
||||||
"free_swap", format.HumanBytes2(mem.FreeSwap),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
cpus[0].FreeMemory = mem.FreeMemory
|
|
||||||
cpus[0].FreeSwap = mem.FreeSwap
|
|
||||||
}
|
|
||||||
|
|
||||||
var memInfo C.mem_info_t
|
|
||||||
if cHandles == nil && len(cudaGPUs) > 0 {
|
|
||||||
cHandles = initCudaHandles()
|
|
||||||
}
|
|
||||||
for i, gpu := range cudaGPUs {
|
|
||||||
if cHandles.nvml != nil {
|
|
||||||
uuid := C.CString(gpu.ID)
|
|
||||||
defer C.free(unsafe.Pointer(uuid))
|
|
||||||
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
|
||||||
} else if cHandles.cudart != nil {
|
|
||||||
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
|
|
||||||
} else if cHandles.nvcuda != nil {
|
|
||||||
C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
|
|
||||||
memInfo.used = memInfo.total - memInfo.free
|
|
||||||
} else {
|
|
||||||
// shouldn't happen
|
|
||||||
slog.Warn("no valid cuda library loaded to refresh vram usage")
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if memInfo.err != nil {
|
|
||||||
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if memInfo.free == 0 {
|
|
||||||
slog.Warn("error looking up nvidia GPU memory")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if cHandles.nvml != nil && gpu.OSOverhead > 0 {
|
|
||||||
// When using the management library update based on recorded overhead
|
|
||||||
memInfo.free -= C.uint64_t(gpu.OSOverhead)
|
|
||||||
}
|
|
||||||
slog.Debug("updating cuda memory data",
|
|
||||||
"gpu", gpu.ID,
|
|
||||||
"name", gpu.Name,
|
|
||||||
"overhead", format.HumanBytes2(gpu.OSOverhead),
|
|
||||||
slog.Group(
|
|
||||||
"before",
|
|
||||||
"total", format.HumanBytes2(gpu.TotalMemory),
|
|
||||||
"free", format.HumanBytes2(gpu.FreeMemory),
|
|
||||||
),
|
|
||||||
slog.Group(
|
|
||||||
"now",
|
|
||||||
"total", format.HumanBytes2(uint64(memInfo.total)),
|
|
||||||
"free", format.HumanBytes2(uint64(memInfo.free)),
|
|
||||||
"used", format.HumanBytes2(uint64(memInfo.used)),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
cudaGPUs[i].FreeMemory = uint64(memInfo.free)
|
|
||||||
}
|
|
||||||
|
|
||||||
if oHandles == nil && len(oneapiGPUs) > 0 {
|
|
||||||
oHandles = initOneAPIHandles()
|
|
||||||
}
|
|
||||||
for i, gpu := range oneapiGPUs {
|
|
||||||
if oHandles.oneapi == nil {
|
|
||||||
// shouldn't happen
|
|
||||||
slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
|
|
||||||
// TODO - convert this to MinimumMemory based on testing...
|
|
||||||
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
|
|
||||||
memInfo.free = C.uint64_t(totalFreeMem)
|
|
||||||
oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
|
|
||||||
}
|
|
||||||
|
|
||||||
err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
|
|
||||||
if err != nil {
|
|
||||||
slog.Debug("problem refreshing ROCm free memory", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
resp := []GpuInfo{}
|
|
||||||
for _, gpu := range cudaGPUs {
|
|
||||||
resp = append(resp, gpu.GpuInfo)
|
|
||||||
}
|
|
||||||
for _, gpu := range rocmGPUs {
|
|
||||||
resp = append(resp, gpu.GpuInfo)
|
|
||||||
}
|
|
||||||
for _, gpu := range oneapiGPUs {
|
|
||||||
resp = append(resp, gpu.GpuInfo)
|
|
||||||
}
|
|
||||||
if len(resp) == 0 {
|
|
||||||
resp = append(resp, cpus[0].GpuInfo)
|
|
||||||
}
|
|
||||||
return resp
|
|
||||||
}
|
|
||||||
|
|
||||||
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
|
||||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
|
||||||
gpuLibPaths := []string{}
|
|
||||||
slog.Debug("Searching for GPU library", "name", baseLibName)
|
|
||||||
|
|
||||||
// search our bundled libraries first
|
|
||||||
patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}
|
|
||||||
|
|
||||||
var ldPaths []string
|
|
||||||
switch runtime.GOOS {
|
|
||||||
case "windows":
|
|
||||||
ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
|
|
||||||
case "linux":
|
|
||||||
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
|
|
||||||
}
|
|
||||||
|
|
||||||
// then search the system's LD_LIBRARY_PATH
|
|
||||||
for _, p := range ldPaths {
|
|
||||||
p, err := filepath.Abs(p)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
patterns = append(patterns, filepath.Join(p, baseLibName))
|
|
||||||
}
|
|
||||||
|
|
||||||
// finally, search the default patterns provided by the caller
|
|
||||||
patterns = append(patterns, defaultPatterns...)
|
|
||||||
slog.Debug("gpu library search", "globs", patterns)
|
|
||||||
for _, pattern := range patterns {
|
|
||||||
// Nvidia PhysX known to return bogus results
|
|
||||||
if strings.Contains(pattern, "PhysX") {
|
|
||||||
slog.Debug("skipping PhysX cuda library path", "path", pattern)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Ignore glob discovery errors
|
|
||||||
matches, _ := filepath.Glob(pattern)
|
|
||||||
for _, match := range matches {
|
|
||||||
// Resolve any links so we don't try the same lib multiple times
|
|
||||||
// and weed out any dups across globs
|
|
||||||
libPath := match
|
|
||||||
tmp := match
|
|
||||||
var err error
|
|
||||||
for ; err == nil; tmp, err = os.Readlink(libPath) {
|
|
||||||
if !filepath.IsAbs(tmp) {
|
|
||||||
tmp = filepath.Join(filepath.Dir(libPath), tmp)
|
|
||||||
}
|
|
||||||
libPath = tmp
|
|
||||||
}
|
|
||||||
new := true
|
|
||||||
for _, cmp := range gpuLibPaths {
|
|
||||||
if cmp == libPath {
|
|
||||||
new = false
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if new {
|
|
||||||
gpuLibPaths = append(gpuLibPaths, libPath)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
|
|
||||||
return gpuLibPaths
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bootstrap the runtime library
|
|
||||||
// Returns: num devices, handle, libPath, error
|
|
||||||
func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
|
|
||||||
var resp C.cudart_init_resp_t
|
|
||||||
resp.ch.verbose = getVerboseState()
|
|
||||||
var err error
|
|
||||||
for _, libPath := range cudartLibPaths {
|
|
||||||
lib := C.CString(libPath)
|
|
||||||
defer C.free(unsafe.Pointer(lib))
|
|
||||||
C.cudart_init(lib, &resp)
|
|
||||||
if resp.err != nil {
|
|
||||||
err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
|
|
||||||
slog.Debug(err.Error())
|
|
||||||
C.free(unsafe.Pointer(resp.err))
|
|
||||||
} else {
|
|
||||||
err = nil
|
|
||||||
return int(resp.num_devices), &resp.ch, libPath, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, nil, "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bootstrap the driver library
|
|
||||||
// Returns: num devices, handle, libPath, error
|
|
||||||
func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
|
|
||||||
var resp C.nvcuda_init_resp_t
|
|
||||||
resp.ch.verbose = getVerboseState()
|
|
||||||
var err error
|
|
||||||
for _, libPath := range nvcudaLibPaths {
|
|
||||||
lib := C.CString(libPath)
|
|
||||||
defer C.free(unsafe.Pointer(lib))
|
|
||||||
C.nvcuda_init(lib, &resp)
|
|
||||||
if resp.err != nil {
|
|
||||||
// Decide what log level based on the type of error message to help users understand why
|
|
||||||
switch resp.cudaErr {
|
|
||||||
case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
|
|
||||||
err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
|
|
||||||
slog.Warn(err.Error())
|
|
||||||
case C.CUDA_ERROR_NO_DEVICE:
|
|
||||||
err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
|
|
||||||
slog.Info(err.Error())
|
|
||||||
case C.CUDA_ERROR_UNKNOWN:
|
|
||||||
err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
|
|
||||||
slog.Warn(err.Error())
|
|
||||||
default:
|
default:
|
||||||
msg := C.GoString(resp.err)
|
// Newer Jetson systems use the SBSU runtime
|
||||||
if strings.Contains(msg, "wrong ELF class") {
|
slog.Debug("unrecognized L4T version", "nv_tegra_release", string(data))
|
||||||
slog.Debug("skipping 32bit library", "library", libPath)
|
|
||||||
} else {
|
|
||||||
err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
|
|
||||||
slog.Info(err.Error())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
C.free(unsafe.Pointer(resp.err))
|
|
||||||
} else {
|
|
||||||
err = nil
|
|
||||||
return int(resp.num_devices), &resp.ch, libPath, err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0, nil, "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bootstrap the management library
|
|
||||||
// Returns: handle, libPath, error
|
|
||||||
func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
|
|
||||||
var resp C.nvml_init_resp_t
|
|
||||||
resp.ch.verbose = getVerboseState()
|
|
||||||
var err error
|
|
||||||
for _, libPath := range nvmlLibPaths {
|
|
||||||
lib := C.CString(libPath)
|
|
||||||
defer C.free(unsafe.Pointer(lib))
|
|
||||||
C.nvml_init(lib, &resp)
|
|
||||||
if resp.err != nil {
|
|
||||||
err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
|
|
||||||
slog.Info(err.Error())
|
|
||||||
C.free(unsafe.Pointer(resp.err))
|
|
||||||
} else {
|
|
||||||
err = nil
|
|
||||||
return &resp.ch, libPath, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil, "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
// bootstrap the Intel GPU library
|
|
||||||
// Returns: num devices, handle, libPath, error
|
|
||||||
func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
|
|
||||||
var resp C.oneapi_init_resp_t
|
|
||||||
num_devices := 0
|
|
||||||
resp.oh.verbose = getVerboseState()
|
|
||||||
var err error
|
|
||||||
for _, libPath := range oneapiLibPaths {
|
|
||||||
lib := C.CString(libPath)
|
|
||||||
defer C.free(unsafe.Pointer(lib))
|
|
||||||
C.oneapi_init(lib, &resp)
|
|
||||||
if resp.err != nil {
|
|
||||||
err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
|
|
||||||
slog.Debug(err.Error())
|
|
||||||
C.free(unsafe.Pointer(resp.err))
|
|
||||||
} else {
|
|
||||||
err = nil
|
|
||||||
for i := range resp.oh.num_drivers {
|
|
||||||
num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
|
|
||||||
}
|
|
||||||
return num_devices, &resp.oh, libPath, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, nil, "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
func getVerboseState() C.uint16_t {
|
|
||||||
if envconfig.LogLevel() < slog.LevelInfo {
|
|
||||||
return C.uint16_t(1)
|
|
||||||
}
|
|
||||||
return C.uint16_t(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given the list of GPUs this instantiation is targeted for,
|
|
||||||
// figure out the visible devices environment variable
|
|
||||||
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
|
|
||||||
if len(l) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
vd := []string{}
|
|
||||||
// Only filter the AMD GPUs at this level, let all NVIDIA devices through
|
|
||||||
if tmp := rocmGetVisibleDevicesEnv(l); tmp != "" {
|
|
||||||
vd = append(vd, tmp)
|
|
||||||
}
|
|
||||||
return vd
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetSystemInfo() SystemInfo {
|
|
||||||
gpus := GetGPUInfo()
|
|
||||||
gpuMutex.Lock()
|
|
||||||
defer gpuMutex.Unlock()
|
|
||||||
discoveryErrors := []string{}
|
|
||||||
for _, err := range bootstrapErrors {
|
|
||||||
discoveryErrors = append(discoveryErrors, err.Error())
|
|
||||||
}
|
|
||||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
|
||||||
gpus = []GpuInfo{}
|
|
||||||
}
|
|
||||||
|
|
||||||
return SystemInfo{
|
|
||||||
System: cpus[0],
|
|
||||||
GPUs: gpus,
|
|
||||||
UnsupportedGPUs: unsupportedGPUs,
|
|
||||||
DiscoveryErrors: discoveryErrors,
|
|
||||||
}
|
}
|
||||||
|
return ""
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
-//go:build darwin
-
 package discover
 
 /*
@@ -11,7 +9,6 @@ import "C"
 
 import (
 	"log/slog"
-	"runtime"
 	"syscall"
 
 	"github.com/ollama/ollama/format"
@@ -21,39 +18,6 @@ const (
 	metalMinimumMemory = 512 * format.MebiByte
 )
 
-func GetGPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	if runtime.GOARCH == "amd64" {
-		return []GpuInfo{
-			{
-				Library: "cpu",
-				memInfo: mem,
-			},
-		}
-	}
-	info := GpuInfo{
-		Library: "metal",
-		ID:      "0",
-	}
-	info.TotalMemory = uint64(C.getRecommendedMaxVRAM())
-
-	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
-	info.FreeMemory = info.TotalMemory
-
-	info.MinimumMemory = metalMinimumMemory
-	return []GpuInfo{info}
-}
-
-func GetCPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	return []GpuInfo{
-		{
-			Library: "cpu",
-			memInfo: mem,
-		},
-	}
-}
-
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
@@ -62,13 +26,7 @@ func GetCPUMem() (memInfo, error) {
 	}, nil
 }
 
-func (l GpuInfoList) GetVisibleDevicesEnv() []string {
-	// No-op on darwin
-	return nil
-}
-
-func GetSystemInfo() SystemInfo {
-	mem, _ := GetCPUMem()
+func GetCPUDetails() []CPU {
 	query := "hw.perflevel0.physicalcpu"
 	perfCores, err := syscall.SysctlUint32(query)
 	if err != nil {
@@ -81,19 +39,16 @@ func GetSystemInfo() SystemInfo {
 	query = "hw.logicalcpu"
 	logicalCores, _ := syscall.SysctlUint32(query)
 
-	return SystemInfo{
-		System: CPUInfo{
-			GpuInfo: GpuInfo{
-				memInfo: mem,
-			},
-			CPUs: []CPU{
+	return []CPU{
 		{
 			CoreCount:           int(perfCores + efficiencyCores),
 			EfficiencyCoreCount: int(efficiencyCores),
 			ThreadCount:         int(logicalCores),
 		},
-			},
-		},
-		GPUs: GetGPUInfo(),
 	}
 }
+
+func IsNUMA() bool {
+	// numa support in ggml is linux only
+	return false
+}
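The darwin GetCPUDetails above derives its core counts from sysctl. A standalone peek at the same keys, assuming hw.perflevel1.physicalcpu reports efficiency cores (that key is not visible in the hunk and is an assumption here); run on macOS:

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// hw.perflevel0 covers performance cores on Apple silicon; perflevel1
	// (assumed here) covers efficiency cores and may be absent on older
	// Intel Macs, in which case the value simply reads as zero.
	perf, _ := syscall.SysctlUint32("hw.perflevel0.physicalcpu")
	eff, _ := syscall.SysctlUint32("hw.perflevel1.physicalcpu")
	logical, _ := syscall.SysctlUint32("hw.logicalcpu")
	fmt.Printf("performance=%d efficiency=%d logical=%d\n", perf, eff, logical)
}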
@@ -1,72 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_H__
-#define __GPU_INFO_H__
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifndef _WIN32
-#include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
-#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
-#define LOAD_ERR() strdup(dlerror())
-#define UNLOAD_LIBRARY(handle) dlclose(handle)
-#else
-#include <windows.h>
-#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
-#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
-#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
-#define LOAD_ERR() ({\
-  LPSTR messageBuffer = NULL; \
-  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
-                               NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
-  char *resp = strdup(messageBuffer); \
-  LocalFree(messageBuffer); \
-  resp; \
-})
-
-#endif
-
-#ifndef LOG
-#define LOG(verbose, ...) \
-  do { \
-    if (verbose) { \
-      fprintf(stderr, __VA_ARGS__); \
-    } \
-  } while (0)
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GPU_ID_LEN 64
-#define GPU_NAME_LEN 96
-
-typedef struct mem_info {
-  char *err;  // If non-nill, caller responsible for freeing
-  char gpu_id[GPU_ID_LEN];
-  char gpu_name[GPU_NAME_LEN];
-  uint64_t total;
-  uint64_t free;
-  uint64_t used;
-
-  // Compute Capability
-  int major;
-  int minor;
-  int patch;
-} mem_info_t;
-
-void cpu_check_ram(mem_info_t *resp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#include "gpu_info_cudart.h"
-#include "gpu_info_nvcuda.h"
-#include "gpu_info_nvml.h"
-#include "gpu_info_oneapi.h"
-
-#endif  // __GPU_INFO_H__
-#endif  // __APPLE__
@@ -1,181 +0,0 @@
|
|||||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#include <inttypes.h>
|
|
||||||
#include "gpu_info_cudart.h"
|
|
||||||
|
|
||||||
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
|
||||||
cudartReturn_t ret;
|
|
||||||
resp->err = NULL;
|
|
||||||
resp->num_devices = 0;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
int i;
|
|
||||||
|
|
||||||
struct lookup {
|
|
||||||
char *s;
|
|
||||||
void **p;
|
|
||||||
} l[] = {
|
|
||||||
{"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
|
|
||||||
{"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
|
|
||||||
{"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
|
|
||||||
{"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
|
|
||||||
{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
|
|
||||||
{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
|
|
||||||
{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
|
|
||||||
{"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
|
|
||||||
{NULL, NULL},
|
|
||||||
};
|
|
||||||
|
|
||||||
resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
|
|
||||||
if (!resp->ch.handle) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
|
|
||||||
snprintf(buf, buflen,
|
|
||||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
|
||||||
cudart_lib_path, msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++) {
|
|
||||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
|
||||||
if (!*(l[i].p)) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
|
||||||
msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*resp->ch.cudaSetDevice)(0);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
|
|
||||||
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
snprintf(buf, buflen, "cudart init failure: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int version = 0;
|
|
||||||
|
|
||||||
// Report driver version if we're in verbose mode, ignore errors
|
|
||||||
ret = (*resp->ch.cudaDriverGetVersion)(&version);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
|
|
||||||
} else {
|
|
||||||
resp->ch.driver_major = version / 1000;
|
|
||||||
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
|
|
||||||
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor);
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
|
|
||||||
resp->err = NULL;
|
|
||||||
cudartMemory_t memInfo = {0,0,0};
|
|
||||||
cudartReturn_t ret;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
|
|
||||||
if (h.handle == NULL) {
|
|
||||||
resp->err = strdup("cudart handle isn't initialized");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cudaSetDevice)(i);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "cudart device failed to initialize");
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaDeviceProp_t props;
|
|
||||||
ret = (*h.cudaGetDeviceProperties)(&props, i);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
|
||||||
resp->major = 0;
|
|
||||||
resp->minor = 0;
|
|
||||||
} else {
|
|
||||||
int allNull = 1;
|
|
||||||
for (int j = 0; j < 16; j++) {
|
|
||||||
if (props.uuid.bytes[j] != 0) {
|
|
||||||
allNull = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (allNull != 0) {
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
|
||||||
} else {
|
|
||||||
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
|
|
||||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
|
||||||
props.uuid.bytes[0],
|
|
||||||
props.uuid.bytes[1],
|
|
||||||
props.uuid.bytes[2],
|
|
||||||
props.uuid.bytes[3],
|
|
||||||
props.uuid.bytes[4],
|
|
||||||
props.uuid.bytes[5],
|
|
||||||
props.uuid.bytes[6],
|
|
||||||
props.uuid.bytes[7],
|
|
||||||
props.uuid.bytes[8],
|
|
||||||
props.uuid.bytes[9],
|
|
||||||
props.uuid.bytes[10],
|
|
||||||
props.uuid.bytes[11],
|
|
||||||
props.uuid.bytes[12],
|
|
||||||
props.uuid.bytes[13],
|
|
||||||
props.uuid.bytes[14],
|
|
||||||
props.uuid.bytes[15]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
resp->major = props.major;
|
|
||||||
resp->minor = props.minor;
|
|
||||||
|
|
||||||
// TODO add other useful properties from props
|
|
||||||
}
|
|
||||||
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
|
|
||||||
if (ret != CUDART_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
resp->total = memInfo.total;
|
|
||||||
resp->free = memInfo.free;
|
|
||||||
resp->used = memInfo.used;
|
|
||||||
|
|
||||||
LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
|
|
||||||
LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
|
|
||||||
LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
|
|
||||||
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
|
|
||||||
}
|
|
||||||
|
|
||||||
void cudart_release(cudart_handle_t h) {
|
|
||||||
LOG(h.verbose, "releasing cudart library\n");
|
|
||||||
UNLOAD_LIBRARY(h.handle);
|
|
||||||
h.handle = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __APPLE__
|
|
||||||
@@ -1,145 +0,0 @@
|
|||||||
#ifndef __APPLE__
|
|
||||||
#ifndef __GPU_INFO_CUDART_H__
|
|
||||||
#define __GPU_INFO_CUDART_H__
|
|
||||||
#include "gpu_info.h"
|
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
|
||||||
typedef enum cudartReturn_enum {
|
|
||||||
CUDART_SUCCESS = 0,
|
|
||||||
CUDART_ERROR_INVALID_VALUE = 1,
|
|
||||||
CUDART_ERROR_MEMORY_ALLOCATION = 2,
|
|
||||||
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
|
|
||||||
// Other values omitted for now...
|
|
||||||
} cudartReturn_t;
|
|
||||||
|
|
||||||
typedef enum cudartDeviceAttr_enum {
|
|
||||||
cudartDevAttrComputeCapabilityMajor = 75,
|
|
||||||
cudartDevAttrComputeCapabilityMinor = 76,
|
|
||||||
|
|
||||||
// TODO - not yet wired up but may be useful for Jetson or other
|
|
||||||
// integrated GPU scenarios with shared memory
|
|
||||||
cudaDevAttrIntegrated = 18
|
|
||||||
|
|
||||||
} cudartDeviceAttr_t;
|
|
||||||
|
|
||||||
typedef void *cudartDevice_t; // Opaque is sufficient
|
|
||||||
typedef struct cudartMemory_st {
|
|
||||||
size_t total;
|
|
||||||
size_t free;
|
|
||||||
size_t used;
|
|
||||||
} cudartMemory_t;
|
|
||||||
|
|
||||||
typedef struct cudaUUID {
|
|
||||||
unsigned char bytes[16];
|
|
||||||
} cudaUUID_t;
|
|
||||||
typedef struct cudaDeviceProp {
|
|
||||||
char name[256]; /**< ASCII string identifying device */
|
|
||||||
cudaUUID_t uuid; /**< 16-byte unique identifier */
|
|
||||||
char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
|
|
||||||
unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
|
|
||||||
size_t totalGlobalMem; /**< Global memory available on device in bytes */
|
|
||||||
size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
|
|
||||||
int regsPerBlock; /**< 32-bit registers available per block */
|
|
||||||
int warpSize; /**< Warp size in threads */
|
|
||||||
size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
|
|
||||||
int maxThreadsPerBlock; /**< Maximum number of threads per block */
|
|
||||||
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
|
|
||||||
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
|
|
||||||
int clockRate; /**< Clock frequency in kilohertz */
|
|
||||||
size_t totalConstMem; /**< Constant memory available on device in bytes */
|
|
||||||
int major; /**< Major compute capability */
|
|
||||||
int minor; /**< Minor compute capability */
|
|
||||||
size_t textureAlignment; /**< Alignment requirement for textures */
|
|
||||||
size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
|
|
||||||
int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
|
|
||||||
int multiProcessorCount; /**< Number of multiprocessors on device */
|
|
||||||
int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */
|
|
||||||
int integrated; /**< Device is integrated as opposed to discrete */
|
|
||||||
int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
|
|
||||||
int computeMode; /**< Compute mode (See ::cudaComputeMode) */
|
|
||||||
int maxTexture1D; /**< Maximum 1D texture size */
|
|
||||||
int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
|
|
||||||
int maxTexture1DLinear; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
|
|
||||||
int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
|
|
||||||
int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
|
|
||||||
int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
|
|
||||||
int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
|
|
||||||
int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
|
|
||||||
int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */
|
|
||||||
int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
|
|
||||||
int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
|
|
||||||
int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
|
|
||||||
int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
|
|
||||||
int maxSurface1D; /**< Maximum 1D surface size */
|
|
||||||
int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
|
|
||||||
int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
|
|
||||||
int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
|
|
||||||
int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
|
|
||||||
int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
|
|
||||||
int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
|
|
||||||
size_t surfaceAlignment; /**< Alignment requirements for surfaces */
|
|
||||||
int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */
|
|
||||||
int ECCEnabled; /**< Device has ECC support enabled */
|
|
||||||
int pciBusID; /**< PCI bus ID of the device */
|
|
||||||
int pciDeviceID; /**< PCI device ID of the device */
|
|
||||||
int pciDomainID; /**< PCI domain ID of the device */
|
|
||||||
int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
|
|
||||||
int asyncEngineCount; /**< Number of asynchronous engines */
|
|
||||||
int unifiedAddressing; /**< Device shares a unified address space with the host */
|
|
||||||
int memoryClockRate; /**< Peak memory clock frequency in kilohertz */
|
|
||||||
int memoryBusWidth; /**< Global memory bus width in bits */
|
|
||||||
int l2CacheSize; /**< Size of L2 cache in bytes */
|
|
||||||
int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */
|
|
||||||
int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
|
|
||||||
int streamPrioritiesSupported; /**< Device supports stream priorities */
|
|
||||||
int globalL1CacheSupported; /**< Device supports caching globals in L1 */
|
|
||||||
int localL1CacheSupported; /**< Device supports caching locals in L1 */
|
|
||||||
size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
|
|
||||||
int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */
|
|
||||||
int managedMemory; /**< Device supports allocating managed memory on this system */
|
|
||||||
int isMultiGpuBoard; /**< Device is on a multi-GPU board */
|
|
||||||
int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */
|
|
||||||
int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */
|
|
||||||
int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
|
|
||||||
int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
|
|
||||||
int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */
|
|
||||||
int computePreemptionSupported; /**< Device supports Compute Preemption */
|
|
||||||
int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
|
|
||||||
int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
|
|
||||||
int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
|
|
||||||
size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */
|
|
||||||
int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
|
|
||||||
int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
|
|
||||||
int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
|
|
||||||
int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
|
|
||||||
size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
|
|
||||||
} cudaDeviceProp_t;
|
|
||||||
|
|
||||||
typedef struct cudart_handle {
|
|
||||||
void *handle;
|
|
||||||
uint16_t verbose;
|
|
||||||
int driver_major;
|
|
||||||
int driver_minor;
|
|
||||||
cudartReturn_t (*cudaSetDevice)(int device);
|
|
||||||
cudartReturn_t (*cudaDeviceSynchronize)(void);
|
|
||||||
cudartReturn_t (*cudaDeviceReset)(void);
|
|
||||||
cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
|
|
||||||
cudartReturn_t (*cudaGetDeviceCount)(int *);
|
|
||||||
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
|
|
||||||
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
|
|
||||||
cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
|
|
||||||
} cudart_handle_t;
|
|
||||||
|
|
||||||
typedef struct cudart_init_resp {
|
|
||||||
char *err; // If err is non-null handle is invalid
|
|
||||||
cudart_handle_t ch;
|
|
||||||
int num_devices;
|
|
||||||
} cudart_init_resp_t;
|
|
||||||
|
|
||||||
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
|
|
||||||
void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
|
|
||||||
// TODO - if we keep this library longer term, add cudart_get_free
|
|
||||||
void cudart_release(cudart_handle_t ch);
|
|
||||||
|
|
||||||
#endif // __GPU_INFO_CUDART_H__
|
|
||||||
#endif // __APPLE__
|
|
||||||
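The cudart handle above bundles dynamically resolved CUDA runtime entry points together with the init/bootstrap/release helpers declared at the end of the header. A minimal usage sketch follows; the library name, the call order, and the printed fields are illustrative assumptions, not part of this change.

// Hypothetical driver for the cudart handle API above.
#include <stdio.h>
#include <stdlib.h>
#include "gpu_info_cudart.h"

static void example_cudart_usage(void) {
  cudart_init_resp_t resp = {0};
  cudart_init("libcudart.so.12", &resp);   // assumed library name
  if (resp.err != NULL) {
    fprintf(stderr, "cudart init failed: %s\n", resp.err);
    free(resp.err);
    return;
  }
  for (int i = 0; i < resp.num_devices; i++) {
    mem_info_t info = {0};
    cudart_bootstrap(resp.ch, i, &info);   // fills total/free VRAM for device i
    if (info.err != NULL) {
      fprintf(stderr, "device %d: %s\n", i, info.err);
      free(info.err);
      continue;
    }
    printf("device %d: total=%llu free=%llu\n", i,
           (unsigned long long)info.total, (unsigned long long)info.free);
  }
  cudart_release(resp.ch);
}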
@@ -1,251 +0,0 @@
|
|||||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#include <inttypes.h>
|
|
||||||
#include "gpu_info_nvcuda.h"
|
|
||||||
|
|
||||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
|
||||||
LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
|
|
||||||
CUresult ret;
|
|
||||||
resp->err = NULL;
|
|
||||||
resp->num_devices = 0;
|
|
||||||
resp->cudaErr = CUDA_SUCCESS;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
int i;
|
|
||||||
|
|
||||||
struct lookup {
|
|
||||||
char *s;
|
|
||||||
void **p;
|
|
||||||
} l[] = {
|
|
||||||
|
|
||||||
{"cuInit", (void *)&resp->ch.cuInit},
|
|
||||||
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
|
|
||||||
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
|
|
||||||
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
|
|
||||||
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
|
|
||||||
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
|
|
||||||
{"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
|
|
||||||
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
|
|
||||||
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
|
|
||||||
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
|
|
||||||
{NULL, NULL},
|
|
||||||
};
|
|
||||||
|
|
||||||
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
|
|
||||||
if (!resp->ch.handle) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
|
|
||||||
snprintf(buf, buflen,
|
|
||||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
|
||||||
nvcuda_lib_path, msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
resp->cudaErr = -1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++) {
|
|
||||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
|
||||||
if (!*(l[i].p)) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
|
||||||
msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
resp->cudaErr = -1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG(resp->ch.verbose, "calling cuInit\n");
|
|
||||||
ret = (*resp->ch.cuInit)(0);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
resp->cudaErr = ret;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int version = 0;
|
|
||||||
resp->ch.driver_major = 0;
|
|
||||||
resp->ch.driver_minor = 0;
|
|
||||||
|
|
||||||
// Report driver version if we're in verbose mode, ignore errors
|
|
||||||
LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
|
|
||||||
ret = (*resp->ch.cuDriverGetVersion)(&version);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
|
|
||||||
} else {
|
|
||||||
LOG(resp->ch.verbose, "raw version 0x%x\n", version);
|
|
||||||
resp->ch.driver_major = version / 1000;
|
|
||||||
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
|
|
||||||
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
|
|
||||||
}
|
|
||||||
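// Worked example (illustrative value): a reported version of 12040 decodes to
// major 12040 / 1000 = 12 and minor (12040 - 12 * 1000) / 10 = 4, i.e. driver 12.4.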
|
|
||||||
LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
|
|
||||||
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
resp->cudaErr = ret;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int buflen = 256;
|
|
||||||
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
|
||||||
resp->err = NULL;
|
|
||||||
nvcudaMemory_t memInfo = {0,0};
|
|
||||||
CUresult ret;
|
|
||||||
CUdevice device = -1;
|
|
||||||
CUcontext ctx = NULL;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
|
||||||
|
|
||||||
if (h.handle == NULL) {
|
|
||||||
resp->err = strdup("cuda driver library handle isn't initialized");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuDeviceGet)(&device, i);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "cuda driver library device failed to initialize");
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int major = 0;
|
|
||||||
int minor = 0;
|
|
||||||
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
|
|
||||||
} else {
|
|
||||||
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
|
|
||||||
} else {
|
|
||||||
resp->minor = minor;
|
|
||||||
resp->major = major;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuDeviceGetUuid)(&uuid, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
|
||||||
} else {
|
|
||||||
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
|
|
||||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
|
||||||
uuid.bytes[0],
|
|
||||||
uuid.bytes[1],
|
|
||||||
uuid.bytes[2],
|
|
||||||
uuid.bytes[3],
|
|
||||||
uuid.bytes[4],
|
|
||||||
uuid.bytes[5],
|
|
||||||
uuid.bytes[6],
|
|
||||||
uuid.bytes[7],
|
|
||||||
uuid.bytes[8],
|
|
||||||
uuid.bytes[9],
|
|
||||||
uuid.bytes[10],
|
|
||||||
uuid.bytes[11],
|
|
||||||
uuid.bytes[12],
|
|
||||||
uuid.bytes[13],
|
|
||||||
uuid.bytes[14],
|
|
||||||
uuid.bytes[15]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
|
|
||||||
resp->gpu_name[0] = '\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
// To get memory we have to set (and release) a context
|
|
||||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
// Best effort on failure...
|
|
||||||
(*h.cuCtxDestroy)(ctx);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
resp->total = memInfo.total;
|
|
||||||
resp->free = memInfo.free;
|
|
||||||
|
|
||||||
LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
|
|
||||||
LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
|
|
||||||
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ret = (*h.cuCtxDestroy)(ctx);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(1, "cuda driver library failed to release device context %d", ret);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
|
|
||||||
CUresult ret;
|
|
||||||
CUcontext ctx = NULL;
|
|
||||||
CUdevice device = -1;
|
|
||||||
*free = 0;
|
|
||||||
*total = 0;
|
|
||||||
|
|
||||||
ret = (*h.cuDeviceGet)(&device, i);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(1, "cuda driver library device failed to initialize");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// To get memory we have to set (and release) a context
|
|
||||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(1, "cuda driver library failed to get device context %d", ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuMemGetInfo_v2)(free, total);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(1, "cuda driver library device memory info lookup failure %d", ret);
|
|
||||||
// Best effort on failure...
|
|
||||||
(*h.cuCtxDestroy)(ctx);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.cuCtxDestroy)(ctx);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
LOG(1, "cuda driver library failed to release device context %d", ret);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void nvcuda_release(nvcuda_handle_t h) {
|
|
||||||
LOG(h.verbose, "releasing cuda driver library\n");
|
|
||||||
UNLOAD_LIBRARY(h.handle);
|
|
||||||
// TODO and other context release logic?
|
|
||||||
h.handle = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __APPLE__
|
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
#ifndef __APPLE__
|
|
||||||
#ifndef __GPU_INFO_NVCUDA_H__
|
|
||||||
#define __GPU_INFO_NVCUDA_H__
|
|
||||||
#include "gpu_info.h"
|
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
|
||||||
typedef enum cudaError_enum {
|
|
||||||
CUDA_SUCCESS = 0,
|
|
||||||
CUDA_ERROR_INVALID_VALUE = 1,
|
|
||||||
CUDA_ERROR_OUT_OF_MEMORY = 2,
|
|
||||||
CUDA_ERROR_NOT_INITIALIZED = 3,
|
|
||||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
|
||||||
CUDA_ERROR_NO_DEVICE = 100,
|
|
||||||
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
|
|
||||||
CUDA_ERROR_UNKNOWN = 999,
|
|
||||||
// Other values omitted for now...
|
|
||||||
} CUresult;
|
|
||||||
|
|
||||||
typedef enum CUdevice_attribute_enum {
|
|
||||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
|
|
||||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
|
|
||||||
|
|
||||||
// TODO - not yet wired up but may be useful for Jetson or other
|
|
||||||
// integrated GPU scenarios with shared memory
|
|
||||||
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
|
|
||||||
|
|
||||||
} CUdevice_attribute;
|
|
||||||
|
|
||||||
typedef void *nvcudaDevice_t; // Opaque is sufficient
|
|
||||||
typedef struct nvcudaMemory_st {
|
|
||||||
uint64_t total;
|
|
||||||
uint64_t free;
|
|
||||||
} nvcudaMemory_t;
|
|
||||||
|
|
||||||
typedef struct nvcudaDriverVersion {
|
|
||||||
int major;
|
|
||||||
int minor;
|
|
||||||
} nvcudaDriverVersion_t;
|
|
||||||
|
|
||||||
typedef struct CUuuid_st {
|
|
||||||
unsigned char bytes[16];
|
|
||||||
} CUuuid;
|
|
||||||
|
|
||||||
typedef int CUdevice;
|
|
||||||
typedef void* CUcontext;
|
|
||||||
|
|
||||||
typedef struct nvcuda_handle {
|
|
||||||
void *handle;
|
|
||||||
uint16_t verbose;
|
|
||||||
int driver_major;
|
|
||||||
int driver_minor;
|
|
||||||
CUresult (*cuInit)(unsigned int Flags);
|
|
||||||
CUresult (*cuDriverGetVersion)(int *driverVersion);
|
|
||||||
CUresult (*cuDeviceGetCount)(int *);
|
|
||||||
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
|
|
||||||
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
|
|
||||||
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
|
|
||||||
CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
|
|
||||||
|
|
||||||
// Context specific aspects
|
|
||||||
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
|
|
||||||
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
|
|
||||||
CUresult (*cuCtxDestroy)(CUcontext ctx);
|
|
||||||
} nvcuda_handle_t;
|
|
||||||
|
|
||||||
typedef struct nvcuda_init_resp {
|
|
||||||
char *err; // If err is non-null handle is invalid
|
|
||||||
nvcuda_handle_t ch;
|
|
||||||
int num_devices;
|
|
||||||
CUresult cudaErr;
|
|
||||||
} nvcuda_init_resp_t;
|
|
||||||
|
|
||||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
|
|
||||||
void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
|
|
||||||
void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
|
|
||||||
void nvcuda_release(nvcuda_handle_t ch);
|
|
||||||
|
|
||||||
#endif // __GPU_INFO_NVCUDA_H__
|
|
||||||
#endif // __APPLE__
|
|
||||||
@@ -1,104 +0,0 @@
|
|||||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "gpu_info_nvml.h"
|
|
||||||
|
|
||||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
|
||||||
nvmlReturn_t ret;
|
|
||||||
resp->err = NULL;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
int i;
|
|
||||||
|
|
||||||
struct lookup {
|
|
||||||
char *s;
|
|
||||||
void **p;
|
|
||||||
} l[] = {
|
|
||||||
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
|
|
||||||
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
|
|
||||||
{"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
|
|
||||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
|
|
||||||
{NULL, NULL},
|
|
||||||
};
|
|
||||||
|
|
||||||
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
|
|
||||||
if (!resp->ch.handle) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
|
|
||||||
snprintf(buf, buflen,
|
|
||||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
|
||||||
nvml_lib_path, msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
|
||||||
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
|
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++) {
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
|
||||||
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
|
|
||||||
|
|
||||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
|
||||||
if (!*(l[i].p)) {
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
|
||||||
msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*resp->ch.nvmlInit_v2)();
|
|
||||||
if (ret != NVML_SUCCESS) {
|
|
||||||
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
|
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
|
||||||
resp->ch.handle = NULL;
|
|
||||||
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
|
|
||||||
nvmlDevice_t device;
|
|
||||||
nvmlMemory_t memInfo = {0};
|
|
||||||
nvmlReturn_t ret;
|
|
||||||
ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
|
|
||||||
if (ret != NVML_SUCCESS) {
|
|
||||||
LOG(1, "unable to get device handle %s: %d", uuid, ret);
|
|
||||||
*free = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
|
|
||||||
if (ret != NVML_SUCCESS) {
|
|
||||||
LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
|
|
||||||
*free = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
*free = memInfo.free;
|
|
||||||
*total = memInfo.total;
|
|
||||||
*used = memInfo.used;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void nvml_release(nvml_handle_t h) {
|
|
||||||
LOG(h.verbose, "releasing nvml library\n");
|
|
||||||
nvmlReturn_t ret;
|
|
||||||
ret = (*h.nvmlShutdown)();
|
|
||||||
if (ret != NVML_SUCCESS) {
|
|
||||||
LOG(1, "error during nvmlShutdown %d", ret);
|
|
||||||
}
|
|
||||||
UNLOAD_LIBRARY(h.handle);
|
|
||||||
h.handle = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __APPLE__
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
#ifndef __APPLE__
|
|
||||||
#ifndef __GPU_INFO_NVML_H__
|
|
||||||
#define __GPU_INFO_NVML_H__
|
|
||||||
#include "gpu_info.h"
|
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
|
||||||
typedef enum nvmlReturn_enum {
|
|
||||||
NVML_SUCCESS = 0,
|
|
||||||
// Other values omitted for now...
|
|
||||||
} nvmlReturn_t;
|
|
||||||
typedef void *nvmlDevice_t; // Opaque is sufficient
|
|
||||||
typedef struct nvmlMemory_st {
|
|
||||||
unsigned long long total;
|
|
||||||
unsigned long long free;
|
|
||||||
unsigned long long used;
|
|
||||||
} nvmlMemory_t;
|
|
||||||
|
|
||||||
typedef enum nvmlBrandType_enum
|
|
||||||
{
|
|
||||||
NVML_BRAND_UNKNOWN = 0,
|
|
||||||
} nvmlBrandType_t;
|
|
||||||
|
|
||||||
typedef struct nvml_handle {
|
|
||||||
void *handle;
|
|
||||||
uint16_t verbose;
|
|
||||||
nvmlReturn_t (*nvmlInit_v2)(void);
|
|
||||||
nvmlReturn_t (*nvmlShutdown)(void);
|
|
||||||
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
|
|
||||||
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
|
|
||||||
} nvml_handle_t;
|
|
||||||
|
|
||||||
typedef struct nvml_init_resp {
|
|
||||||
char *err; // If err is non-null handle is invalid
|
|
||||||
nvml_handle_t ch;
|
|
||||||
} nvml_init_resp_t;
|
|
||||||
|
|
||||||
typedef struct nvml_compute_capability {
|
|
||||||
char *err;
|
|
||||||
int major;
|
|
||||||
int minor;
|
|
||||||
} nvml_compute_capability_t;
|
|
||||||
|
|
||||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
|
||||||
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
|
|
||||||
void nvml_release(nvml_handle_t ch);
|
|
||||||
|
|
||||||
#endif // __GPU_INFO_NVML_H__
|
|
||||||
#endif // __APPLE__
|
|
||||||
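Unlike the cudart and nvcuda paths, this NVML handle looks devices up by UUID string rather than by index. A hedged sketch of how nvml_init, nvml_get_free, and nvml_release fit together; the library name and the UUID value are placeholders, not values from this change.

// Hypothetical usage of the NVML handle API above.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "gpu_info_nvml.h"

static void example_nvml_usage(char *uuid) {
  nvml_init_resp_t resp = {0};
  nvml_init("libnvidia-ml.so.1", &resp);   // assumed library name
  if (resp.err != NULL) {
    fprintf(stderr, "nvml init failed: %s\n", resp.err);
    free(resp.err);
    return;
  }
  uint64_t free_mem = 0, total_mem = 0, used_mem = 0;
  nvml_get_free(resp.ch, uuid, &free_mem, &total_mem, &used_mem);
  printf("%s: free=%llu used=%llu total=%llu\n", uuid,
         (unsigned long long)free_mem, (unsigned long long)used_mem,
         (unsigned long long)total_mem);
  nvml_release(resp.ch);
}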
@@ -1,259 +0,0 @@
|
|||||||
#ifndef __APPLE__
|
|
||||||
|
|
||||||
#include "gpu_info_oneapi.h"
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
|
|
||||||
ze_result_t ret;
|
|
||||||
resp->err = NULL;
|
|
||||||
resp->oh.devices = NULL;
|
|
||||||
resp->oh.num_devices = NULL;
|
|
||||||
resp->oh.drivers = NULL;
|
|
||||||
resp->oh.num_drivers = 0;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
int i, d;
|
|
||||||
struct lookup {
|
|
||||||
char *s;
|
|
||||||
void **p;
|
|
||||||
} l[] = {
|
|
||||||
{"zesInit", (void *)&resp->oh.zesInit},
|
|
||||||
{"zesDriverGet", (void *)&resp->oh.zesDriverGet},
|
|
||||||
{"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
|
|
||||||
{"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
|
|
||||||
{"zesDeviceEnumMemoryModules",
|
|
||||||
(void *)&resp->oh.zesDeviceEnumMemoryModules},
|
|
||||||
{"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
|
|
||||||
{"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
|
|
||||||
{NULL, NULL},
|
|
||||||
};
|
|
||||||
|
|
||||||
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
|
|
||||||
if (!resp->oh.handle) {
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
snprintf(buf, buflen,
|
|
||||||
"Unable to load %s library to query for Intel GPUs: %s\n",
|
|
||||||
oneapi_lib_path, msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
|
||||||
LOG(resp->oh.verbose,
|
|
||||||
"wiring Level-Zero management library functions in %s\n",
|
|
||||||
oneapi_lib_path);
|
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++) {
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
|
||||||
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
|
|
||||||
|
|
||||||
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
|
|
||||||
if (!*(l[i].p)) {
|
|
||||||
resp->oh.handle = NULL;
|
|
||||||
char *msg = LOAD_ERR();
|
|
||||||
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
|
|
||||||
UNLOAD_LIBRARY(resp->oh.handle);
|
|
||||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
|
|
||||||
free(msg);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG(resp->oh.verbose, "calling zesInit\n");
|
|
||||||
|
|
||||||
ret = (*resp->oh.zesInit)(0);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
|
|
||||||
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
oneapi_release(resp->oh);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG(resp->oh.verbose, "calling zesDriverGet\n");
|
|
||||||
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
|
||||||
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
oneapi_release(resp->oh);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
|
|
||||||
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
|
|
||||||
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
|
|
||||||
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
|
|
||||||
resp->oh.devices =
|
|
||||||
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
|
|
||||||
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
|
||||||
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
oneapi_release(resp->oh);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (d = 0; d < resp->oh.num_drivers; d++) {
|
|
||||||
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
|
|
||||||
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
|
|
||||||
&resp->oh.num_devices[d], NULL);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
|
||||||
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
oneapi_release(resp->oh);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
resp->oh.devices[d] =
|
|
||||||
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
|
|
||||||
ret = (*resp->oh.zesDeviceGet)(
|
|
||||||
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
|
||||||
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
oneapi_release(resp->oh);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
|
|
||||||
mem_info_t *resp) {
|
|
||||||
ze_result_t ret;
|
|
||||||
resp->err = NULL;
|
|
||||||
uint64_t totalMem = 0;
|
|
||||||
uint64_t usedMem = 0;
|
|
||||||
const int buflen = 256;
|
|
||||||
char buf[buflen + 1];
|
|
||||||
int i, d, m;
|
|
||||||
|
|
||||||
if (h.handle == NULL) {
|
|
||||||
resp->err = strdup("Level-Zero handle not initialized");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (driver > h.num_drivers || device > h.num_devices[driver]) {
|
|
||||||
resp->err = strdup("driver or device index out of bounds");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
resp->total = 0;
|
|
||||||
resp->free = 0;
|
|
||||||
|
|
||||||
zes_device_ext_properties_t ext_props;
|
|
||||||
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
|
|
||||||
ext_props.pNext = NULL;
|
|
||||||
|
|
||||||
zes_device_properties_t props;
|
|
||||||
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
|
||||||
props.pNext = &ext_props;
|
|
||||||
|
|
||||||
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "unable to get device properties: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
|
|
||||||
|
|
||||||
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
|
|
||||||
// (this is probably wrong...)
|
|
||||||
// TODO - the driver isn't included - what if there are multiple drivers?
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
|
|
||||||
|
|
||||||
if (h.verbose) {
|
|
||||||
// When in verbose mode, report more information about
|
|
||||||
// the card we discover.
|
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
|
|
||||||
props.modelName);
|
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
|
|
||||||
props.brandName);
|
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
|
|
||||||
props.vendorName);
|
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
|
|
||||||
props.serialNumber);
|
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
|
|
||||||
props.boardNumber);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO
|
|
||||||
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
|
|
||||||
|
|
||||||
uint32_t memCount = 0;
|
|
||||||
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
|
|
||||||
NULL);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
|
|
||||||
ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
|
|
||||||
|
|
||||||
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
|
|
||||||
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
|
|
||||||
|
|
||||||
for (m = 0; m < memCount; m++) {
|
|
||||||
zes_mem_state_t state;
|
|
||||||
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
|
|
||||||
state.pNext = NULL;
|
|
||||||
ret = (*h.zesMemoryGetState)(mems[m], &state);
|
|
||||||
if (ret != ZE_RESULT_SUCCESS) {
|
|
||||||
snprintf(buf, buflen, "unable to get memory state: %x", ret);
|
|
||||||
resp->err = strdup(buf);
|
|
||||||
free(mems);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
resp->total += state.size;
|
|
||||||
resp->free += state.free;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(mems);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oneapi_release(oneapi_handle_t h) {
|
|
||||||
int d;
|
|
||||||
LOG(h.verbose, "releasing oneapi library\n");
|
|
||||||
for (d = 0; d < h.num_drivers; d++) {
|
|
||||||
if (h.devices != NULL && h.devices[d] != NULL) {
|
|
||||||
free(h.devices[d]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (h.devices != NULL) {
|
|
||||||
free(h.devices);
|
|
||||||
h.devices = NULL;
|
|
||||||
}
|
|
||||||
if (h.num_devices != NULL) {
|
|
||||||
free(h.num_devices);
|
|
||||||
h.num_devices = NULL;
|
|
||||||
}
|
|
||||||
if (h.drivers != NULL) {
|
|
||||||
free(h.drivers);
|
|
||||||
h.drivers = NULL;
|
|
||||||
}
|
|
||||||
h.num_drivers = 0;
|
|
||||||
UNLOAD_LIBRARY(h.handle);
|
|
||||||
h.handle = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
|
|
||||||
if (h.handle == NULL || h.num_devices == NULL) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (driver > h.num_drivers) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return (int)h.num_devices[driver];
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __APPLE__
|
|
||||||
@@ -1,203 +0,0 @@
|
|||||||
#ifndef __APPLE__
|
|
||||||
#ifndef __GPU_INFO_ONEAPI_H__
|
|
||||||
#define __GPU_INFO_ONEAPI_H__
|
|
||||||
#include "gpu_info.h"
|
|
||||||
|
|
||||||
#define ZE_MAX_DEVICE_NAME 256
|
|
||||||
#define ZE_MAX_DEVICE_UUID_SIZE 16
|
|
||||||
#define ZES_STRING_PROPERTY_SIZE 64
|
|
||||||
#define ZE_BIT(_i) (1 << _i)
|
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
|
||||||
typedef enum ze_result_t {
|
|
||||||
ZE_RESULT_SUCCESS = 0,
|
|
||||||
// Other values omitted for now...
|
|
||||||
} ze_result_t;
|
|
||||||
|
|
||||||
typedef uint8_t ze_bool_t;
|
|
||||||
typedef struct _zes_driver_handle_t *zes_driver_handle_t;
|
|
||||||
typedef struct _zes_device_handle_t *zes_device_handle_t;
|
|
||||||
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
|
|
||||||
|
|
||||||
typedef enum _ze_structure_type_t {
|
|
||||||
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} ze_structure_type_t;
|
|
||||||
|
|
||||||
typedef enum _zes_structure_type_t {
|
|
||||||
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
|
|
||||||
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
|
|
||||||
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
|
|
||||||
ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
|
|
||||||
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_structure_type_t;
|
|
||||||
|
|
||||||
typedef enum _zes_mem_type_t {
|
|
||||||
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_mem_type_t;
|
|
||||||
|
|
||||||
typedef enum _zes_mem_loc_t {
|
|
||||||
ZES_MEM_LOC_SYSTEM = 0,
|
|
||||||
ZES_MEM_LOC_DEVICE = 1,
|
|
||||||
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_mem_loc_t;
|
|
||||||
|
|
||||||
typedef enum _zes_mem_health_t {
|
|
||||||
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_mem_health_t;
|
|
||||||
|
|
||||||
typedef struct _ze_device_uuid_t {
|
|
||||||
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
|
||||||
} ze_device_uuid_t;
|
|
||||||
|
|
||||||
typedef struct _zes_uuid_t {
|
|
||||||
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
|
||||||
} zes_uuid_t;
|
|
||||||
|
|
||||||
typedef enum _ze_device_type_t {
|
|
||||||
ZE_DEVICE_TYPE_GPU = 1,
|
|
||||||
ZE_DEVICE_TYPE_CPU = 2,
|
|
||||||
ZE_DEVICE_TYPE_FPGA = 3,
|
|
||||||
ZE_DEVICE_TYPE_MCA = 4,
|
|
||||||
ZE_DEVICE_TYPE_VPU = 5,
|
|
||||||
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} ze_device_type_t;
|
|
||||||
|
|
||||||
typedef enum _zes_device_type_t {
|
|
||||||
ZES_DEVICE_TYPE_GPU = 1,
|
|
||||||
ZES_DEVICE_TYPE_CPU = 2,
|
|
||||||
ZES_DEVICE_TYPE_FPGA = 3,
|
|
||||||
ZES_DEVICE_TYPE_MCA = 4,
|
|
||||||
ZES_DEVICE_TYPE_VPU = 5,
|
|
||||||
ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_device_type_t;
|
|
||||||
|
|
||||||
typedef uint32_t ze_device_property_flags_t;
|
|
||||||
typedef enum _ze_device_property_flag_t {
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} ze_device_property_flag_t;
|
|
||||||
|
|
||||||
typedef uint32_t zes_device_property_flags_t;
|
|
||||||
typedef enum _zes_device_property_flag_t {
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
|
|
||||||
} zes_device_property_flag_t;
|
|
||||||
|
|
||||||
typedef struct _ze_device_properties_t {
|
|
||||||
ze_structure_type_t stype;
|
|
||||||
void *pNext;
|
|
||||||
ze_device_type_t type;
|
|
||||||
uint32_t vendorId;
|
|
||||||
uint32_t deviceId;
|
|
||||||
ze_device_property_flags_t flags;
|
|
||||||
uint32_t subdeviceId;
|
|
||||||
uint32_t coreClockRate;
|
|
||||||
uint64_t maxMemAllocSize;
|
|
||||||
uint32_t maxHardwareContexts;
|
|
||||||
uint32_t maxCommandQueuePriority;
|
|
||||||
uint32_t numThreadsPerEU;
|
|
||||||
uint32_t physicalEUSimdWidth;
|
|
||||||
uint32_t numEUsPerSubslice;
|
|
||||||
uint32_t numSubslicesPerSlice;
|
|
||||||
uint32_t numSlices;
|
|
||||||
uint64_t timerResolution;
|
|
||||||
uint32_t timestampValidBits;
|
|
||||||
uint32_t kernelTimestampValidBits;
|
|
||||||
ze_device_uuid_t uuid;
|
|
||||||
char name[ZE_MAX_DEVICE_NAME];
|
|
||||||
} ze_device_properties_t;
|
|
||||||
|
|
||||||
typedef struct _zes_device_properties_t {
|
|
||||||
zes_structure_type_t stype;
|
|
||||||
void *pNext;
|
|
||||||
ze_device_properties_t core;
|
|
||||||
uint32_t numSubdevices;
|
|
||||||
char serialNumber[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
char boardNumber[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
char brandName[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
char modelName[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
char vendorName[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
char driverVersion[ZES_STRING_PROPERTY_SIZE];
|
|
||||||
} zes_device_properties_t;
|
|
||||||
|
|
||||||
typedef struct _zes_device_ext_properties_t {
|
|
||||||
zes_structure_type_t stype;
|
|
||||||
void *pNext;
|
|
||||||
zes_uuid_t uuid;
|
|
||||||
zes_device_type_t type;
|
|
||||||
zes_device_property_flags_t flags;
|
|
||||||
} zes_device_ext_properties_t;
|
|
||||||
|
|
||||||
typedef struct _zes_mem_properties_t {
|
|
||||||
zes_structure_type_t stype;
|
|
||||||
void *pNext;
|
|
||||||
zes_mem_type_t type;
|
|
||||||
ze_bool_t onSubdevice;
|
|
||||||
uint32_t subdeviceId;
|
|
||||||
zes_mem_loc_t location;
|
|
||||||
uint64_t physicalSize;
|
|
||||||
int32_t busWidth;
|
|
||||||
int32_t numChannels;
|
|
||||||
} zes_mem_properties_t;
|
|
||||||
|
|
||||||
typedef struct _zes_mem_state_t {
|
|
||||||
zes_structure_type_t stype;
|
|
||||||
const void *pNext;
|
|
||||||
zes_mem_health_t health;
|
|
||||||
uint64_t free;
|
|
||||||
uint64_t size;
|
|
||||||
} zes_mem_state_t;
|
|
||||||
|
|
||||||
typedef struct oneapi_handle {
|
|
||||||
void *handle;
|
|
||||||
uint16_t verbose;
|
|
||||||
|
|
||||||
uint32_t num_drivers;
|
|
||||||
zes_driver_handle_t *drivers;
|
|
||||||
uint32_t *num_devices;
|
|
||||||
zes_device_handle_t **devices;
|
|
||||||
|
|
||||||
// TODO Driver major, minor information
|
|
||||||
// int driver_major;
|
|
||||||
// int driver_minor;
|
|
||||||
|
|
||||||
ze_result_t (*zesInit)(int);
|
|
||||||
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
|
|
||||||
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
|
|
||||||
zes_device_handle_t *phDevices);
|
|
||||||
ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
|
|
||||||
zes_device_properties_t *pProperties);
|
|
||||||
ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
|
|
||||||
uint32_t *pCount,
|
|
||||||
zes_mem_handle_t *phMemory);
|
|
||||||
ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
|
|
||||||
zes_mem_properties_t *pProperties);
|
|
||||||
ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
|
|
||||||
zes_mem_state_t *pState);
|
|
||||||
|
|
||||||
} oneapi_handle_t;
|
|
||||||
|
|
||||||
typedef struct oneapi_init_resp {
|
|
||||||
char *err; // If err is non-null handle is invalid
|
|
||||||
oneapi_handle_t oh;
|
|
||||||
} oneapi_init_resp_t;
|
|
||||||
|
|
||||||
typedef struct oneapi_version_resp {
|
|
||||||
ze_result_t status;
|
|
||||||
char *str; // Contains version or error string if status != 0
|
|
||||||
} oneapi_version_resp_t;
|
|
||||||
|
|
||||||
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
|
|
||||||
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
|
|
||||||
mem_info_t *resp);
|
|
||||||
void oneapi_release(oneapi_handle_t h);
|
|
||||||
int oneapi_get_device_count(oneapi_handle_t h, int driver);
|
|
||||||
|
|
||||||
#endif // __GPU_INFO_ONEAPI_H__
|
|
||||||
#endif // __APPLE__
|
|
||||||
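The oneAPI handle keeps a per-driver array of Level-Zero device handles, so callers iterate drivers first and then devices within each driver. A minimal sketch under that assumption; the library name is a placeholder.

// Hypothetical walk over the driver/device arrays populated by oneapi_init.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "gpu_info_oneapi.h"

static void example_oneapi_usage(void) {
  oneapi_init_resp_t resp = {0};
  oneapi_init("libze_intel_gpu.so.1", &resp);   // assumed library name
  if (resp.err != NULL) {
    fprintf(stderr, "oneapi init failed: %s\n", resp.err);
    free(resp.err);
    return;
  }
  for (uint32_t d = 0; d < resp.oh.num_drivers; d++) {
    int count = oneapi_get_device_count(resp.oh, (int)d);
    for (int i = 0; i < count; i++) {
      mem_info_t info = {0};
      oneapi_check_vram(resp.oh, (int)d, i, &info);
      if (info.err != NULL) {
        fprintf(stderr, "[%u:%d] %s\n", d, i, info.err);
        free(info.err);
        continue;
      }
      printf("[%u:%d] total=%llu free=%llu\n", d, i,
             (unsigned long long)info.total, (unsigned long long)info.free);
    }
  }
  oneapi_release(resp.oh);
}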
@@ -1,60 +0,0 @@
|
|||||||
package discover
|
|
||||||
|
|
||||||
import (
|
|
||||||
"runtime"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
|
||||||
"github.com/stretchr/testify/require"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestBasicGetGPUInfo(t *testing.T) {
|
|
||||||
info := GetGPUInfo()
|
|
||||||
assert.NotEmpty(t, len(info))
|
|
||||||
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
|
|
||||||
if info[0].Library != "cpu" {
|
|
||||||
assert.Greater(t, info[0].TotalMemory, uint64(0))
|
|
||||||
assert.Greater(t, info[0].FreeMemory, uint64(0))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestCPUMemInfo(t *testing.T) {
|
|
||||||
info, err := GetCPUMem()
|
|
||||||
require.NoError(t, err)
|
|
||||||
switch runtime.GOOS {
|
|
||||||
case "darwin":
|
|
||||||
t.Skip("CPU memory not populated on darwin")
|
|
||||||
case "linux", "windows":
|
|
||||||
assert.Greater(t, info.TotalMemory, uint64(0))
|
|
||||||
assert.Greater(t, info.FreeMemory, uint64(0))
|
|
||||||
default:
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestByLibrary(t *testing.T) {
|
|
||||||
type testCase struct {
|
|
||||||
input []GpuInfo
|
|
||||||
expect int
|
|
||||||
}
|
|
||||||
|
|
||||||
testCases := map[string]*testCase{
|
|
||||||
"empty": {input: []GpuInfo{}, expect: 0},
|
|
||||||
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
|
|
||||||
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
|
|
||||||
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
|
|
||||||
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
|
|
||||||
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
|
|
||||||
}
|
|
||||||
|
|
||||||
for k, v := range testCases {
|
|
||||||
t.Run(k, func(t *testing.T) {
|
|
||||||
resp := (GpuInfoList)(v.input).ByLibrary()
|
|
||||||
if len(resp) != v.expect {
|
|
||||||
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
|
|
||||||
501
discover/runner.go
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
package discover
|
||||||
|
|
||||||
|
// Runner based GPU discovery
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/envconfig"
|
||||||
|
"github.com/ollama/ollama/format"
|
||||||
|
"github.com/ollama/ollama/llm"
|
||||||
|
"github.com/ollama/ollama/logutil"
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
deviceMu sync.Mutex
|
||||||
|
devices []ml.DeviceInfo
|
||||||
|
libDirs map[string]struct{}
|
||||||
|
rocmDir string
|
||||||
|
exe string
|
||||||
|
bootstrapped bool
|
||||||
|
)
|
||||||
|
|
||||||
|
func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
|
||||||
|
deviceMu.Lock()
|
||||||
|
defer deviceMu.Unlock()
|
||||||
|
startDiscovery := time.Now()
|
||||||
|
msg := "overall device VRAM discovery took"
|
||||||
|
defer func() {
|
||||||
|
slog.Debug(msg, "duration", time.Since(startDiscovery))
|
||||||
|
}()
|
||||||
|
|
||||||
|
if !bootstrapped {
|
||||||
|
msg = "GPU bootstrap discovery took"
|
||||||
|
libDirs = make(map[string]struct{})
|
||||||
|
var err error
|
||||||
|
exe, err = os.Executable()
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("unable to lookup executable path", "error", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
||||||
|
exe = eval
|
||||||
|
}
|
||||||
|
files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("unable to lookup runner library directories", "error", err)
|
||||||
|
}
|
||||||
|
for _, file := range files {
|
||||||
|
libDirs[filepath.Dir(file)] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Our current packaging model places ggml-hip in the main directory
|
||||||
|
// but keeps rocm in an isolated directory. We have to add it to
|
||||||
|
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
|
||||||
|
rocmDir = filepath.Join(LibOllamaPath, "rocm")
|
||||||
|
if _, err := os.Stat(rocmDir); err != nil {
|
||||||
|
rocmDir = ""
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(libDirs) == 0 {
|
||||||
|
libDirs[""] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("discovering available GPUs...")
|
||||||
|
requested := envconfig.LLMLibrary()
|
||||||
|
jetpack := cudaJetpack()
|
||||||
|
|
||||||
|
// For our initial discovery pass, we gather all the known GPUs through
|
||||||
|
// all the libraries that were detected. This pass may include GPUs that
|
||||||
|
// are enumerated, but not actually supported.
|
||||||
|
// We run this in serial to avoid potentially initializing a GPU multiple
|
||||||
|
// times concurrently leading to memory contention
|
||||||
|
// TODO refactor so we group the lib dirs and do serial per version, but parallel for different libs
|
||||||
|
for dir := range libDirs {
|
||||||
|
bootstrapTimeout := 30 * time.Second
|
||||||
|
var dirs []string
|
||||||
|
if dir != "" {
|
||||||
|
if requested != "" && filepath.Base(dir) != requested {
|
||||||
|
slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
|
||||||
|
continue
|
||||||
|
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if dir == "" {
|
||||||
|
dirs = []string{LibOllamaPath}
|
||||||
|
} else {
|
||||||
|
dirs = []string{LibOllamaPath, dir}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ROCm can take a long time on some systems, so give it more time before giving up
|
||||||
|
if dir != "" && strings.Contains(filepath.Base(dir), "rocm") {
|
||||||
|
bootstrapTimeout = 60 * time.Second
|
||||||
|
}
|
||||||
|
// Typically bootstrapping takes < 1s, but on some systems, with devices
|
||||||
|
// in low power/idle mode, initialization can take multiple seconds. We
|
||||||
|
// set a long timeout just for bootstrap discovery to reduce the chance
|
||||||
|
// of giving up too quickly
|
||||||
|
ctx1stPass, cancel := context.WithTimeout(ctx, bootstrapTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// For this pass, we retain duplicates in case any are incompatible with some libraries
|
||||||
|
devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// In the second pass, we more deeply initialize the GPUs to weed out devices that
|
||||||
|
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
|
||||||
|
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
|
||||||
|
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
needsDelete := make([]bool, len(devices))
|
||||||
|
supportedMu := sync.Mutex{}
|
||||||
|
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
|
||||||
|
for i := range devices {
|
||||||
|
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
|
||||||
|
if devices[i].Library == "Metal" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||||
|
wg.Add(1)
|
||||||
|
go func(i int) {
|
||||||
|
defer wg.Done()
|
||||||
|
var envVar string
|
||||||
|
id := devices[i].ID
|
||||||
|
if devices[i].Library == "ROCm" {
|
||||||
|
if runtime.GOOS != "linux" {
|
||||||
|
envVar = "HIP_VISIBLE_DEVICES"
|
||||||
|
} else {
|
||||||
|
envVar = "ROCR_VISIBLE_DEVICES"
|
||||||
|
}
|
||||||
|
} else if devices[i].Library == "CUDA" {
|
||||||
|
envVar = "CUDA_VISIBLE_DEVICES"
|
||||||
|
} else if devices[i].Library == "Vulkan" {
|
||||||
|
id = devices[i].FilteredID
|
||||||
|
envVar = "GGML_VK_VISIBLE_DEVICES"
|
||||||
|
} else {
|
||||||
|
slog.Error("Unknown Library:" + devices[i].Library)
|
||||||
|
}
|
||||||
|
|
||||||
|
extraEnvs := map[string]string{
|
||||||
|
"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
|
||||||
|
envVar: id, // Filter to just this one GPU
|
||||||
|
}
|
||||||
|
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
|
||||||
|
slog.Debug("filtering device which didn't fully initialize",
|
||||||
|
"id", devices[i].ID,
|
||||||
|
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
|
||||||
|
"pci_id", devices[i].PCIID,
|
||||||
|
"library", devices[i].Library,
|
||||||
|
)
|
||||||
|
needsDelete[i] = true
|
||||||
|
} else {
|
||||||
|
supportedMu.Lock()
|
||||||
|
if _, ok := supported[devices[i].Library]; !ok {
|
||||||
|
supported[devices[i].Library] = make(map[string]map[string]int)
|
||||||
|
}
|
||||||
|
if _, ok := supported[devices[i].Library][libDir]; !ok {
|
||||||
|
supported[devices[i].Library][libDir] = make(map[string]int)
|
||||||
|
}
|
||||||
|
supported[devices[i].Library][libDir][devices[i].ID] = i
|
||||||
|
supportedMu.Unlock()
|
||||||
|
}
|
||||||
|
}(i)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
logutil.Trace("supported GPU library combinations before filtering", "supported", supported)
|
||||||
|
|
||||||
|
filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
|
||||||
|
|
||||||
|
// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
|
||||||
|
filterOverlapByLibrary(supported, needsDelete)
|
||||||
|
|
||||||
|
// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
|
||||||
|
rocmID := 0
|
||||||
|
for i := 0; i < len(needsDelete); i++ {
|
||||||
|
if needsDelete[i] {
|
||||||
|
logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
|
||||||
|
devices = append(devices[:i], devices[i+1:]...)
|
||||||
|
needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
|
||||||
|
i--
|
||||||
|
} else if devices[i].Library == "ROCm" {
|
||||||
|
if _, err := strconv.Atoi(devices[i].ID); err == nil {
|
||||||
|
// Replace the numeric ID with the post-filtered IDs
|
||||||
|
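// Hypothetical example: if ROCm devices were enumerated as 0, 1, 2 and device 1
// was filtered out above, the survivors are renumbered 0 and 1, while FilteredID
// preserves the original "0" and "2" (presumably so visible-device environment
// filtering still targets the right hardware).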
devices[i].FilteredID = devices[i].ID
|
||||||
|
devices[i].ID = strconv.Itoa(rocmID)
|
||||||
|
}
|
||||||
|
rocmID++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now filter out any overlap with different libraries (favor CUDA/HIP over others)
|
||||||
|
for i := 0; i < len(devices); i++ {
|
||||||
|
for j := i + 1; j < len(devices); j++ {
|
||||||
|
// For this pass, we only drop exact duplicates
|
||||||
|
switch devices[i].Compare(devices[j]) {
|
||||||
|
case ml.SameBackendDevice:
|
||||||
|
// Same library and device, skip it
|
||||||
|
devices = append(devices[:j], devices[j+1:]...)
|
||||||
|
j--
|
||||||
|
continue
|
||||||
|
case ml.DuplicateDevice:
|
||||||
|
// Different library, choose based on priority
|
||||||
|
var droppedDevice ml.DeviceInfo
|
||||||
|
if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
|
||||||
|
droppedDevice = devices[j]
|
||||||
|
} else {
|
||||||
|
droppedDevice = devices[i]
|
||||||
|
devices[i] = devices[j]
|
||||||
|
}
|
||||||
|
devices = append(devices[:j], devices[j+1:]...)
|
||||||
|
j--
|
||||||
|
|
||||||
|
typeStr := "discrete"
|
||||||
|
if droppedDevice.Integrated {
|
||||||
|
typeStr = "iGPU"
|
||||||
|
}
|
||||||
|
slog.Debug("dropping duplicate device",
|
||||||
|
"id", droppedDevice.ID,
|
||||||
|
"library", droppedDevice.Library,
|
||||||
|
"compute", droppedDevice.Compute(),
|
||||||
|
"name", droppedDevice.Name,
|
||||||
|
"description", droppedDevice.Description,
|
||||||
|
"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
|
||||||
|
"driver", droppedDevice.Driver(),
|
||||||
|
"pci_id", droppedDevice.PCIID,
|
||||||
|
"type", typeStr,
|
||||||
|
"total", format.HumanBytes2(droppedDevice.TotalMemory),
|
||||||
|
"available", format.HumanBytes2(droppedDevice.FreeMemory),
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset the libDirs to what we actually wind up using for future refreshes
|
||||||
|
libDirs = make(map[string]struct{})
|
||||||
|
for _, dev := range devices {
|
||||||
|
dir := dev.LibraryPath[len(dev.LibraryPath)-1]
|
||||||
|
if dir != LibOllamaPath {
|
||||||
|
libDirs[dir] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(libDirs) == 0 {
|
||||||
|
libDirs[""] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
bootstrapped = true
|
||||||
|
} else {
|
||||||
|
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||||
|
// metal never updates free VRAM
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("refreshing free memory")
|
||||||
|
updated := make([]bool, len(devices))
|
||||||
|
allDone := func() bool {
|
||||||
|
allDone := true
|
||||||
|
for _, done := range updated {
|
||||||
|
if !done {
|
||||||
|
allDone = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return allDone
|
||||||
|
}
|
||||||
|
|
||||||
|
// First try to use existing runners to refresh VRAM since they're already
|
||||||
|
// active on GPU(s)
|
||||||
|
for _, runner := range runners {
|
||||||
|
if runner == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
deviceIDs := runner.GetActiveDeviceIDs()
|
||||||
|
if len(deviceIDs) == 0 {
|
||||||
|
// Skip this runner since it doesn't have active GPU devices
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check to see if this runner is active on any devices that need a refresh
|
||||||
|
skip := true
|
||||||
|
devCheck:
|
||||||
|
for _, dev := range deviceIDs {
|
||||||
|
for i := range devices {
|
||||||
|
if dev == devices[i].DeviceID {
|
||||||
|
if !updated[i] {
|
||||||
|
skip = false
|
||||||
|
break devCheck
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Typical refresh on existing runner is ~500ms but allow longer if the system
|
||||||
|
// is under stress before giving up and using stale data.
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
start := time.Now()
|
||||||
|
updatedDevices := runner.GetDeviceInfos(ctx)
|
||||||
|
slog.Debug("existing runner discovery took", "duration", time.Since(start))
|
||||||
|
for _, u := range updatedDevices {
|
||||||
|
for i := range devices {
|
||||||
|
if u.DeviceID == devices[i].DeviceID {
|
||||||
|
updated[i] = true
|
||||||
|
devices[i].FreeMemory = u.FreeMemory
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Short circuit if we've updated all the devices
|
||||||
|
if allDone() {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allDone() {
|
||||||
|
slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
|
||||||
|
|
||||||
|
// Bootstrapping may take longer in some cases (AMD on Windows), but we
|
||||||
|
// would rather use stale free data to get the model running sooner
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
for dir := range libDirs {
|
||||||
|
updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil)
|
||||||
|
for _, u := range updatedDevices {
|
||||||
|
for i := range devices {
|
||||||
|
if u.DeviceID == devices[i].DeviceID {
|
||||||
|
updated[i] = true
|
||||||
|
devices[i].FreeMemory = u.FreeMemory
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
|
||||||
|
}
|
||||||
|
if allDone() {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allDone() {
|
||||||
|
slog.Warn("unable to refresh free memory, using old values")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
|
||||||
|
// Filter out Vulkan devices that share a PCI ID with a non-Vulkan device that is not marked for deletion
|
||||||
|
for i := range devices {
|
||||||
|
if devices[i].Library != "Vulkan" || needsDelete[i] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devices[i].PCIID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for j := range devices {
|
||||||
|
if i == j {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devices[j].PCIID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
|
||||||
|
needsDelete[i] = true
|
||||||
|
slog.Debug("filtering device with duplicate PCI ID",
|
||||||
|
"id", devices[i].ID,
|
||||||
|
"library", devices[i].Library,
|
||||||
|
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
|
||||||
|
"pci_id", devices[i].PCIID,
|
||||||
|
"kept_id", devices[j].ID,
|
||||||
|
"kept_library", devices[j].Library,
|
||||||
|
)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
|
||||||
|
// For multi-GPU systems, use the newest version that supports all the GPUs
|
||||||
|
for _, byLibDirs := range supported {
|
||||||
|
libDirs := make([]string, 0, len(byLibDirs))
|
||||||
|
for libDir := range byLibDirs {
|
||||||
|
libDirs = append(libDirs, libDir)
|
||||||
|
}
|
||||||
|
sort.Sort(sort.Reverse(sort.StringSlice(libDirs)))
|
||||||
|
anyMissing := false
|
||||||
|
var newest string
|
||||||
|
for _, newest = range libDirs {
|
||||||
|
for _, libDir := range libDirs {
|
||||||
|
if libDir == newest {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(byLibDirs[newest]) != len(byLibDirs[libDir]) {
|
||||||
|
anyMissing = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
for dev := range byLibDirs[newest] {
|
||||||
|
if _, found := byLibDirs[libDir][dev]; !found {
|
||||||
|
anyMissing = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !anyMissing {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Now we can mark overlaps for deletion
|
||||||
|
for _, libDir := range libDirs {
|
||||||
|
if libDir == newest {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for dev, i := range byLibDirs[libDir] {
|
||||||
|
if _, found := byLibDirs[newest][dev]; found {
|
||||||
|
slog.Debug("filtering device with overlapping libraries",
|
||||||
|
"id", dev,
|
||||||
|
"library", libDir,
|
||||||
|
"delete_index", i,
|
||||||
|
"kept_library", newest,
|
||||||
|
)
|
||||||
|
needsDelete[i] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type bootstrapRunner struct {
|
||||||
|
port int
|
||||||
|
cmd *exec.Cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *bootstrapRunner) GetPort() int {
|
||||||
|
return r.port
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *bootstrapRunner) HasExited() bool {
|
||||||
|
if r.cmd != nil && r.cmd.ProcessState != nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
|
||||||
|
var out io.Writer
|
||||||
|
if envconfig.LogLevel() == logutil.LevelTrace {
|
||||||
|
out = os.Stderr
|
||||||
|
}
|
||||||
|
start := time.Now()
|
||||||
|
defer func() {
|
||||||
|
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
|
||||||
|
}()
|
||||||
|
|
||||||
|
logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
|
||||||
|
cmd, port, err := llm.StartRunner(
|
||||||
|
true, // ollama engine
|
||||||
|
"", // no model
|
||||||
|
ollamaLibDirs,
|
||||||
|
out,
|
||||||
|
extraEnvs,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("failed to start runner to discovery GPUs", "error", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
cmd.Wait() // exit status ignored
|
||||||
|
}()
|
||||||
|
|
||||||
|
defer cmd.Process.Kill()
|
||||||
|
devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
|
||||||
|
if err != nil {
|
||||||
|
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
|
||||||
|
// Expected during bootstrapping while we filter out unsupported AMD GPUs
|
||||||
|
logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
|
||||||
|
} else {
|
||||||
|
slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
|
||||||
|
|
||||||
|
return devices
|
||||||
|
}
|
||||||
discover/runner_test.go (new file, 108 lines added)
@@ -0,0 +1,108 @@
|
|||||||
|
package discover
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/app/lifecycle"
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
lifecycle.InitLogging()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFilterOverlapByLibrary(t *testing.T) {
|
||||||
|
type testcase struct {
|
||||||
|
name string
|
||||||
|
inp map[string]map[string]map[string]int
|
||||||
|
exp []bool
|
||||||
|
}
|
||||||
|
for _, tc := range []testcase{
|
||||||
|
{
|
||||||
|
name: "empty",
|
||||||
|
inp: map[string]map[string]map[string]int{},
|
||||||
|
exp: []bool{}, // needs deletion
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "single no overlap",
|
||||||
|
inp: map[string]map[string]map[string]int{
|
||||||
|
"CUDA": {
|
||||||
|
"cuda_v12": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
exp: []bool{false},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "100% overlap pick 2nd",
|
||||||
|
inp: map[string]map[string]map[string]int{
|
||||||
|
"CUDA": {
|
||||||
|
"cuda_v12": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
|
||||||
|
},
|
||||||
|
"cuda_v13": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
exp: []bool{true, true, false, false},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "100% overlap pick 1st",
|
||||||
|
inp: map[string]map[string]map[string]int{
|
||||||
|
"CUDA": {
|
||||||
|
"cuda_v13": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
|
||||||
|
},
|
||||||
|
"cuda_v12": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
exp: []bool{false, false, true, true},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "partial overlap pick older",
|
||||||
|
inp: map[string]map[string]map[string]int{
|
||||||
|
"CUDA": {
|
||||||
|
"cuda_v13": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
|
||||||
|
},
|
||||||
|
"cuda_v12": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 1,
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
exp: []bool{true, false, false},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no overlap",
|
||||||
|
inp: map[string]map[string]map[string]int{
|
||||||
|
"CUDA": {
|
||||||
|
"cuda_v13": {
|
||||||
|
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
|
||||||
|
},
|
||||||
|
"cuda_v12": {
|
||||||
|
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
exp: []bool{false, false},
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
needsDelete := make([]bool, len(tc.exp))
|
||||||
|
filterOverlapByLibrary(tc.inp, needsDelete)
|
||||||
|
for i, exp := range tc.exp {
|
||||||
|
if needsDelete[i] != exp {
|
||||||
|
t.Fatalf("expected: %v\ngot: %v", tc.exp, needsDelete)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,10 +1,13 @@
|
|||||||
package discover
|
package discover
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type memInfo struct {
|
type memInfo struct {
|
||||||
@@ -13,53 +16,6 @@ type memInfo struct {
|
|||||||
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
|
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
|
||||||
}
|
}
|
||||||
|
|
||||||
// Beginning of an `ollama info` command
|
|
||||||
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
|
|
||||||
memInfo
|
|
||||||
Library string `json:"library,omitempty"`
|
|
||||||
|
|
||||||
// Optional variant to select (e.g. versions, cpu feature flags)
|
|
||||||
Variant string `json:"variant"`
|
|
||||||
|
|
||||||
// MinimumMemory represents the minimum memory required to use the GPU
|
|
||||||
MinimumMemory uint64 `json:"-"`
|
|
||||||
|
|
||||||
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
|
|
||||||
DependencyPath []string `json:"lib_path,omitempty"`
|
|
||||||
|
|
||||||
// Extra environment variables specific to the GPU as list of [key=value]
|
|
||||||
EnvWorkarounds []string `json:"envs,omitempty"`
|
|
||||||
|
|
||||||
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
|
|
||||||
// the FreeMemory is best effort, and may over or under report actual memory usage
|
|
||||||
// False indicates FreeMemory can generally be trusted on this GPU
|
|
||||||
UnreliableFreeMemory bool
|
|
||||||
|
|
||||||
// GPU information
|
|
||||||
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
|
|
||||||
filterID int //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
|
|
||||||
Name string `json:"name"` // user friendly name if available
|
|
||||||
Compute string `json:"compute"` // Compute Capability or gfx
|
|
||||||
|
|
||||||
// Driver Information - TODO no need to put this on each GPU
|
|
||||||
DriverMajor int `json:"driver_major,omitempty"`
|
|
||||||
DriverMinor int `json:"driver_minor,omitempty"`
|
|
||||||
|
|
||||||
// TODO other performance capability info to help in scheduling decisions
|
|
||||||
}
|
|
||||||
|
|
||||||
func (gpu GpuInfo) RunnerName() string {
|
|
||||||
if gpu.Variant != "" {
|
|
||||||
return gpu.Library + "_" + gpu.Variant
|
|
||||||
}
|
|
||||||
return gpu.Library
|
|
||||||
}
|
|
||||||
|
|
||||||
type CPUInfo struct {
|
|
||||||
GpuInfo
|
|
||||||
CPUs []CPU
|
|
||||||
}
|
|
||||||
|
|
||||||
// CPU type represents a CPU Package occupying a socket
|
// CPU type represents a CPU Package occupying a socket
|
||||||
type CPU struct {
|
type CPU struct {
|
||||||
ID string `cpuinfo:"processor"`
|
ID string `cpuinfo:"processor"`
|
||||||
@@ -70,116 +26,49 @@ type CPU struct {
|
|||||||
ThreadCount int
|
ThreadCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
type CudaGPUInfo struct {
|
func LogDetails(devices []ml.DeviceInfo) {
|
||||||
GpuInfo
|
sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference
|
||||||
OSOverhead uint64 // Memory overhead between the driver library and management library
|
for _, dev := range devices {
|
||||||
index int //nolint:unused,nolintlint
|
var libs []string
|
||||||
computeMajor int //nolint:unused,nolintlint
|
for _, dir := range dev.LibraryPath {
|
||||||
computeMinor int //nolint:unused,nolintlint
|
if strings.Contains(dir, filepath.Join("lib", "ollama")) {
|
||||||
}
|
libs = append(libs, filepath.Base(dir))
|
||||||
type CudaGPUInfoList []CudaGPUInfo
|
|
||||||
|
|
||||||
type RocmGPUInfo struct {
|
|
||||||
GpuInfo
|
|
||||||
usedFilepath string //nolint:unused,nolintlint
|
|
||||||
index int //nolint:unused,nolintlint
|
|
||||||
}
|
|
||||||
type RocmGPUInfoList []RocmGPUInfo
|
|
||||||
|
|
||||||
type OneapiGPUInfo struct {
|
|
||||||
GpuInfo
|
|
||||||
driverIndex int //nolint:unused,nolintlint
|
|
||||||
gpuIndex int //nolint:unused,nolintlint
|
|
||||||
}
|
|
||||||
type OneapiGPUInfoList []OneapiGPUInfo
|
|
||||||
|
|
||||||
type GpuInfoList []GpuInfo
|
|
||||||
|
|
||||||
type UnsupportedGPUInfo struct {
|
|
||||||
GpuInfo
|
|
||||||
Reason string `json:"reason"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// Split up the set of gpu info's by Library and variant
|
|
||||||
func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
|
||||||
resp := []GpuInfoList{}
|
|
||||||
libs := []string{}
|
|
||||||
for _, info := range l {
|
|
||||||
found := false
|
|
||||||
requested := info.Library
|
|
||||||
if info.Variant != "" {
|
|
||||||
requested += "_" + info.Variant
|
|
||||||
}
|
|
||||||
for i, lib := range libs {
|
|
||||||
if lib == requested {
|
|
||||||
resp[i] = append(resp[i], info)
|
|
||||||
found = true
|
|
||||||
break
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !found {
|
typeStr := "discrete"
|
||||||
libs = append(libs, requested)
|
if dev.Integrated {
|
||||||
resp = append(resp, []GpuInfo{info})
|
typeStr = "iGPU"
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return resp
|
|
||||||
}
|
|
||||||
|
|
||||||
// Report the GPU information into the log an Info level
|
|
||||||
func (l GpuInfoList) LogDetails() {
|
|
||||||
for _, g := range l {
|
|
||||||
slog.Info("inference compute",
|
slog.Info("inference compute",
|
||||||
"id", g.ID,
|
"id", dev.ID,
|
||||||
"library", g.Library,
|
"filtered_id", dev.FilteredID,
|
||||||
"variant", g.Variant,
|
"library", dev.Library,
|
||||||
"compute", g.Compute,
|
"compute", dev.Compute(),
|
||||||
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
|
"name", dev.Name,
|
||||||
"name", g.Name,
|
"description", dev.Description,
|
||||||
"total", format.HumanBytes2(g.TotalMemory),
|
"libdirs", strings.Join(libs, ","),
|
||||||
"available", format.HumanBytes2(g.FreeMemory),
|
"driver", dev.Driver(),
|
||||||
|
"pci_id", dev.PCIID,
|
||||||
|
"type", typeStr,
|
||||||
|
"total", format.HumanBytes2(dev.TotalMemory),
|
||||||
|
"available", format.HumanBytes2(dev.FreeMemory),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
// CPU inference
|
||||||
|
if len(devices) == 0 {
|
||||||
|
dev, _ := GetCPUMem()
|
||||||
|
slog.Info("inference compute",
|
||||||
|
"id", "cpu",
|
||||||
|
"library", "cpu",
|
||||||
|
"compute", "",
|
||||||
|
"name", "cpu",
|
||||||
|
"description", "cpu",
|
||||||
|
"libdirs", "ollama",
|
||||||
|
"driver", "",
|
||||||
|
"pci_id", "",
|
||||||
|
"type", "",
|
||||||
|
"total", format.HumanBytes2(dev.TotalMemory),
|
||||||
|
"available", format.HumanBytes2(dev.FreeMemory),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort by Free Space
|
|
||||||
type ByFreeMemory []GpuInfo
|
|
||||||
|
|
||||||
func (a ByFreeMemory) Len() int { return len(a) }
|
|
||||||
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
|
||||||
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
|
|
||||||
|
|
||||||
type SystemInfo struct {
|
|
||||||
System CPUInfo `json:"system"`
|
|
||||||
GPUs []GpuInfo `json:"gpus"`
|
|
||||||
UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
|
|
||||||
DiscoveryErrors []string `json:"discovery_errors"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the optimal number of threads to use for inference
|
|
||||||
func (si SystemInfo) GetOptimalThreadCount() int {
|
|
||||||
if len(si.System.CPUs) == 0 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
coreCount := 0
|
|
||||||
for _, c := range si.System.CPUs {
|
|
||||||
coreCount += c.CoreCount - c.EfficiencyCoreCount
|
|
||||||
}
|
|
||||||
|
|
||||||
return coreCount
|
|
||||||
}
|
|
||||||
|
|
||||||
// For each GPU, check if it does NOT support flash attention
|
|
||||||
func (l GpuInfoList) FlashAttentionSupported() bool {
|
|
||||||
for _, gpu := range l {
|
|
||||||
supportsFA := gpu.Library == "cpu" ||
|
|
||||||
gpu.Library == "metal" ||
|
|
||||||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
|
|
||||||
gpu.Library == "rocm"
|
|
||||||
|
|
||||||
if !supportsFA {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,22 +1,22 @@
|
|||||||
# Documentation
|
# Documentation
|
||||||
|
|
||||||
### Getting Started
|
### Getting Started
|
||||||
* [Quickstart](../README.md#quickstart)
|
* [Quickstart](https://docs.ollama.com/quickstart)
|
||||||
* [Examples](./examples.md)
|
* [Examples](./examples.md)
|
||||||
* [Importing models](./import.md)
|
* [Importing models](https://docs.ollama.com/import)
|
||||||
* [MacOS Documentation](./macos.md)
|
* [MacOS Documentation](https://docs.ollama.com/macos)
|
||||||
* [Linux Documentation](./linux.md)
|
* [Linux Documentation](https://docs.ollama.com/linux)
|
||||||
* [Windows Documentation](./windows.md)
|
* [Windows Documentation](https://docs.ollama.com/windows)
|
||||||
* [Docker Documentation](./docker.md)
|
* [Docker Documentation](https://docs.ollama.com/docker)
|
||||||
|
|
||||||
### Reference
|
### Reference
|
||||||
|
|
||||||
* [API Reference](./api.md)
|
* [API Reference](https://docs.ollama.com/api)
|
||||||
* [Modelfile Reference](./modelfile.md)
|
* [Modelfile Reference](./modelfile.md)
|
||||||
* [OpenAI Compatibility](./openai.md)
|
* [OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
|
||||||
|
|
||||||
### Resources
|
### Resources
|
||||||
|
|
||||||
* [Troubleshooting Guide](./troubleshooting.md)
|
* [Troubleshooting Guide](https://docs.ollama.com/troubleshooting)
|
||||||
* [FAQ](./faq.md)
|
* [FAQ](https://docs.ollama.com/faq#faq)
|
||||||
* [Development guide](./development.md)
|
* [Development guide](./development.md)
|
||||||
|
|||||||
docs/api.md (101 lines changed)
@@ -1,5 +1,7 @@
|
|||||||
# API
|
# API
|
||||||
|
|
||||||
|
> Note: Ollama's API docs are moving to https://docs.ollama.com/api
|
||||||
|
|
||||||
## Endpoints
|
## Endpoints
|
||||||
|
|
||||||
- [Generate a completion](#generate-a-completion)
|
- [Generate a completion](#generate-a-completion)
|
||||||
@@ -104,7 +106,7 @@ The final response in the stream also includes additional data about the generat
|
|||||||
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
|
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
|
||||||
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
|
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
|
||||||
|
|
||||||
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.
|
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` \* `10^9`.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -617,6 +619,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
##### Response
|
##### Response
|
||||||
|
|
||||||
A stream of JSON objects is returned:
|
A stream of JSON objects is returned:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3.2",
|
"model": "llama3.2",
|
||||||
@@ -631,7 +634,7 @@ A stream of JSON objects is returned:
|
|||||||
"arguments": {
|
"arguments": {
|
||||||
"city": "Tokyo"
|
"city": "Tokyo"
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -701,7 +704,6 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3.2",
|
"model": "llama3.2",
|
||||||
@@ -750,7 +752,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
"arguments": {
|
"arguments": {
|
||||||
"city": "Tokyo"
|
"city": "Tokyo"
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -801,7 +803,10 @@ curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json"
|
|||||||
{
|
{
|
||||||
"model": "llama3.1",
|
"model": "llama3.1",
|
||||||
"created_at": "2024-12-06T00:46:58.265747Z",
|
"created_at": "2024-12-06T00:46:58.265747Z",
|
||||||
"message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"age\": 22, \"available\": false}"
|
||||||
|
},
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"done": true,
|
"done": true,
|
||||||
"total_duration": 2254970291,
|
"total_duration": 2254970291,
|
||||||
@@ -871,7 +876,6 @@ Final response:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
#### Chat request (With history, with tools)
|
#### Chat request (With history, with tools)
|
||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
@@ -948,10 +952,8 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
"eval_count": 11,
|
"eval_count": 11,
|
||||||
"eval_duration": 90282125
|
"eval_duration": 90282125
|
||||||
}
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
#### Chat request (with images)
|
#### Chat request (with images)
|
||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
@@ -1171,9 +1173,10 @@ POST /api/create
|
|||||||
```
|
```
|
||||||
|
|
||||||
Create a model from:
|
Create a model from:
|
||||||
* another model;
|
|
||||||
* a safetensors directory; or
|
- another model;
|
||||||
* a GGUF file.
|
- a safetensors directory; or
|
||||||
|
- a GGUF file.
|
||||||
|
|
||||||
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
|
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
|
||||||
|
|
||||||
@@ -1194,10 +1197,10 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
|
|||||||
#### Quantization types
|
#### Quantization types
|
||||||
|
|
||||||
| Type | Recommended |
|
| Type | Recommended |
|
||||||
| --- | :-: |
|
| ------ | :---------: |
|
||||||
| q4_K_M | * |
|
| q4_K_M | \* |
|
||||||
| q4_K_S | |
|
| q4_K_S | |
|
||||||
| q8_0 | * |
|
| q8_0 | \* |
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
@@ -1268,7 +1271,6 @@ A stream of JSON objects is returned:
|
|||||||
|
|
||||||
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
|
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
|
||||||
|
|
||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
@@ -1291,7 +1293,6 @@ A stream of JSON objects is returned:
|
|||||||
{"status":"success"}
|
{"status":"success"}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
#### Create a model from a Safetensors directory
|
#### Create a model from a Safetensors directory
|
||||||
|
|
||||||
The `files` parameter should include a dictionary of files for the safetensors model which includes the file names and SHA256 digest of each file. Use [/api/blobs/:digest](#push-a-blob) to first push each of the files to the server before calling this API. Files will remain in the cache until the Ollama server is restarted.
|
The `files` parameter should include a dictionary of files for the safetensors model which includes the file names and SHA256 digest of each file. Use [/api/blobs/:digest](#push-a-blob) to first push each of the files to the server before calling this API. Files will remain in the cache until the Ollama server is restarted.
|
||||||
@@ -1406,9 +1407,7 @@ A single JSON object will be returned.
|
|||||||
"parent_model": "",
|
"parent_model": "",
|
||||||
"format": "gguf",
|
"format": "gguf",
|
||||||
"family": "qwen2",
|
"family": "qwen2",
|
||||||
"families": [
|
"families": ["qwen2"],
|
||||||
"qwen2"
|
|
||||||
],
|
|
||||||
"parameter_size": "7.6B",
|
"parameter_size": "7.6B",
|
||||||
"quantization_level": "Q4_K_M"
|
"quantization_level": "Q4_K_M"
|
||||||
}
|
}
|
||||||
@@ -1423,9 +1422,7 @@ A single JSON object will be returned.
|
|||||||
"parent_model": "",
|
"parent_model": "",
|
||||||
"format": "gguf",
|
"format": "gguf",
|
||||||
"family": "llama",
|
"family": "llama",
|
||||||
"families": [
|
"families": ["llama"],
|
||||||
"llama"
|
|
||||||
],
|
|
||||||
"parameter_size": "3.2B",
|
"parameter_size": "3.2B",
|
||||||
"quantization_level": "Q4_K_M"
|
"quantization_level": "Q4_K_M"
|
||||||
}
|
}
|
||||||
@@ -1461,20 +1458,18 @@ curl http://localhost:11434/api/show -d '{
|
|||||||
|
|
||||||
```json5
|
```json5
|
||||||
{
|
{
|
||||||
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
|
modelfile: '# Modelfile generated by "ollama show"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE """{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: """\nPARAMETER num_ctx 4096\nPARAMETER stop "\u003c/s\u003e"\nPARAMETER stop "USER:"\nPARAMETER stop "ASSISTANT:"',
|
||||||
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
|
parameters: 'num_keep 24\nstop "<|start_header_id|>"\nstop "<|end_header_id|>"\nstop "<|eot_id|>"',
|
||||||
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
|
template: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
|
||||||
"details": {
|
details: {
|
||||||
"parent_model": "",
|
parent_model: "",
|
||||||
"format": "gguf",
|
format: "gguf",
|
||||||
"family": "llama",
|
family: "llama",
|
||||||
"families": [
|
families: ["llama"],
|
||||||
"llama"
|
parameter_size: "8.0B",
|
||||||
],
|
quantization_level: "Q4_0",
|
||||||
"parameter_size": "8.0B",
|
|
||||||
"quantization_level": "Q4_0"
|
|
||||||
},
|
},
|
||||||
"model_info": {
|
model_info: {
|
||||||
"general.architecture": "llama",
|
"general.architecture": "llama",
|
||||||
"general.file_type": 2,
|
"general.file_type": 2,
|
||||||
"general.parameter_count": 8030261248,
|
"general.parameter_count": 8030261248,
|
||||||
@@ -1495,12 +1490,9 @@ curl http://localhost:11434/api/show -d '{
|
|||||||
"tokenizer.ggml.model": "gpt2",
|
"tokenizer.ggml.model": "gpt2",
|
||||||
"tokenizer.ggml.pre": "llama-bpe",
|
"tokenizer.ggml.pre": "llama-bpe",
|
||||||
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
|
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
|
||||||
"tokenizer.ggml.tokens": [] // populates if `verbose=true`
|
"tokenizer.ggml.tokens": [], // populates if `verbose=true`
|
||||||
},
|
},
|
||||||
"capabilities": [
|
capabilities: ["completion", "vision"],
|
||||||
"completion",
|
|
||||||
"vision"
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -1708,6 +1700,7 @@ Advanced parameters:
|
|||||||
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
||||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||||
|
- `dimensions`: number of dimensions for the embedding
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
@@ -1725,10 +1718,12 @@ curl http://localhost:11434/api/embed -d '{
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "all-minilm",
|
"model": "all-minilm",
|
||||||
"embeddings": [[
|
"embeddings": [
|
||||||
|
[
|
||||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
||||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
||||||
]],
|
]
|
||||||
|
],
|
||||||
"total_duration": 14143917,
|
"total_duration": 14143917,
|
||||||
"load_duration": 1019500,
|
"load_duration": 1019500,
|
||||||
"prompt_eval_count": 8
|
"prompt_eval_count": 8
|
||||||
@@ -1749,17 +1744,21 @@ curl http://localhost:11434/api/embed -d '{
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "all-minilm",
|
"model": "all-minilm",
|
||||||
"embeddings": [[
|
"embeddings": [
|
||||||
|
[
|
||||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
||||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
||||||
],[
|
],
|
||||||
|
[
|
||||||
-0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
|
-0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
|
||||||
0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
|
0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
|
||||||
]]
|
]
|
||||||
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## List Running Models
|
## List Running Models
|
||||||
|
|
||||||
```
|
```
|
||||||
GET /api/ps
|
GET /api/ps
|
||||||
```
|
```
|
||||||
@@ -1790,9 +1789,7 @@ A single JSON object will be returned.
|
|||||||
"parent_model": "",
|
"parent_model": "",
|
||||||
"format": "gguf",
|
"format": "gguf",
|
||||||
"family": "llama",
|
"family": "llama",
|
||||||
"families": [
|
"families": ["llama"],
|
||||||
"llama"
|
|
||||||
],
|
|
||||||
"parameter_size": "7.2B",
|
"parameter_size": "7.2B",
|
||||||
"quantization_level": "Q4_0"
|
"quantization_level": "Q4_0"
|
||||||
},
|
},
|
||||||
@@ -1839,8 +1836,10 @@ curl http://localhost:11434/api/embeddings -d '{
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"embedding": [
|
"embedding": [
|
||||||
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
0.5670403838157654, 0.009260174818336964, 0.23178744316101074,
|
||||||
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
-0.2916173040866852, -0.8924556970596313, 0.8785552978515625,
|
||||||
|
-0.34576427936553955, 0.5742510557174683, -0.04222835972905159,
|
||||||
|
-0.137906014919281
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@@ -1868,5 +1867,3 @@ curl http://localhost:11434/api/version
|
|||||||
"version": "0.5.1"
|
"version": "0.5.1"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
docs/api/authentication.mdx (new file, 63 lines added)
@@ -0,0 +1,63 @@
|
|||||||
|
---
|
||||||
|
title: Authentication
|
||||||
|
---
|
||||||
|
|
||||||
|
No authentication is required when accessing Ollama's API locally via `http://localhost:11434`.
|
||||||
|
|
||||||
|
Authentication is required for the following:
|
||||||
|
|
||||||
|
* Running cloud models via ollama.com
|
||||||
|
* Publishing models
|
||||||
|
* Downloading private models
|
||||||
|
|
||||||
|
Ollama supports two authentication methods:
|
||||||
|
|
||||||
|
* **Signing in**: sign in from your local installation, and Ollama will automatically take care of authenticating requests to ollama.com when running commands
|
||||||
|
* **API keys**: API keys for programmatic access to ollama.com's API
|
||||||
|
|
||||||
|
## Signing in
|
||||||
|
|
||||||
|
To sign in to ollama.com from your local installation of Ollama, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama signin
|
||||||
|
```
|
||||||
|
|
||||||
|
Once signed in, Ollama will automatically authenticate commands as required:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama run gpt-oss:120b-cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
Similarly, when accessing a local API endpoint that requires cloud access, Ollama will automatically authenticate the request:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:11434/api/generate -d '{
|
||||||
|
"model": "gpt-oss:120b-cloud",
|
||||||
|
"prompt": "Why is the sky blue?"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## API keys
|
||||||
|
|
||||||
|
For direct access to ollama.com's API served at `https://ollama.com/api`, authentication via API keys is required.
|
||||||
|
|
||||||
|
First, create an [API key](https://ollama.com/settings/keys), then set the `OLLAMA_API_KEY` environment variable:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export OLLAMA_API_KEY=your_api_key
|
||||||
|
```
|
||||||
|
|
||||||
|
Then use the API key in the Authorization header:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl https://ollama.com/api/generate \
|
||||||
|
-H "Authorization: Bearer $OLLAMA_API_KEY" \
|
||||||
|
-d '{
|
||||||
|
"model": "gpt-oss:120b",
|
||||||
|
"prompt": "Why is the sky blue?",
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
API keys don't currently expire; however, you can revoke them at any time in your [API keys settings](https://ollama.com/settings/keys).
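
The same `Authorization` header works from any HTTP client. Below is a minimal Go sketch mirroring the curl example above; the request fields and the `OLLAMA_API_KEY` environment variable come from that example, and the error handling is deliberately simplistic.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	// Request body mirrors the curl example above.
	body, _ := json.Marshal(map[string]any{
		"model":  "gpt-oss:120b",
		"prompt": "Why is the sky blue?",
		"stream": false,
	})

	req, err := http.NewRequest("POST", "https://ollama.com/api/generate", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	// Authenticate with the API key from the environment, as in the curl example.
	req.Header.Set("Authorization", "Bearer "+os.Getenv("OLLAMA_API_KEY"))
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(out))
}
```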
|
||||||
docs/api/errors.mdx (new file, 36 lines added)
@@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
title: Errors
|
||||||
|
---
|
||||||
|
|
||||||
|
## Status codes
|
||||||
|
|
||||||
|
Endpoints return an appropriate HTTP status code in the status line based on the success or failure of the request (e.g. `HTTP/1.1 200 OK` or `HTTP/1.1 400 Bad Request`). Common status codes are:
|
||||||
|
|
||||||
|
- `200`: Success
|
||||||
|
- `400`: Bad Request (missing parameters, invalid JSON, etc.)
|
||||||
|
- `404`: Not Found (model doesn't exist, etc.)
|
||||||
|
- `429`: Too Many Requests (e.g. when a rate limit is exceeded)
|
||||||
|
- `500`: Internal Server Error
|
||||||
|
- `502`: Bad Gateway (e.g. when a cloud model cannot be reached)
|
||||||
|
|
||||||
|
## Error messages
|
||||||
|
|
||||||
|
Errors are returned in the `application/json` format with the following structure; the error message is in the `error` property:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "the model failed to generate a response"
|
||||||
|
}
|
||||||
|
```
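
A client can combine the status code with the `error` property when surfacing failures. The Go sketch below is illustrative only (the `apiError` type and `checkResponse` function are not part of any Ollama library); it decodes the body shown above whenever the status code indicates a failure.

```go
package example

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// apiError mirrors the error body shown above; the type name is illustrative.
type apiError struct {
	Error string `json:"error"`
}

// checkResponse returns a descriptive error for non-2xx responses.
func checkResponse(resp *http.Response) error {
	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
		return nil
	}
	var e apiError
	if err := json.NewDecoder(resp.Body).Decode(&e); err != nil || e.Error == "" {
		return fmt.Errorf("request failed: %s", resp.Status)
	}
	return fmt.Errorf("request failed: %s: %s", resp.Status, e.Error)
}
```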
|
||||||
|
|
||||||
|
## Errors that occur while streaming
|
||||||
|
|
||||||
|
If an error occurs mid-stream, the error will be returned as an object in the `application/x-ndjson` format with an `error` property. Since the response has already started, the status code of the response will not be changed.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:21:21.196249Z","response":" Yes","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:21:21.207235Z","response":".","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:21:21.219166Z","response":"I","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:21:21.231094Z","response":"can","done":false}
|
||||||
|
{"error":"an error was encountered while running the model"}
|
||||||
|
```
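
When consuming a stream, a client should therefore check each decoded line for the `error` property rather than relying on the status code. A minimal Go sketch of that loop follows; the chunk struct covers only the fields used here and the function name is illustrative.

```go
package example

import (
	"bufio"
	"encoding/json"
	"errors"
	"fmt"
	"io"
)

// readStream consumes an application/x-ndjson response body line by line and
// stops with an error as soon as a chunk carries an "error" property.
func readStream(body io.Reader) error {
	sc := bufio.NewScanner(body)
	for sc.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Error    string `json:"error"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(sc.Bytes(), &chunk); err != nil {
			return err
		}
		if chunk.Error != "" {
			return errors.New(chunk.Error)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			break
		}
	}
	return sc.Err()
}
```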
|
||||||
docs/api/index.mdx (new file, 47 lines added)
@@ -0,0 +1,47 @@
|
|||||||
|
---
|
||||||
|
title: Introduction
|
||||||
|
---
|
||||||
|
|
||||||
|
Ollama's API allows you to run and interact with models programmatically.
|
||||||
|
|
||||||
|
## Get started
|
||||||
|
|
||||||
|
If you're just getting started, follow the [quickstart](/quickstart) documentation to get up and running with Ollama's API.
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
After installation, Ollama's API is served by default at:
|
||||||
|
|
||||||
|
```
|
||||||
|
http://localhost:11434/api
|
||||||
|
```
|
||||||
|
|
||||||
|
For running cloud models on **ollama.com**, the same API is available with the following base URL:
|
||||||
|
|
||||||
|
```
|
||||||
|
https://ollama.com/api
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example request
|
||||||
|
|
||||||
|
Once Ollama is running, its API is automatically available and can be accessed via `curl`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:11434/api/generate -d '{
|
||||||
|
"model": "gemma3",
|
||||||
|
"prompt": "Why is the sky blue?"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Libraries
|
||||||
|
|
||||||
|
Ollama has official libraries for Python and JavaScript:
|
||||||
|
|
||||||
|
- [Python](https://github.com/ollama/ollama-python)
|
||||||
|
- [JavaScript](https://github.com/ollama/ollama-js)
|
||||||
|
|
||||||
|
Several community-maintained libraries are available for Ollama. For a full list, see the [Ollama GitHub repository](https://github.com/ollama/ollama?tab=readme-ov-file#libraries-1).
|
||||||
|
|
||||||
|
## Versioning
|
||||||
|
|
||||||
|
Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
|
||||||
@@ -1,9 +1,8 @@
|
|||||||
# OpenAI compatibility
|
---
|
||||||
|
title: OpenAI compatibility
|
||||||
|
---
|
||||||
|
|
||||||
> [!NOTE]
|
Ollama provides compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
|
||||||
> OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
|
|
||||||
|
|
||||||
Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@@ -100,19 +99,19 @@ except Exception as e:
|
|||||||
### OpenAI JavaScript library
|
### OpenAI JavaScript library
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
import OpenAI from 'openai'
|
import OpenAI from "openai";
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
baseURL: 'http://localhost:11434/v1/',
|
baseURL: "http://localhost:11434/v1/",
|
||||||
|
|
||||||
// required but ignored
|
// required but ignored
|
||||||
apiKey: 'ollama',
|
apiKey: "ollama",
|
||||||
})
|
});
|
||||||
|
|
||||||
const chatCompletion = await openai.chat.completions.create({
|
const chatCompletion = await openai.chat.completions.create({
|
||||||
messages: [{ role: 'user', content: 'Say this is a test' }],
|
messages: [{ role: "user", content: "Say this is a test" }],
|
||||||
model: 'llama3.2',
|
model: "llama3.2",
|
||||||
})
|
});
|
||||||
|
|
||||||
const response = await openai.chat.completions.create({
|
const response = await openai.chat.completions.create({
|
||||||
model: "llava",
|
model: "llava",
|
||||||
@@ -123,26 +122,27 @@ const response = await openai.chat.completions.create({
|
|||||||
{ type: "text", text: "What's in this image?" },
|
{ type: "text", text: "What's in this image?" },
|
||||||
{
|
{
|
||||||
type: "image_url",
|
type: "image_url",
|
||||||
image_url: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6
pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
|
image_url:
|
||||||
|
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZO
ndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
})
|
});
|
||||||
|
|
||||||
const completion = await openai.completions.create({
|
const completion = await openai.completions.create({
|
||||||
model: "llama3.2",
|
model: "llama3.2",
|
||||||
prompt: "Say this is a test.",
|
prompt: "Say this is a test.",
|
||||||
})
|
});
|
||||||
|
|
||||||
const listCompletion = await openai.models.list()
|
const listCompletion = await openai.models.list();
|
||||||
|
|
||||||
const model = await openai.models.retrieve("llama3.2")
|
const model = await openai.models.retrieve("llama3.2");
|
||||||
|
|
||||||
const embedding = await openai.embeddings.create({
|
const embedding = await openai.embeddings.create({
|
||||||
model: "all-minilm",
|
model: "all-minilm",
|
||||||
input: ["why is the sky blue?", "why is the grass green?"],
|
input: ["why is the sky blue?", "why is the grass green?"],
|
||||||
})
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
### `curl`
|
### `curl`
|
||||||
@@ -306,8 +306,8 @@ curl http://localhost:11434/v1/embeddings \
|
|||||||
- [x] array of strings
|
- [x] array of strings
|
||||||
- [ ] array of tokens
|
- [ ] array of tokens
|
||||||
- [ ] array of token arrays
|
- [ ] array of token arrays
|
||||||
- [ ] `encoding format`
|
- [x] `encoding format`
|
||||||
- [ ] `dimensions`
|
- [x] `dimensions`
|
||||||
- [ ] `user`
|
- [ ] `user`
|
||||||
|
|
||||||
## Models
|
## Models
|
||||||
docs/api/streaming.mdx (new file, 35 lines added)
@@ -0,0 +1,35 @@
|
|||||||
|
---
|
||||||
|
title: Streaming
|
||||||
|
---
|
||||||
|
|
||||||
|
Certain API endpoints stream responses by default, such as `/api/generate`. These responses are provided in the newline-delimited JSON format (i.e. the `application/x-ndjson` content type). For example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.097767Z","response":"That","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.109172Z","response":"'","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.121485Z","response":"s","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.132802Z","response":" a","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.143931Z","response":" fantastic","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.155176Z","response":" question","done":false}
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.166576Z","response":"!","done":true, "done_reason": "stop"}
|
||||||
|
```
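
A common way to consume this format is to read the body line by line and concatenate the `response` fragments until a chunk reports `done: true`. The Go sketch below illustrates that; the struct fields match the chunks shown above and the function name is illustrative.

```go
package example

import (
	"bufio"
	"encoding/json"
	"io"
	"strings"
)

// collectResponse reads a streaming /api/generate body (one JSON object per
// line) and joins the "response" fragments until a chunk with done=true.
func collectResponse(body io.Reader) (string, error) {
	var full strings.Builder
	sc := bufio.NewScanner(body)
	for sc.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(sc.Bytes(), &chunk); err != nil {
			return "", err
		}
		full.WriteString(chunk.Response)
		if chunk.Done {
			break
		}
	}
	return full.String(), sc.Err()
}
```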
|
||||||
|
|
||||||
|
## Disabling streaming
|
||||||
|
|
||||||
|
Streaming can be disabled by providing `{"stream": false}` in the request body for any endpoint that supports streaming. This will cause responses to be returned in the `application/json` format instead:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"model":"gemma3","created_at":"2025-10-26T17:15:24.166576Z","response":"That's a fantastic question!","done":true}
|
||||||
|
```
|
||||||
|
|
||||||
|
## When to use streaming vs non-streaming
|
||||||
|
|
||||||
|
**Streaming (default)**:
|
||||||
|
- Real-time response generation
|
||||||
|
- Lower perceived latency
|
||||||
|
- Better for long generations
|
||||||
|
|
||||||
|
**Non-streaming**:
|
||||||
|
- Simpler to process
|
||||||
|
- Better for short responses, or structured outputs
|
||||||
|
- Easier to handle in some applications
|
||||||
docs/api/usage.mdx (new file, 36 lines added)
@@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
title: Usage
|
||||||
|
---
|
||||||
|
|
||||||
|
Ollama's API responses include metrics that can be used for measuring performance and model usage:
|
||||||
|
|
||||||
|
* `total_duration`: How long the response took to generate
|
||||||
|
* `load_duration`: How long the model took to load
|
||||||
|
* `prompt_eval_count`: How many input tokens were processed
|
||||||
|
* `prompt_eval_duration`: How long it took to evaluate the prompt
|
||||||
|
* `eval_count`: How many output tokens were processed
|
||||||
|
* `eval_duration`: How long it took to generate the output tokens
|
||||||
|
|
||||||
|
All timing values are measured in nanoseconds.
|
||||||
|
|
||||||
|
## Example response
|
||||||
|
|
||||||
|
For endpoints that return usage metrics, the response body will include the usage fields. For example, a non-streaming call to `/api/generate` may return the following response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "gemma3",
|
||||||
|
"created_at": "2025-10-17T23:14:07.414671Z",
|
||||||
|
"response": "Hello! How can I help you today?",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 174560334,
|
||||||
|
"load_duration": 101397084,
|
||||||
|
"prompt_eval_count": 11,
|
||||||
|
"prompt_eval_duration": 13074791,
|
||||||
|
"eval_count": 18,
|
||||||
|
"eval_duration": 52479709
|
||||||
|
}
|
||||||
|
```
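
As a quick check on the numbers above, throughput can be derived directly from these fields; a minimal sketch using the values from the example response:

```python
eval_count = 18            # output tokens, from the example above
eval_duration = 52479709   # nanoseconds, from the example above

# convert nanoseconds to seconds, then compute output tokens per second
tokens_per_second = eval_count / (eval_duration / 1e9)
print(f"{tokens_per_second:.0f} tokens/s")  # roughly 343 tokens/s
```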
|
||||||
|
|
||||||
|
For endpoints that return **streaming responses**, usage fields are included as part of the final chunk, where `done` is `true`.
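
A minimal sketch of reading those fields from a stream with the Python SDK, assuming the `ollama` package is installed and the `gemma3` model is pulled locally; only the final chunk, where `done` is true, carries the usage metrics:

```python
from ollama import generate

stream = generate(model='gemma3', prompt='Why is the sky blue?', stream=True)

final = None
for chunk in stream:
    # intermediate chunks carry partial text; keep the last chunk for its metrics
    if chunk.done:
        final = chunk

if final is not None:
    print('output tokens:', final.eval_count)
    print('generation time (s):', final.eval_duration / 1e9)
```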
|
||||||
113
docs/capabilities/embeddings.mdx
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
---
|
||||||
|
title: Embeddings
|
||||||
|
description: Generate text embeddings for semantic search, retrieval, and RAG.
|
||||||
|
---
|
||||||
|
|
||||||
|
Embeddings turn text into numeric vectors you can store in a vector database, search with cosine similarity, or use in RAG pipelines. The vector length depends on the model (typically 384–1024 dimensions).
|
||||||
|
|
||||||
|
## Recommended models
|
||||||
|
|
||||||
|
- [embeddinggemma](https://ollama.com/library/embeddinggemma)
|
||||||
|
- [qwen3-embedding](https://ollama.com/library/qwen3-embedding)
|
||||||
|
- [all-minilm](https://ollama.com/library/all-minilm)
|
||||||
|
|
||||||
|
## Generate embeddings
|
||||||
|
|
||||||
|
Use `/api/embed` with a single string.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
curl -X POST http://localhost:11434/api/embed \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "embeddinggemma",
|
||||||
|
"input": "The quick brown fox jumps over the lazy dog."
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
import ollama
|
||||||
|
|
||||||
|
single = ollama.embed(
|
||||||
|
model='embeddinggemma',
|
||||||
|
input='The quick brown fox jumps over the lazy dog.'
|
||||||
|
)
|
||||||
|
print(len(single['embeddings'][0])) # vector length
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
const single = await ollama.embed({
|
||||||
|
model: 'embeddinggemma',
|
||||||
|
input: 'The quick brown fox jumps over the lazy dog.',
|
||||||
|
})
|
||||||
|
console.log(single.embeddings[0].length) // vector length
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
The `/api/embed` endpoint returns L2‑normalized (unit‑length) vectors.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
## Generate a batch of embeddings
|
||||||
|
|
||||||
|
Pass an array of strings to `input`.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
curl -X POST http://localhost:11434/api/embed \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "embeddinggemma",
|
||||||
|
"input": [
|
||||||
|
"First sentence",
|
||||||
|
"Second sentence",
|
||||||
|
"Third sentence"
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
import ollama
|
||||||
|
|
||||||
|
batch = ollama.embed(
|
||||||
|
model='embeddinggemma',
|
||||||
|
input=[
|
||||||
|
'The quick brown fox jumps over the lazy dog.',
|
||||||
|
'The five boxing wizards jump quickly.',
|
||||||
|
'Jackdaws love my big sphinx of quartz.',
|
||||||
|
]
|
||||||
|
)
|
||||||
|
print(len(batch['embeddings'])) # number of vectors
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
const batch = await ollama.embed({
|
||||||
|
model: 'embeddinggemma',
|
||||||
|
input: [
|
||||||
|
'The quick brown fox jumps over the lazy dog.',
|
||||||
|
'The five boxing wizards jump quickly.',
|
||||||
|
'Jackdaws love my big sphinx of quartz.',
|
||||||
|
],
|
||||||
|
})
|
||||||
|
console.log(batch.embeddings.length) // number of vectors
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
|
||||||
|
- Use cosine similarity for most semantic search use cases (see the sketch below).
|
||||||
|
- Use the same embedding model for both indexing and querying.
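
A minimal sketch of that flow, assuming the `embeddinggemma` model is pulled locally; since `/api/embed` returns unit-length vectors, the plain dot product already equals the cosine similarity:

```python
import ollama

docs = [
    'Ollama runs language models locally.',
    'The sky is blue because of Rayleigh scattering.',
]
query = 'Why is the sky blue?'

# embed the documents and the query with the same model
doc_vectors = ollama.embed(model='embeddinggemma', input=docs)['embeddings']
query_vector = ollama.embed(model='embeddinggemma', input=query)['embeddings'][0]

def dot(a, b):
    return sum(x * y for x, y in zip(a, b))

# rank documents by similarity to the query (highest first)
ranked = sorted(zip(docs, doc_vectors), key=lambda pair: dot(query_vector, pair[1]), reverse=True)
print(ranked[0][0])
```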
|
||||||
|
|
||||||
|
|
||||||
99
docs/capabilities/streaming.mdx
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
---
|
||||||
|
title: Streaming
|
||||||
|
---
|
||||||
|
|
||||||
|
Streaming allows you to render text as it is produced by the model.
|
||||||
|
|
||||||
|
Streaming is enabled by default through the REST API, but disabled by default in the SDKs.
|
||||||
|
|
||||||
|
To enable streaming in the SDKs, set the `stream` parameter to `True` (Python) or `true` (JavaScript).
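
A minimal sketch with the Python SDK, assuming the `gemma3` model is pulled locally; with `stream=True` the call returns an iterator of chunks instead of a single response:

```python
from ollama import chat

stream = chat(
    model='gemma3',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
    stream=True,
)

# print each partial piece of the reply as it arrives
for chunk in stream:
    print(chunk.message.content, end='', flush=True)
```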
|
||||||
|
|
||||||
|
## Key streaming concepts
|
||||||
|
1. Chatting: Stream partial assistant messages. Each chunk includes the `content` so you can render messages as they arrive.
|
||||||
|
1. Thinking: Thinking-capable models emit a `thinking` field alongside regular content in each chunk. Detect this field in streaming chunks to show or hide reasoning traces before the final answer arrives.
|
||||||
|
1. Tool calling: Watch for streamed `tool_calls` in each chunk, execute the requested tool, and append tool outputs back into the conversation.
|
||||||
|
|
||||||
|
## Handling streamed chunks
|
||||||
|
|
||||||
|
|
||||||
|
<Note> Accumulate the partial fields to maintain the conversation history. This is particularly important for tool calling, where the thinking, the model's tool call, and the executed tool result must all be passed back to the model in the next request. </Note>
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="Python">
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
stream = chat(
|
||||||
|
model='qwen3',
|
||||||
|
messages=[{'role': 'user', 'content': 'What is 17 × 23?'}],
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
in_thinking = False
|
||||||
|
content = ''
|
||||||
|
thinking = ''
|
||||||
|
for chunk in stream:
|
||||||
|
if chunk.message.thinking:
|
||||||
|
if not in_thinking:
|
||||||
|
in_thinking = True
|
||||||
|
print('Thinking:\n', end='', flush=True)
|
||||||
|
print(chunk.message.thinking, end='', flush=True)
|
||||||
|
# accumulate the partial thinking
|
||||||
|
thinking += chunk.message.thinking
|
||||||
|
elif chunk.message.content:
|
||||||
|
if in_thinking:
|
||||||
|
in_thinking = False
|
||||||
|
print('\n\nAnswer:\n', end='', flush=True)
|
||||||
|
print(chunk.message.content, end='', flush=True)
|
||||||
|
# accumulate the partial content
|
||||||
|
content += chunk.message.content
|
||||||
|
|
||||||
|
# append the accumulated fields to the messages for the next request
|
||||||
|
new_messages = [{'role': 'assistant', 'thinking': thinking, 'content': content}]
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const stream = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages: [{ role: 'user', content: 'What is 17 × 23?' }],
|
||||||
|
stream: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
let inThinking = false
|
||||||
|
let content = ''
|
||||||
|
let thinking = ''
|
||||||
|
|
||||||
|
for await (const chunk of stream) {
|
||||||
|
if (chunk.message.thinking) {
|
||||||
|
if (!inThinking) {
|
||||||
|
inThinking = true
|
||||||
|
process.stdout.write('Thinking:\n')
|
||||||
|
}
|
||||||
|
process.stdout.write(chunk.message.thinking)
|
||||||
|
// accumulate the partial thinking
|
||||||
|
thinking += chunk.message.thinking
|
||||||
|
} else if (chunk.message.content) {
|
||||||
|
if (inThinking) {
|
||||||
|
inThinking = false
|
||||||
|
process.stdout.write('\n\nAnswer:\n')
|
||||||
|
}
|
||||||
|
process.stdout.write(chunk.message.content)
|
||||||
|
// accumulate the partial content
|
||||||
|
content += chunk.message.content
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// append the accumulated fields to the messages for the next request
|
||||||
|
const newMessages = [{ role: 'assistant', thinking, content }]
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch(console.error)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
194
docs/capabilities/structured-outputs.mdx
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
---
|
||||||
|
title: Structured Outputs
|
||||||
|
---
|
||||||
|
|
||||||
|
Structured outputs let you enforce a JSON schema on model responses so you can reliably extract structured data, describe images, or keep every reply consistent.
|
||||||
|
|
||||||
|
## Generating structured JSON
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "gpt-oss",
|
||||||
|
"messages": [{"role": "user", "content": "Tell me about Canada in one line"}],
|
||||||
|
"stream": false,
|
||||||
|
"format": "json"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='gpt-oss',
|
||||||
|
messages=[{'role': 'user', 'content': 'Tell me about Canada.'}],
|
||||||
|
format='json'
|
||||||
|
)
|
||||||
|
print(response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'gpt-oss',
|
||||||
|
messages: [{ role: 'user', content: 'Tell me about Canada.' }],
|
||||||
|
format: 'json'
|
||||||
|
})
|
||||||
|
console.log(response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Generating structured JSON with a schema
|
||||||
|
|
||||||
|
Provide a JSON schema to the `format` field.
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
For best results, also include the JSON schema as a string in the prompt to ground the model's response.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "gpt-oss",
|
||||||
|
"messages": [{"role": "user", "content": "Tell me about Canada."}],
|
||||||
|
"stream": false,
|
||||||
|
"format": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {"type": "string"},
|
||||||
|
"capital": {"type": "string"},
|
||||||
|
"languages": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["name", "capital", "languages"]
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
Use Pydantic models and pass `model_json_schema()` to `format`, then validate the response:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
class Country(BaseModel):
|
||||||
|
name: str
|
||||||
|
capital: str
|
||||||
|
languages: list[str]
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='gpt-oss',
|
||||||
|
messages=[{'role': 'user', 'content': 'Tell me about Canada.'}],
|
||||||
|
format=Country.model_json_schema(),
|
||||||
|
)
|
||||||
|
|
||||||
|
country = Country.model_validate_json(response.message.content)
|
||||||
|
print(country)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
Serialize a Zod schema with `zodToJsonSchema()` and parse the structured response:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
import { z } from 'zod'
|
||||||
|
import { zodToJsonSchema } from 'zod-to-json-schema'
|
||||||
|
|
||||||
|
const Country = z.object({
|
||||||
|
name: z.string(),
|
||||||
|
capital: z.string(),
|
||||||
|
languages: z.array(z.string()),
|
||||||
|
})
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'gpt-oss',
|
||||||
|
messages: [{ role: 'user', content: 'Tell me about Canada.' }],
|
||||||
|
format: zodToJsonSchema(Country),
|
||||||
|
})
|
||||||
|
|
||||||
|
const country = Country.parse(JSON.parse(response.message.content))
|
||||||
|
console.log(country)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Example: Extract structured data
|
||||||
|
|
||||||
|
Define the objects you want returned and let the model populate the fields:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
class Pet(BaseModel):
|
||||||
|
name: str
|
||||||
|
animal: str
|
||||||
|
age: int
|
||||||
|
color: str | None
|
||||||
|
favorite_toy: str | None
|
||||||
|
|
||||||
|
class PetList(BaseModel):
|
||||||
|
pets: list[Pet]
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='gpt-oss',
|
||||||
|
messages=[{'role': 'user', 'content': 'I have two cats named Luna and Loki...'}],
|
||||||
|
format=PetList.model_json_schema(),
|
||||||
|
)
|
||||||
|
|
||||||
|
pets = PetList.model_validate_json(response.message.content)
|
||||||
|
print(pets)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example: Vision with structured outputs
|
||||||
|
|
||||||
|
Vision models accept the same `format` parameter, enabling deterministic descriptions of images:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Literal, Optional
|
||||||
|
|
||||||
|
class Object(BaseModel):
|
||||||
|
name: str
|
||||||
|
confidence: float
|
||||||
|
attributes: str
|
||||||
|
|
||||||
|
class ImageDescription(BaseModel):
|
||||||
|
summary: str
|
||||||
|
objects: list[Object]
|
||||||
|
scene: str
|
||||||
|
colors: list[str]
|
||||||
|
time_of_day: Literal['Morning', 'Afternoon', 'Evening', 'Night']
|
||||||
|
setting: Literal['Indoor', 'Outdoor', 'Unknown']
|
||||||
|
text_content: Optional[str] = None
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='gemma3',
|
||||||
|
messages=[{
|
||||||
|
'role': 'user',
|
||||||
|
'content': 'Describe this photo and list the objects you detect.',
|
||||||
|
'images': ['path/to/image.jpg'],
|
||||||
|
}],
|
||||||
|
format=ImageDescription.model_json_schema(),
|
||||||
|
options={'temperature': 0},
|
||||||
|
)
|
||||||
|
|
||||||
|
image_description = ImageDescription.model_validate_json(response.message.content)
|
||||||
|
print(image_description)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips for reliable structured outputs
|
||||||
|
|
||||||
|
- Define schemas with Pydantic (Python) or Zod (JavaScript) so they can be reused for validation.
|
||||||
|
- Lower the temperature (e.g., set it to `0`) for more deterministic completions.
|
||||||
|
- Structured outputs work through the OpenAI-compatible API via `response_format` (see the sketch below).
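
A minimal sketch of that route, assuming the `openai` Python package and a locally pulled `gpt-oss` model; the SDK's `parse` helper converts the Pydantic model into a JSON schema and sends it as `response_format`:

```python
from openai import OpenAI
from pydantic import BaseModel

class Country(BaseModel):
    name: str
    capital: str
    languages: list[str]

# point the OpenAI client at the local Ollama server; any non-empty API key works
client = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

completion = client.beta.chat.completions.parse(
    model='gpt-oss',
    messages=[{'role': 'user', 'content': 'Tell me about Canada.'}],
    response_format=Country,
)
print(completion.choices[0].message.parsed)
```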
|
||||||
153
docs/capabilities/thinking.mdx
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
---
|
||||||
|
title: Thinking
|
||||||
|
---
|
||||||
|
|
||||||
|
Thinking-capable models emit a `thinking` field that separates their reasoning trace from the final answer.
|
||||||
|
|
||||||
|
Use this capability to audit model steps, animate the model *thinking* in a UI, or hide the trace entirely when you only need the final response.
|
||||||
|
|
||||||
|
## Supported models
|
||||||
|
|
||||||
|
- [Qwen 3](https://ollama.com/library/qwen3)
|
||||||
|
- [GPT-OSS](https://ollama.com/library/gpt-oss) *(use `think` levels: `low`, `medium`, `high` — the trace cannot be fully disabled)*
|
||||||
|
- [DeepSeek-v3.1](https://ollama.com/library/deepseek-v3.1)
|
||||||
|
- [DeepSeek R1](https://ollama.com/library/deepseek-r1)
|
||||||
|
- Browse the latest additions under [thinking models](https://ollama.com/search?c=thinking)
|
||||||
|
|
||||||
|
## Enable thinking in API calls
|
||||||
|
|
||||||
|
Set the `think` field on chat or generate requests. Most models accept booleans (`true`/`false`).
|
||||||
|
|
||||||
|
GPT-OSS instead expects one of `low`, `medium`, or `high` to tune the trace length.
|
||||||
|
|
||||||
|
The `message.thinking` (chat endpoint) or `thinking` (generate endpoint) field contains the reasoning trace while `message.content` / `response` holds the final answer.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
curl http://localhost:11434/api/chat -d '{
|
||||||
|
"model": "qwen3",
|
||||||
|
"messages": [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "How many letter r are in strawberry?"
|
||||||
|
}],
|
||||||
|
"think": true,
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='qwen3',
|
||||||
|
messages=[{'role': 'user', 'content': 'How many letter r are in strawberry?'}],
|
||||||
|
think=True,
|
||||||
|
stream=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
print('Thinking:\n', response.message.thinking)
|
||||||
|
print('Answer:\n', response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'deepseek-r1',
|
||||||
|
messages: [{ role: 'user', content: 'How many letter r are in strawberry?' }],
|
||||||
|
think: true,
|
||||||
|
stream: false,
|
||||||
|
})
|
||||||
|
|
||||||
|
console.log('Thinking:\n', response.message.thinking)
|
||||||
|
console.log('Answer:\n', response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
GPT-OSS requires `think` to be set to `"low"`, `"medium"`, or `"high"`. Passing `true`/`false` is ignored for that model.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
## Stream the reasoning trace
|
||||||
|
|
||||||
|
Thinking streams emit reasoning tokens before answer tokens. Detect the first `thinking` chunk to render a "thinking" section, then switch to the final reply once `message.content` arrives.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
stream = chat(
|
||||||
|
model='qwen3',
|
||||||
|
messages=[{'role': 'user', 'content': 'What is 17 × 23?'}],
|
||||||
|
think=True,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
in_thinking = False
|
||||||
|
|
||||||
|
for chunk in stream:
|
||||||
|
if chunk.message.thinking and not in_thinking:
|
||||||
|
in_thinking = True
|
||||||
|
print('Thinking:\n', end='')
|
||||||
|
|
||||||
|
if chunk.message.thinking:
|
||||||
|
print(chunk.message.thinking, end='')
|
||||||
|
elif chunk.message.content:
|
||||||
|
if in_thinking:
|
||||||
|
print('\n\nAnswer:\n', end='')
|
||||||
|
in_thinking = False
|
||||||
|
print(chunk.message.content, end='')
|
||||||
|
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const stream = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages: [{ role: 'user', content: 'What is 17 × 23?' }],
|
||||||
|
think: true,
|
||||||
|
stream: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
let inThinking = false
|
||||||
|
|
||||||
|
for await (const chunk of stream) {
|
||||||
|
if (chunk.message.thinking && !inThinking) {
|
||||||
|
inThinking = true
|
||||||
|
process.stdout.write('Thinking:\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
if (chunk.message.thinking) {
|
||||||
|
process.stdout.write(chunk.message.thinking)
|
||||||
|
} else if (chunk.message.content) {
|
||||||
|
if (inThinking) {
|
||||||
|
process.stdout.write('\n\nAnswer:\n')
|
||||||
|
inThinking = false
|
||||||
|
}
|
||||||
|
process.stdout.write(chunk.message.content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main()
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## CLI quick reference
|
||||||
|
|
||||||
|
- Enable thinking for a single run: `ollama run deepseek-r1 --think "Where should I visit in Lisbon?"`
|
||||||
|
- Disable thinking: `ollama run deepseek-r1 --think=false "Summarize this article"`
|
||||||
|
- Hide the trace while still using a thinking model: `ollama run deepseek-r1 --hidethinking "Is 9.9 bigger or 9.11?"`
|
||||||
|
- Inside interactive sessions, toggle with `/set think` or `/set nothink`.
|
||||||
|
- GPT-OSS only accepts levels: `ollama run gpt-oss --think=low "Draft a headline"` (replace `low` with `medium` or `high` as needed).
|
||||||
|
|
||||||
|
<Note>Thinking is enabled by default in the CLI and API for supported models.</Note>
|
||||||
777
docs/capabilities/tool-calling.mdx
Normal file
@@ -0,0 +1,777 @@
|
|||||||
|
---
|
||||||
|
title: Tool calling
|
||||||
|
---
|
||||||
|
|
||||||
|
Ollama supports tool calling (also known as function calling), which allows a model to invoke tools and incorporate their results into its replies.
|
||||||
|
|
||||||
|
## Calling a single tool
|
||||||
|
Invoke a single tool and include its response in a follow-up request.
|
||||||
|
|
||||||
|
Also known as "single-shot" tool calling.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "qwen3",
|
||||||
|
"messages": [{"role": "user", "content": "What's the temperature in New York?"}],
|
||||||
|
"stream": false,
|
||||||
|
"tools": [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_temperature",
|
||||||
|
"description": "Get the current temperature for a city",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["city"],
|
||||||
|
"properties": {
|
||||||
|
"city": {"type": "string", "description": "The name of the city"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate a response with a single tool result**
|
||||||
|
```shell
|
||||||
|
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "qwen3",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "What's the temperature in New York?"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"tool_calls": [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"index": 0,
|
||||||
|
"name": "get_temperature",
|
||||||
|
"arguments": {"city": "New York"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{"role": "tool", "tool_name": "get_temperature", "content": "22°C"}
|
||||||
|
],
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
Install the Ollama Python SDK:
|
||||||
|
```bash
|
||||||
|
# with pip
|
||||||
|
pip install ollama -U
|
||||||
|
|
||||||
|
# with uv
|
||||||
|
uv add ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
def get_temperature(city: str) -> str:
|
||||||
|
"""Get the current temperature for a city
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city: The name of the city
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The current temperature for the city
|
||||||
|
"""
|
||||||
|
temperatures = {
|
||||||
|
"New York": "22°C",
|
||||||
|
"London": "15°C",
|
||||||
|
"Tokyo": "18°C",
|
||||||
|
}
|
||||||
|
return temperatures.get(city, "Unknown")
|
||||||
|
|
||||||
|
messages = [{"role": "user", "content": "What's the temperature in New York?"}]
|
||||||
|
|
||||||
|
# pass functions directly as tools in the tools list or as a JSON schema
|
||||||
|
response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
|
||||||
|
|
||||||
|
messages.append(response.message)
|
||||||
|
if response.message.tool_calls:
|
||||||
|
# only recommended for models which only return a single tool call
|
||||||
|
call = response.message.tool_calls[0]
|
||||||
|
result = get_temperature(**call.function.arguments)
|
||||||
|
# add the tool result to the messages
|
||||||
|
messages.append({"role": "tool", "tool_name": call.function.name, "content": str(result)})
|
||||||
|
|
||||||
|
final_response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
|
||||||
|
print(final_response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
Install the Ollama JavaScript library:
|
||||||
|
```bash
|
||||||
|
# with npm
|
||||||
|
npm i ollama
|
||||||
|
|
||||||
|
# with bun
|
||||||
|
bun i ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
function getTemperature(city: string): string {
|
||||||
|
const temperatures: Record<string, string> = {
|
||||||
|
'New York': '22°C',
|
||||||
|
'London': '15°C',
|
||||||
|
'Tokyo': '18°C',
|
||||||
|
}
|
||||||
|
return temperatures[city] ?? 'Unknown'
|
||||||
|
}
|
||||||
|
|
||||||
|
const tools = [
|
||||||
|
{
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'get_temperature',
|
||||||
|
description: 'Get the current temperature for a city',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['city'],
|
||||||
|
properties: {
|
||||||
|
city: { type: 'string', description: 'The name of the city' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages,
|
||||||
|
tools,
|
||||||
|
think: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
messages.push(response.message)
|
||||||
|
if (response.message.tool_calls?.length) {
|
||||||
|
// only recommended for models which only return a single tool call
|
||||||
|
const call = response.message.tool_calls[0]
|
||||||
|
const args = call.function.arguments as { city: string }
|
||||||
|
const result = getTemperature(args.city)
|
||||||
|
// add the tool result to the messages
|
||||||
|
messages.push({ role: 'tool', tool_name: call.function.name, content: result })
|
||||||
|
|
||||||
|
// generate the final response
|
||||||
|
const finalResponse = await ollama.chat({ model: 'qwen3', messages, tools, think: true })
|
||||||
|
console.log(finalResponse.message.content)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Parallel tool calling
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
Request multiple tool calls in parallel, then send all tool responses back to the model.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "qwen3",
|
||||||
|
"messages": [{"role": "user", "content": "What are the current weather conditions and temperature in New York and London?"}],
|
||||||
|
"stream": false,
|
||||||
|
"tools": [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_temperature",
|
||||||
|
"description": "Get the current temperature for a city",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["city"],
|
||||||
|
"properties": {
|
||||||
|
"city": {"type": "string", "description": "The name of the city"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_conditions",
|
||||||
|
"description": "Get the current weather conditions for a city",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["city"],
|
||||||
|
"properties": {
|
||||||
|
"city": {"type": "string", "description": "The name of the city"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate a response with multiple tool results**
|
||||||
|
```shell
|
||||||
|
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||||
|
"model": "qwen3",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "What are the current weather conditions and temperature in New York and London?"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"tool_calls": [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"index": 0,
|
||||||
|
"name": "get_temperature",
|
||||||
|
"arguments": {"city": "New York"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"index": 1,
|
||||||
|
"name": "get_conditions",
|
||||||
|
"arguments": {"city": "New York"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"index": 2,
|
||||||
|
"name": "get_temperature",
|
||||||
|
"arguments": {"city": "London"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"index": 3,
|
||||||
|
"name": "get_conditions",
|
||||||
|
"arguments": {"city": "London"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{"role": "tool", "tool_name": "get_temperature", "content": "22°C"},
|
||||||
|
{"role": "tool", "tool_name": "get_conditions", "content": "Partly cloudy"},
|
||||||
|
{"role": "tool", "tool_name": "get_temperature", "content": "15°C"},
|
||||||
|
{"role": "tool", "tool_name": "get_conditions", "content": "Rainy"}
|
||||||
|
],
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
def get_temperature(city: str) -> str:
|
||||||
|
"""Get the current temperature for a city
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city: The name of the city
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The current temperature for the city
|
||||||
|
"""
|
||||||
|
temperatures = {
|
||||||
|
"New York": "22°C",
|
||||||
|
"London": "15°C",
|
||||||
|
"Tokyo": "18°C"
|
||||||
|
}
|
||||||
|
return temperatures.get(city, "Unknown")
|
||||||
|
|
||||||
|
def get_conditions(city: str) -> str:
|
||||||
|
"""Get the current weather conditions for a city
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city: The name of the city
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The current weather conditions for the city
|
||||||
|
"""
|
||||||
|
conditions = {
|
||||||
|
"New York": "Partly cloudy",
|
||||||
|
"London": "Rainy",
|
||||||
|
"Tokyo": "Sunny"
|
||||||
|
}
|
||||||
|
return conditions.get(city, "Unknown")
|
||||||
|
|
||||||
|
|
||||||
|
messages = [{'role': 'user', 'content': 'What are the current weather conditions and temperature in New York and London?'}]
|
||||||
|
|
||||||
|
# The python client automatically parses functions as a tool schema so we can pass them directly
|
||||||
|
# Schemas can be passed directly in the tools list as well
|
||||||
|
response = chat(model='qwen3', messages=messages, tools=[get_temperature, get_conditions], think=True)
|
||||||
|
|
||||||
|
# add the assistant message to the messages
|
||||||
|
messages.append(response.message)
|
||||||
|
if response.message.tool_calls:
|
||||||
|
# process each tool call
|
||||||
|
for call in response.message.tool_calls:
|
||||||
|
# execute the appropriate tool
|
||||||
|
if call.function.name == 'get_temperature':
|
||||||
|
result = get_temperature(**call.function.arguments)
|
||||||
|
elif call.function.name == 'get_conditions':
|
||||||
|
result = get_conditions(**call.function.arguments)
|
||||||
|
else:
|
||||||
|
result = 'Unknown tool'
|
||||||
|
# add the tool result to the messages
|
||||||
|
messages.append({'role': 'tool', 'tool_name': call.function.name, 'content': str(result)})
|
||||||
|
|
||||||
|
# generate the final response
|
||||||
|
final_response = chat(model='qwen3', messages=messages, tools=[get_temperature, get_conditions], think=True)
|
||||||
|
print(final_response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```typescript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
function getTemperature(city: string): string {
|
||||||
|
const temperatures: { [key: string]: string } = {
|
||||||
|
"New York": "22°C",
|
||||||
|
"London": "15°C",
|
||||||
|
"Tokyo": "18°C"
|
||||||
|
}
|
||||||
|
return temperatures[city] || "Unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
function getConditions(city: string): string {
|
||||||
|
const conditions: { [key: string]: string } = {
|
||||||
|
"New York": "Partly cloudy",
|
||||||
|
"London": "Rainy",
|
||||||
|
"Tokyo": "Sunny"
|
||||||
|
}
|
||||||
|
return conditions[city] || "Unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
const tools = [
|
||||||
|
{
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'get_temperature',
|
||||||
|
description: 'Get the current temperature for a city',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['city'],
|
||||||
|
properties: {
|
||||||
|
city: { type: 'string', description: 'The name of the city' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'get_conditions',
|
||||||
|
description: 'Get the current weather conditions for a city',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['city'],
|
||||||
|
properties: {
|
||||||
|
city: { type: 'string', description: 'The name of the city' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
const messages = [{ role: 'user', content: 'What are the current weather conditions and temperature in New York and London?' }]
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages,
|
||||||
|
tools,
|
||||||
|
think: true
|
||||||
|
})
|
||||||
|
|
||||||
|
// add the assistant message to the messages
|
||||||
|
messages.push(response.message)
|
||||||
|
if (response.message.tool_calls) {
|
||||||
|
// process each tool call
|
||||||
|
for (const call of response.message.tool_calls) {
|
||||||
|
// execute the appropriate tool
|
||||||
|
let result: string
|
||||||
|
if (call.function.name === 'get_temperature') {
|
||||||
|
const args = call.function.arguments as { city: string }
|
||||||
|
result = getTemperature(args.city)
|
||||||
|
} else if (call.function.name === 'get_conditions') {
|
||||||
|
const args = call.function.arguments as { city: string }
|
||||||
|
result = getConditions(args.city)
|
||||||
|
} else {
|
||||||
|
result = 'Unknown tool'
|
||||||
|
}
|
||||||
|
// add the tool result to the messages
|
||||||
|
messages.push({ role: 'tool', tool_name: call.function.name, content: result })
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate the final response
|
||||||
|
const finalResponse = await ollama.chat({ model: 'qwen3', messages, tools, think: true })
|
||||||
|
console.log(finalResponse.message.content)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
|
## Multi-turn tool calling (Agent loop)
|
||||||
|
|
||||||
|
An agent loop allows the model to decide when to invoke tools and incorporate their results into its replies.
|
||||||
|
|
||||||
|
It may also help to tell the model that it is running in a loop and can make multiple tool calls.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat, ChatResponse
|
||||||
|
|
||||||
|
|
||||||
|
def add(a: int, b: int) -> int:
|
||||||
|
"""Add two numbers"""
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
a: The first number
|
||||||
|
b: The second number
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The sum of the two numbers
|
||||||
|
"""
|
||||||
|
return a + b
|
||||||
|
|
||||||
|
|
||||||
|
def multiply(a: int, b: int) -> int:
|
||||||
|
"""Multiply two numbers"""
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
a: The first number
|
||||||
|
b: The second number
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The product of the two numbers
|
||||||
|
"""
|
||||||
|
return a * b
|
||||||
|
|
||||||
|
|
||||||
|
available_functions = {
|
||||||
|
'add': add,
|
||||||
|
'multiply': multiply,
|
||||||
|
}
|
||||||
|
|
||||||
|
messages = [{'role': 'user', 'content': 'What is (11434+12341)*412?'}]
|
||||||
|
while True:
|
||||||
|
response: ChatResponse = chat(
|
||||||
|
model='qwen3',
|
||||||
|
messages=messages,
|
||||||
|
tools=[add, multiply],
|
||||||
|
think=True,
|
||||||
|
)
|
||||||
|
messages.append(response.message)
|
||||||
|
print("Thinking: ", response.message.thinking)
|
||||||
|
print("Content: ", response.message.content)
|
||||||
|
if response.message.tool_calls:
|
||||||
|
for tc in response.message.tool_calls:
|
||||||
|
if tc.function.name in available_functions:
|
||||||
|
print(f"Calling {tc.function.name} with arguments {tc.function.arguments}")
|
||||||
|
result = available_functions[tc.function.name](**tc.function.arguments)
|
||||||
|
print(f"Result: {result}")
|
||||||
|
# add the tool result to the messages
|
||||||
|
messages.append({'role': 'tool', 'tool_name': tc.function.name, 'content': str(result)})
|
||||||
|
else:
|
||||||
|
# end the loop when there are no more tool calls
|
||||||
|
break
|
||||||
|
# continue the loop with the updated messages
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```typescript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
type ToolName = 'add' | 'multiply'
|
||||||
|
|
||||||
|
function add(a: number, b: number): number {
|
||||||
|
return a + b
|
||||||
|
}
|
||||||
|
|
||||||
|
function multiply(a: number, b: number): number {
|
||||||
|
return a * b
|
||||||
|
}
|
||||||
|
|
||||||
|
const availableFunctions: Record<ToolName, (a: number, b: number) => number> = {
|
||||||
|
add,
|
||||||
|
multiply,
|
||||||
|
}
|
||||||
|
|
||||||
|
const tools = [
|
||||||
|
{
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'add',
|
||||||
|
description: 'Add two numbers',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['a', 'b'],
|
||||||
|
properties: {
|
||||||
|
a: { type: 'integer', description: 'The first number' },
|
||||||
|
b: { type: 'integer', description: 'The second number' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'multiply',
|
||||||
|
description: 'Multiply two numbers',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['a', 'b'],
|
||||||
|
properties: {
|
||||||
|
a: { type: 'integer', description: 'The first number' },
|
||||||
|
b: { type: 'integer', description: 'The second number' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
async function agentLoop() {
|
||||||
|
const messages = [{ role: 'user', content: 'What is (11434+12341)*412?' }]
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages,
|
||||||
|
tools,
|
||||||
|
think: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
messages.push(response.message)
|
||||||
|
console.log('Thinking:', response.message.thinking)
|
||||||
|
console.log('Content:', response.message.content)
|
||||||
|
|
||||||
|
const toolCalls = response.message.tool_calls ?? []
|
||||||
|
if (toolCalls.length) {
|
||||||
|
for (const call of toolCalls) {
|
||||||
|
const fn = availableFunctions[call.function.name as ToolName]
|
||||||
|
if (!fn) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
const args = call.function.arguments as { a: number; b: number }
|
||||||
|
console.log(`Calling ${call.function.name} with arguments`, args)
|
||||||
|
const result = fn(args.a, args.b)
|
||||||
|
console.log(`Result: ${result}`)
|
||||||
|
messages.push({ role: 'tool', tool_name: call.function.name, content: String(result) })
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
agentLoop().catch(console.error)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
|
## Tool calling with streaming
|
||||||
|
|
||||||
|
When streaming, gather every chunk of `thinking`, `content`, and `tool_calls`, then send those fields back together with any tool results in the follow-up request.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
|
||||||
|
def get_temperature(city: str) -> str:
|
||||||
|
"""Get the current temperature for a city
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city: The name of the city
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The current temperature for the city
|
||||||
|
"""
|
||||||
|
temperatures = {
|
||||||
|
'New York': '22°C',
|
||||||
|
'London': '15°C',
|
||||||
|
}
|
||||||
|
return temperatures.get(city, 'Unknown')
|
||||||
|
|
||||||
|
|
||||||
|
messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]
|
||||||
|
|
||||||
|
while True:
|
||||||
|
stream = chat(
|
||||||
|
model='qwen3',
|
||||||
|
messages=messages,
|
||||||
|
tools=[get_temperature],
|
||||||
|
stream=True,
|
||||||
|
think=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
thinking = ''
|
||||||
|
content = ''
|
||||||
|
tool_calls = []
|
||||||
|
|
||||||
|
done_thinking = False
|
||||||
|
# accumulate the partial fields
|
||||||
|
for chunk in stream:
|
||||||
|
if chunk.message.thinking:
|
||||||
|
thinking += chunk.message.thinking
|
||||||
|
print(chunk.message.thinking, end='', flush=True)
|
||||||
|
if chunk.message.content:
|
||||||
|
if not done_thinking:
|
||||||
|
done_thinking = True
|
||||||
|
print('\n')
|
||||||
|
content += chunk.message.content
|
||||||
|
print(chunk.message.content, end='', flush=True)
|
||||||
|
if chunk.message.tool_calls:
|
||||||
|
tool_calls.extend(chunk.message.tool_calls)
|
||||||
|
print(chunk.message.tool_calls)
|
||||||
|
|
||||||
|
# append accumulated fields to the messages
|
||||||
|
if thinking or content or tool_calls:
|
||||||
|
messages.append({'role': 'assistant', 'thinking': thinking, 'content': content, 'tool_calls': tool_calls})
|
||||||
|
|
||||||
|
if not tool_calls:
|
||||||
|
break
|
||||||
|
|
||||||
|
for call in tool_calls:
|
||||||
|
if call.function.name == 'get_temperature':
|
||||||
|
result = get_temperature(**call.function.arguments)
|
||||||
|
else:
|
||||||
|
result = 'Unknown tool'
|
||||||
|
messages.append({'role': 'tool', 'tool_name': call.function.name, 'content': result})
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```typescript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
function getTemperature(city: string): string {
|
||||||
|
const temperatures: Record<string, string> = {
|
||||||
|
'New York': '22°C',
|
||||||
|
'London': '15°C',
|
||||||
|
}
|
||||||
|
return temperatures[city] ?? 'Unknown'
|
||||||
|
}
|
||||||
|
|
||||||
|
const getTemperatureTool = {
|
||||||
|
type: 'function',
|
||||||
|
function: {
|
||||||
|
name: 'get_temperature',
|
||||||
|
description: 'Get the current temperature for a city',
|
||||||
|
parameters: {
|
||||||
|
type: 'object',
|
||||||
|
required: ['city'],
|
||||||
|
properties: {
|
||||||
|
city: { type: 'string', description: 'The name of the city' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
async function agentLoop() {
|
||||||
|
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const stream = await ollama.chat({
|
||||||
|
model: 'qwen3',
|
||||||
|
messages,
|
||||||
|
tools: [getTemperatureTool],
|
||||||
|
stream: true,
|
||||||
|
think: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
let thinking = ''
|
||||||
|
let content = ''
|
||||||
|
const toolCalls: any[] = []
|
||||||
|
let doneThinking = false
|
||||||
|
|
||||||
|
for await (const chunk of stream) {
|
||||||
|
if (chunk.message.thinking) {
|
||||||
|
thinking += chunk.message.thinking
|
||||||
|
process.stdout.write(chunk.message.thinking)
|
||||||
|
}
|
||||||
|
if (chunk.message.content) {
|
||||||
|
if (!doneThinking) {
|
||||||
|
doneThinking = true
|
||||||
|
process.stdout.write('\n')
|
||||||
|
}
|
||||||
|
content += chunk.message.content
|
||||||
|
process.stdout.write(chunk.message.content)
|
||||||
|
}
|
||||||
|
if (chunk.message.tool_calls?.length) {
|
||||||
|
toolCalls.push(...chunk.message.tool_calls)
|
||||||
|
console.log(chunk.message.tool_calls)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (thinking || content || toolCalls.length) {
|
||||||
|
messages.push({ role: 'assistant', thinking, content, tool_calls: toolCalls } as any)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!toolCalls.length) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const call of toolCalls) {
|
||||||
|
if (call.function.name === 'get_temperature') {
|
||||||
|
const args = call.function.arguments as { city: string }
|
||||||
|
const result = getTemperature(args.city)
|
||||||
|
messages.push({ role: 'tool', tool_name: call.function.name, content: result })
|
||||||
|
} else {
|
||||||
|
messages.push({ role: 'tool', tool_name: call.function.name, content: 'Unknown tool' })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
agentLoop().catch(console.error)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
This loop streams the assistant response, accumulates partial fields, passes them back together, and appends the tool results so the model can complete its answer.
|
||||||
|
|
||||||
|
|
||||||
|
## Using functions as tools with Ollama Python SDK
|
||||||
|
The Python SDK automatically parses functions into a tool schema, so they can be passed directly in the `tools` list.

JSON schemas can still be passed explicitly if needed.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
|
||||||
|
def get_temperature(city: str) -> str:
|
||||||
|
"""Get the current temperature for a city
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city: The name of the city
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The current temperature for the city
|
||||||
|
"""
|
||||||
|
temperatures = {
|
||||||
|
'New York': '22°C',
|
||||||
|
'London': '15°C',
|
||||||
|
}
|
||||||
|
return temperatures.get(city, 'Unknown')
|
||||||
|
|
||||||
|
available_functions = {
|
||||||
|
'get_temperature': get_temperature,
|
||||||
|
}
|
||||||
|
messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]

# directly pass the functions as part of the tools list
response = chat(model='qwen3', messages=messages, tools=list(available_functions.values()), think=True)
|
||||||
|
```
|
||||||
85
docs/capabilities/vision.mdx
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
---
|
||||||
|
title: Vision
|
||||||
|
---
|
||||||
|
|
||||||
|
Vision models accept images alongside text so the model can describe, classify, and answer questions about what it sees.
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```shell
|
||||||
|
ollama run gemma3 "./image.png what's in this image?"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Usage with Ollama's API
|
||||||
|
Provide an `images` array. SDKs accept file paths, URLs, or raw bytes, while the REST API expects base64-encoded image data.
|
||||||
|
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="cURL">
|
||||||
|
```shell
|
||||||
|
# 1. Download a sample image
|
||||||
|
curl -L -o test.jpg "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"
|
||||||
|
|
||||||
|
# 2. Encode the image
|
||||||
|
IMG=$(base64 < test.jpg | tr -d '\n')
|
||||||
|
|
||||||
|
# 3. Send it to Ollama
|
||||||
|
curl -X POST http://localhost:11434/api/chat \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "gemma3",
|
||||||
|
"messages": [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "What is in this image?",
|
||||||
|
"images": ["'"$IMG"'"]
|
||||||
|
}],
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
"
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
```python
|
||||||
|
from ollama import chat
|
||||||
|
# from pathlib import Path
|
||||||
|
|
||||||
|
# Pass in the path to the image
|
||||||
|
path = input('Please enter the path to the image: ')
|
||||||
|
|
||||||
|
# You can also pass in base64 encoded image data
|
||||||
|
# img = base64.b64encode(Path(path).read_bytes()).decode()
|
||||||
|
# or the raw bytes
|
||||||
|
# img = Path(path).read_bytes()
|
||||||
|
|
||||||
|
response = chat(
|
||||||
|
model='gemma3',
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
'role': 'user',
|
||||||
|
'content': 'What is in this image? Be concise.',
|
||||||
|
'images': [path],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
print(response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
```javascript
|
||||||
|
import ollama from 'ollama'
|
||||||
|
|
||||||
|
const imagePath = '/absolute/path/to/image.jpg'
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: 'gemma3',
|
||||||
|
messages: [
|
||||||
|
{ role: 'user', content: 'What is in this image?', images: [imagePath] }
|
||||||
|
],
|
||||||
|
stream: false,
|
||||||
|
})
|
||||||
|
|
||||||
|
console.log(response.message.content)
|
||||||
|
```
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
360
docs/capabilities/web-search.mdx
Normal file
@@ -0,0 +1,360 @@
|
|||||||
|
---
|
||||||
|
title: Web search
|
||||||
|
---
|
||||||
|
|
||||||
|
Ollama's web search API can be used to augment models with the latest information to reduce hallucinations and improve accuracy.
|
||||||
|
|
||||||
|
Web search is provided as a REST API with deeper tool integrations in the Python and JavaScript libraries. This also enables models such as OpenAI's gpt-oss to conduct long-running research tasks.
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
For access to Ollama's web search API, create an [API key](https://ollama.com/settings/keys). A free Ollama account is required.
|
||||||
|
|
||||||
|
## Web search API
|
||||||
|
|
||||||
|
Performs a web search for a single query and returns relevant results.
|
||||||
|
|
||||||
|
### Request
|
||||||
|
|
||||||
|
`POST https://ollama.com/api/web_search`
|
||||||
|
|
||||||
|
- `query` (string, required): the search query string
|
||||||
|
- `max_results` (integer, optional): maximum results to return (default 5, max 10)
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
|
Returns an object containing:
|
||||||
|
|
||||||
|
- `results` (array): array of search result objects, each containing:
|
||||||
|
- `title` (string): the title of the web page
|
||||||
|
- `url` (string): the URL of the web page
|
||||||
|
- `content` (string): relevant content snippet from the web page
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
Ensure `OLLAMA_API_KEY` is set in the environment, or pass the key directly in the `Authorization` header.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
#### cURL Request
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://ollama.com/api/web_search \
|
||||||
|
--header "Authorization: Bearer $OLLAMA_API_KEY" \
|
||||||
|
-d '{
|
||||||
|
"query":"what is ollama?"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"results": [
|
||||||
|
{
|
||||||
|
"title": "Ollama",
|
||||||
|
"url": "https://ollama.com/",
|
||||||
|
"content": "Cloud models are now available..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "What is Ollama? Introduction to the AI model management tool",
|
||||||
|
"url": "https://www.hostinger.com/tutorials/what-is-ollama",
|
||||||
|
"content": "Ariffud M. 6min Read..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Ollama Explained: Transforming AI Accessibility and Language ...",
|
||||||
|
"url": "https://www.geeksforgeeks.org/artificial-intelligence/ollama-explained-transforming-ai-accessibility-and-language-processing/",
|
||||||
|
"content": "Data Science Data Science Projects Data Analysis..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Python library
|
||||||
|
|
||||||
|
```python
|
||||||
|
import ollama
|
||||||
|
response = ollama.web_search("What is Ollama?")
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example output**
|
||||||
|
|
||||||
|
```python
|
||||||
|
|
||||||
|
results = [
|
||||||
|
{
|
||||||
|
"title": "Ollama",
|
||||||
|
"url": "https://ollama.com/",
|
||||||
|
"content": "Cloud models are now available in Ollama..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "What is Ollama? Features, Pricing, and Use Cases - Walturn",
|
||||||
|
"url": "https://www.walturn.com/insights/what-is-ollama-features-pricing-and-use-cases",
|
||||||
|
"content": "Our services..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Complete Ollama Guide: Installation, Usage & Code Examples",
|
||||||
|
"url": "https://collabnix.com/complete-ollama-guide-installation-usage-code-examples",
|
||||||
|
"content": "Join our Discord Server..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
See the complete Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/examples/web-search.py).
|
||||||
|
|
||||||
|
#### JavaScript Library
|
||||||
|
|
||||||
|
```tsx
|
||||||
|
import { Ollama } from "ollama";
|
||||||
|
|
||||||
|
const client = new Ollama();
|
||||||
|
const results = await client.webSearch({ query: "what is ollama?" });
|
||||||
|
console.log(JSON.stringify(results, null, 2));
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example output**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"results": [
|
||||||
|
{
|
||||||
|
"title": "Ollama",
|
||||||
|
"url": "https://ollama.com/",
|
||||||
|
"content": "Cloud models are now available..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "What is Ollama? Introduction to the AI model management tool",
|
||||||
|
"url": "https://www.hostinger.com/tutorials/what-is-ollama",
|
||||||
|
"content": "Ollama is an open-source tool..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Ollama Explained: Transforming AI Accessibility and Language Processing",
|
||||||
|
"url": "https://www.geeksforgeeks.org/artificial-intelligence/ollama-explained-transforming-ai-accessibility-and-language-processing/",
|
||||||
|
"content": "Ollama is a groundbreaking..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
See the complete Ollama [JavaScript example](https://github.com/ollama/ollama-js/blob/main/examples/websearch/websearch-tools.ts).
|
||||||
|
|
||||||
|
## Web fetch API
|
||||||
|
|
||||||
|
Fetches a single web page by URL and returns its content.
|
||||||
|
|
||||||
|
### Request
|
||||||
|
|
||||||
|
`POST https://ollama.com/api/web_fetch`
|
||||||
|
|
||||||
|
- `url` (string, required): the URL to fetch
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
|
Returns an object containing:
|
||||||
|
|
||||||
|
- `title` (string): the title of the web page
|
||||||
|
- `content` (string): the main content of the web page
|
||||||
|
- `links` (array): array of links found on the page
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
#### cURL Request
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --request POST \
|
||||||
|
--url https://ollama.com/api/web_fetch \
|
||||||
|
--header "Authorization: Bearer $OLLAMA_API_KEY" \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data '{
|
||||||
|
"url": "ollama.com"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
  "title": "Ollama",
  "content": "[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama...",
  "links": [
    "http://ollama.com/",
    "http://ollama.com/models",
    "https://github.com/ollama/ollama"
  ]
}
```
|
||||||
|
|
||||||
|
#### Python SDK
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import web_fetch
|
||||||
|
|
||||||
|
result = web_fetch('https://ollama.com')
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result**
|
||||||
|
|
||||||
|
```python
|
||||||
|
WebFetchResponse(
|
||||||
|
title='Ollama',
|
||||||
|
content='[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama\n\n**Chat & build
|
||||||
|
with open models**\n\n[Download](https://ollama.com/download) [Explore
|
||||||
|
models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
|
||||||
|
links=['https://ollama.com/', 'https://ollama.com/models', 'https://github.com/ollama/ollama']
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### JavaScript SDK
|
||||||
|
|
||||||
|
```tsx
|
||||||
|
import { Ollama } from "ollama";
|
||||||
|
|
||||||
|
const client = new Ollama();
|
||||||
|
const fetchResult = await client.webFetch({ url: "https://ollama.com" });
|
||||||
|
console.log(JSON.stringify(fetchResult, null, 2));
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"title": "Ollama",
|
||||||
|
"content": "[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama...",
|
||||||
|
"links": [
|
||||||
|
"https://ollama.com/",
|
||||||
|
"https://ollama.com/models",
|
||||||
|
"https://github.com/ollama/ollama"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building a search agent
|
||||||
|
|
||||||
|
Use Ollama’s web search API as a tool to build a mini search agent.
|
||||||
|
|
||||||
|
This example uses Alibaba’s Qwen 3 model with 4B parameters.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama pull qwen3:4b
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import chat, web_fetch, web_search
|
||||||
|
|
||||||
|
available_tools = {'web_search': web_search, 'web_fetch': web_fetch}
|
||||||
|
|
||||||
|
messages = [{'role': 'user', 'content': "what is ollama's new engine"}]
|
||||||
|
|
||||||
|
while True:
|
||||||
|
response = chat(
|
||||||
|
model='qwen3:4b',
|
||||||
|
messages=messages,
|
||||||
|
tools=[web_search, web_fetch],
|
||||||
|
think=True
|
||||||
|
)
|
||||||
|
if response.message.thinking:
|
||||||
|
print('Thinking: ', response.message.thinking)
|
||||||
|
if response.message.content:
|
||||||
|
print('Content: ', response.message.content)
|
||||||
|
messages.append(response.message)
|
||||||
|
if response.message.tool_calls:
|
||||||
|
print('Tool calls: ', response.message.tool_calls)
|
||||||
|
for tool_call in response.message.tool_calls:
|
||||||
|
function_to_call = available_tools.get(tool_call.function.name)
|
||||||
|
if function_to_call:
|
||||||
|
args = tool_call.function.arguments
|
||||||
|
result = function_to_call(**args)
|
||||||
|
print('Result: ', str(result)[:200]+'...')
|
||||||
|
# Result is truncated for limited context lengths
|
||||||
|
messages.append({'role': 'tool', 'content': str(result)[:2000 * 4], 'tool_name': tool_call.function.name})
|
||||||
|
else:
|
||||||
|
messages.append({'role': 'tool', 'content': f'Tool {tool_call.function.name} not found', 'tool_name': tool_call.function.name})
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result**
|
||||||
|
|
||||||
|
```
|
||||||
|
Thinking: Okay, the user is asking about Ollama's new engine. I need to figure out what they're referring to. Ollama is a company that develops large language models, so maybe they've released a new model or an updated version of their existing engine....
|
||||||
|
|
||||||
|
Tool calls: [ToolCall(function=Function(name='web_search', arguments={'max_results': 3, 'query': 'Ollama new engine'}))]
|
||||||
|
Result: results=[WebSearchResult(content='# New model scheduling\n\n## September 23, 2025\n\nOllama now includes a significantly improved model scheduling system. Ahead of running a model, Ollama’s new engine
|
||||||
|
|
||||||
|
Thinking: Okay, the user asked about Ollama's new engine. Let me look at the search results.
|
||||||
|
|
||||||
|
First result is from September 23, 2025, talking about new model scheduling. It mentions improved memory management, reduced crashes, better GPU utilization, and multi-GPU performance. Examples show speed improvements and accurate memory reporting. Supported models include gemma3, llama4, qwen3, etc...
|
||||||
|
|
||||||
|
Content: Ollama has introduced two key updates to its engine, both released in 2025:
|
||||||
|
|
||||||
|
1. **Enhanced Model Scheduling (September 23, 2025)**
|
||||||
|
- **Precision Memory Management**: Exact memory allocation reduces out-of-memory crashes and optimizes GPU utilization.
|
||||||
|
- **Performance Gains**: Examples show significant speed improvements (e.g., 85.54 tokens/s vs 52.02 tokens/s) and full GPU layer utilization.
|
||||||
|
- **Multi-GPU Support**: Improved efficiency across multiple GPUs, with accurate memory reporting via tools like `nvidia-smi`.
|
||||||
|
- **Supported Models**: Includes `gemma3`, `llama4`, `qwen3`, `mistral-small3.2`, and more.
|
||||||
|
|
||||||
|
2. **Multimodal Engine (May 15, 2025)**
|
||||||
|
- **Vision Support**: First-class support for vision models, including `llama4:scout` (109B parameters), `gemma3`, `qwen2.5vl`, and `mistral-small3.1`.
|
||||||
|
- **Multimodal Tasks**: Examples include identifying animals in multiple images, answering location-based questions from videos, and document scanning.
|
||||||
|
|
||||||
|
These updates highlight Ollama's focus on efficiency, performance, and expanded capabilities for both text and vision tasks.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Context length and agents
|
||||||
|
|
||||||
|
Web search results can return thousands of tokens. It is recommended to increase the context length of the model to at least ~32000 tokens. Search agents work best with full context length. [Ollama's cloud models](https://docs.ollama.com/cloud) run at the full context length.
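For example, a minimal sketch of requesting a larger context window per call with the Python library's `options` parameter (`qwen3:4b` and 32000 are illustrative values and assume enough free memory):

```python
from ollama import chat

# Request a 32k-token context window for this chat call.
response = chat(
    model='qwen3:4b',
    messages=[{'role': 'user', 'content': "what is ollama's new engine"}],
    options={'num_ctx': 32000},  # illustrative value; needs enough free memory
)
print(response.message.content)
```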
|
||||||
|
|
||||||
|
## MCP Server
|
||||||
|
|
||||||
|
You can enable web search in any MCP client through the [Python MCP server](https://github.com/ollama/ollama-python/blob/main/examples/web-search-mcp.py).
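For example, assuming `uv` is installed and the example script has been saved locally (the path below is a placeholder), the server can be started on its own to confirm the API key is picked up:

```shell
export OLLAMA_API_KEY=your_api_key_here
uv run path/to/web-search-mcp.py
```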
|
||||||
|
|
||||||
|
### Cline
|
||||||
|
|
||||||
|
Ollama's web search can be easily integrated with Cline using the MCP server configuration.
|
||||||
|
|
||||||
|
`Manage MCP Servers` > `Configure MCP Servers` > Add the following configuration:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"web_search_and_fetch": {
|
||||||
|
"type": "stdio",
|
||||||
|
"command": "uv",
|
||||||
|
"args": ["run", "path/to/web-search-mcp.py"],
|
||||||
|
"env": { "OLLAMA_API_KEY": "your_api_key_here" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### Codex
|
||||||
|
|
||||||
|
Ollama works well with OpenAI's Codex tool.
|
||||||
|
|
||||||
|
Add the following configuration to `~/.codex/config.toml`:

```toml
[mcp_servers.web_search]
command = "uv"
args = ["run", "path/to/web-search-mcp.py"]
env = { "OLLAMA_API_KEY" = "your_api_key_here" }
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### Goose
|
||||||
|
|
||||||
|
Ollama can integrate with Goose via its MCP feature.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### Other integrations
|
||||||
|
|
||||||
|
Ollama can be integrated with most tools through its native API, the Python and JavaScript libraries, the OpenAI-compatible API, or the MCP server.
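As a hedged illustration of the OpenAI-compatible route (assuming the `openai` Python package and a local Ollama server on the default port), a client only needs to point at Ollama's `/v1` endpoint:

```python
from openai import OpenAI

# Ollama exposes an OpenAI-compatible API at /v1 on the local server.
# The api_key value is required by the client but is not checked by Ollama.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

response = client.chat.completions.create(
    model="qwen3:4b",
    messages=[{"role": "user", "content": "what is ollama?"}],
)
print(response.choices[0].message.content)
```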
|
||||||
91
docs/cli.mdx
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
---
|
||||||
|
title: CLI Reference
|
||||||
|
---
|
||||||
|
|
||||||
|
### Run a model
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama run gemma3
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Multiline input
|
||||||
|
|
||||||
|
For multiline input, you can wrap text with `"""`:
|
||||||
|
|
||||||
|
```
|
||||||
|
>>> """Hello,
|
||||||
|
... world!
|
||||||
|
... """
|
||||||
|
I'm a basic program that prints the famous "Hello, world!" message to the console.
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Multimodal models
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama run gemma3 "What's in this image? /Users/jmorgan/Desktop/smile.png"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Download a model
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama pull gemma3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Remove a model
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama rm gemma3
|
||||||
|
```
|
||||||
|
|
||||||
|
### List models
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama ls
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sign in to Ollama
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama signin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sign out of Ollama
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama signout
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create a customized model
|
||||||
|
|
||||||
|
First, create a `Modelfile`
|
||||||
|
|
||||||
|
```
|
||||||
|
FROM gemma3
|
||||||
|
SYSTEM """You are a happy cat."""
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run `ollama create`:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama create happy-cat -f Modelfile
|
||||||
|
```
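You can then chat with the customized model by name (`happy-cat` here is just the example name used above):

```
ollama run happy-cat
```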
|
||||||
|
|
||||||
|
### List running models
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama ps
|
||||||
|
```
|
||||||
|
|
||||||
|
### Stop a running model
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama stop gemma3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Start Ollama
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama serve
|
||||||
|
```
|
||||||
|
|
||||||
|
To view a list of environment variables that can be set, run `ollama serve --help`.
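For example, to make the server listen on all network interfaces (one common use of the `OLLAMA_HOST` variable), you could run:

```
OLLAMA_HOST=0.0.0.0 ollama serve
```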
|
||||||
236
docs/cloud.mdx
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
---
|
||||||
|
title: Cloud
|
||||||
|
sidebarTitle: Cloud
|
||||||
|
---
|
||||||
|
|
||||||
|
<Info>Ollama's cloud is currently in preview.</Info>
|
||||||
|
|
||||||
|
## Cloud Models
|
||||||
|
|
||||||
|
Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models. This makes it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.
|
||||||
|
|
||||||
|
Ollama currently supports the following cloud models, with more coming soon:
|
||||||
|
|
||||||
|
- `deepseek-v3.1:671b-cloud`
|
||||||
|
- `gpt-oss:20b-cloud`
|
||||||
|
- `gpt-oss:120b-cloud`
|
||||||
|
- `kimi-k2:1t-cloud`
|
||||||
|
- `qwen3-coder:480b-cloud`
|
||||||
|
- `glm-4.6:cloud`
|
||||||
|
- `minimax-m2:cloud`
|
||||||
|
|
||||||
|
### Running Cloud models
|
||||||
|
|
||||||
|
Ollama's cloud models require an account on [ollama.com](https://ollama.com). To sign in or create an account, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama signin
|
||||||
|
```
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="CLI">
|
||||||
|
|
||||||
|
To run a cloud model, open the terminal and run:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama run gpt-oss:120b-cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="Python">
|
||||||
|
|
||||||
|
First, pull a cloud model so it can be accessed:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama pull gpt-oss:120b-cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, install [Ollama's Python library](https://github.com/ollama/ollama-python):
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, create and run a simple Python script:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from ollama import Client
|
||||||
|
|
||||||
|
client = Client()
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
'role': 'user',
|
||||||
|
'content': 'Why is the sky blue?',
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
for part in client.chat('gpt-oss:120b-cloud', messages=messages, stream=True):
|
||||||
|
print(part['message']['content'], end='', flush=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
|
||||||
|
First, pull a cloud model so it can be accessed:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama pull gpt-oss:120b-cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, install [Ollama's JavaScript library](https://github.com/ollama/ollama-js):
|
||||||
|
|
||||||
|
```
|
||||||
|
npm i ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
Then use the library to run a cloud model:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { Ollama } from "ollama";
|
||||||
|
|
||||||
|
const ollama = new Ollama();
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: "gpt-oss:120b-cloud",
|
||||||
|
messages: [{ role: "user", content: "Explain quantum computing" }],
|
||||||
|
stream: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
for await (const part of response) {
|
||||||
|
process.stdout.write(part.message.content);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="cURL">
|
||||||
|
|
||||||
|
First, pull a cloud model so it can be accessed:
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama pull gpt-oss:120b-cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
Run the following cURL command to chat with the model via Ollama's API:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:11434/api/chat -d '{
|
||||||
|
"model": "gpt-oss:120b-cloud",
|
||||||
|
"messages": [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Why is the sky blue?"
|
||||||
|
}],
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Cloud API access
|
||||||
|
|
||||||
|
Cloud models can also be accessed directly via ollama.com's API. In this mode, ollama.com acts as a remote Ollama host.
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
|
||||||
|
For direct access to ollama.com's API, first create an [API key](https://ollama.com/settings/keys).
|
||||||
|
|
||||||
|
Then, set the `OLLAMA_API_KEY` environment variable to your API key.
|
||||||
|
|
||||||
|
```
|
||||||
|
export OLLAMA_API_KEY=your_api_key
|
||||||
|
```
|
||||||
|
|
||||||
|
### Listing models
|
||||||
|
|
||||||
|
Models available directly via ollama.com's API can be listed with:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl https://ollama.com/api/tags
|
||||||
|
```
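The same listing can be done from the Python library; a sketch assuming the `Client.list()` method and the `OLLAMA_API_KEY` variable set in the Authentication step above:

```python
import os
from ollama import Client

client = Client(
    host="https://ollama.com",
    headers={'Authorization': 'Bearer ' + os.environ['OLLAMA_API_KEY']},
)

# Print the name of each model exposed by ollama.com's API
for model in client.list().models:
    print(model.model)
```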
|
||||||
|
|
||||||
|
### Generating a response
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<Tab title="Python">
|
||||||
|
|
||||||
|
First, install [Ollama's Python library](https://github.com/ollama/ollama-python):
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
Then make a request:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from ollama import Client
|
||||||
|
|
||||||
|
client = Client(
|
||||||
|
host="https://ollama.com",
|
||||||
|
headers={'Authorization': 'Bearer ' + os.environ.get('OLLAMA_API_KEY')}
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
'role': 'user',
|
||||||
|
'content': 'Why is the sky blue?',
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
|
||||||
|
print(part['message']['content'], end='', flush=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="JavaScript">
|
||||||
|
|
||||||
|
First, install [Ollama's JavaScript library](https://github.com/ollama/ollama-js):
|
||||||
|
|
||||||
|
```
|
||||||
|
npm i ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, make a request to the model:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { Ollama } from "ollama";
|
||||||
|
|
||||||
|
const ollama = new Ollama({
|
||||||
|
host: "https://ollama.com",
|
||||||
|
headers: {
|
||||||
|
Authorization: "Bearer " + process.env.OLLAMA_API_KEY,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const response = await ollama.chat({
|
||||||
|
model: "gpt-oss:120b",
|
||||||
|
messages: [{ role: "user", content: "Explain quantum computing" }],
|
||||||
|
stream: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
for await (const part of response) {
|
||||||
|
process.stdout.write(part.message.content);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
<Tab title="cURL">
|
||||||
|
|
||||||
|
Generate a response via Ollama's chat API:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl https://ollama.com/api/chat \
|
||||||
|
-H "Authorization: Bearer $OLLAMA_API_KEY" \
|
||||||
|
-d '{
|
||||||
|
"model": "gpt-oss:120b",
|
||||||
|
"messages": [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Why is the sky blue?"
|
||||||
|
}],
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</Tab>
|
||||||
|
</Tabs>
|
||||||
38
docs/context-length.mdx
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
---
|
||||||
|
title: Context length
|
||||||
|
---
|
||||||
|
|
||||||
|
Context length is the maximum number of tokens that the model has access to in memory.
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
The default context length in Ollama is 4096 tokens.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
Tasks that require a large context, such as web search, agents, and coding tools, should use a context length of at least 32000 tokens.
|
||||||
|
|
||||||
|
## Setting context length
|
||||||
|
|
||||||
|
Setting a larger context length will increase the amount of memory required to run a model. Ensure you have enough VRAM available to increase the context length.
|
||||||
|
|
||||||
|
Cloud models are set to their maximum context length by default.
|
||||||
|
|
||||||
|
### App
|
||||||
|
|
||||||
|
In the Ollama app, adjust the context length slider under Settings to your desired value.
|
||||||
|

|
||||||
|
|
||||||
|
### CLI
|
||||||
|
If the context length cannot be changed in the Ollama app, it can also be set when starting the Ollama server:
|
||||||
|
```
|
||||||
|
OLLAMA_CONTEXT_LENGTH=32000 ollama serve
|
||||||
|
```
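The context length can also be set per request through the API's `options.num_ctx` field; a minimal sketch (`gemma3` and 32000 are illustrative values):

```
curl http://localhost:11434/api/chat -d '{
  "model": "gemma3",
  "messages": [{ "role": "user", "content": "Why is the sky blue?" }],
  "options": { "num_ctx": 32000 },
  "stream": false
}'
```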
|
||||||
|
|
||||||
|
### Check allocated context length and model offloading
|
||||||
|
For best performance, use the maximum context length for a model, and avoid offloading the model to CPU. Verify the split under `PROCESSOR` using `ollama ps`.
|
||||||
|
```
|
||||||
|
ollama ps
|
||||||
|
```
|
||||||
|
```
|
||||||
|
NAME ID SIZE PROCESSOR CONTEXT UNTIL
|
||||||
|
gemma3:latest a2af6cc3eb7f 6.6 GB 100% GPU 65536 2 minutes from now
|
||||||
|
```
|
||||||
@@ -11,6 +11,10 @@ Then build and run Ollama from the root directory of the repository:
|
|||||||
go run . serve
|
go run . serve
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Ollama includes native code compiled with CGO. From time to time these data structures can change and CGO can get out of sync resulting in unexpected crashes. You can force a full build of the native code by running `go clean -cache` first.
|
||||||
|
|
||||||
|
|
||||||
## macOS (Apple Silicon)
|
## macOS (Apple Silicon)
|
||||||
|
|
||||||
macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
|
macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
|
||||||
|
|||||||
@@ -1,21 +1,21 @@
|
|||||||
# Ollama Docker image
|
## CPU only
|
||||||
|
|
||||||
### CPU only
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||||
```
|
```
|
||||||
|
|
||||||
### Nvidia GPU
|
## Nvidia GPU
|
||||||
|
|
||||||
Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
|
Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
|
||||||
|
|
||||||
#### Install with Apt
|
### Install with Apt
|
||||||
|
|
||||||
1. Configure the repository
|
1. Configure the repository
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
|
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
|
||||||
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
|
curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
|
||||||
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
|
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
|
||||||
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
@@ -27,11 +27,12 @@ Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-
|
|||||||
sudo apt-get install -y nvidia-container-toolkit
|
sudo apt-get install -y nvidia-container-toolkit
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Install with Yum or Dnf
|
### Install with Yum or Dnf
|
||||||
|
|
||||||
1. Configure the repository
|
1. Configure the repository
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
|
curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
|
||||||
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
|
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -41,23 +42,25 @@ Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-
|
|||||||
sudo yum install -y nvidia-container-toolkit
|
sudo yum install -y nvidia-container-toolkit
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Configure Docker to use Nvidia driver
|
### Configure Docker to use Nvidia driver
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
sudo nvidia-ctk runtime configure --runtime=docker
|
sudo nvidia-ctk runtime configure --runtime=docker
|
||||||
sudo systemctl restart docker
|
sudo systemctl restart docker
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Start the container
|
### Start the container
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!NOTE]
|
<Note>
|
||||||
> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6.
|
If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version.
|
||||||
|
Pass the environment variable `JETSON_JETPACK=5` or `JETSON_JETPACK=6` to the container to select version 5 or 6.
|
||||||
|
</Note>
|
||||||
|
|
||||||
### AMD GPU
|
## AMD GPU
|
||||||
|
|
||||||
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
|
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
|
||||||
|
|
||||||
@@ -65,7 +68,7 @@ To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following c
|
|||||||
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
|
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
|
||||||
```
|
```
|
||||||
|
|
||||||
### Run model locally
|
## Run model locally
|
||||||
|
|
||||||
Now you can run a model:
|
Now you can run a model:
|
||||||
|
|
||||||
@@ -73,6 +76,6 @@ Now you can run a model:
|
|||||||
docker exec -it ollama ollama run llama3.2
|
docker exec -it ollama ollama run llama3.2
|
||||||
```
|
```
|
||||||
|
|
||||||
### Try different models
|
## Try different models
|
||||||
|
|
||||||
More models can be found on the [Ollama library](https://ollama.com/library).
|
More models can be found on the [Ollama library](https://ollama.com/library).
|
||||||
162
docs/docs.json
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
{
|
||||||
|
"$schema": "https://mintlify.com/docs.json",
|
||||||
|
"name": "Ollama",
|
||||||
|
"colors": {
|
||||||
|
"primary": "#000",
|
||||||
|
"light": "#b5b5b5",
|
||||||
|
"dark": "#000"
|
||||||
|
},
|
||||||
|
"favicon": "/images/favicon.png",
|
||||||
|
"logo": {
|
||||||
|
"light": "/images/logo.png",
|
||||||
|
"dark": "/images/logo-dark.png",
|
||||||
|
"href": "https://ollama.com"
|
||||||
|
},
|
||||||
|
"theme": "maple",
|
||||||
|
"background": {
|
||||||
|
"color": {
|
||||||
|
"light": "#ffffff",
|
||||||
|
"dark": "#000000"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fonts": {
|
||||||
|
"family": "system-ui",
|
||||||
|
"heading": {
|
||||||
|
"family": "system-ui"
|
||||||
|
},
|
||||||
|
"body": {
|
||||||
|
"family": "system-ui"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"styling": {
|
||||||
|
"codeblocks": "system"
|
||||||
|
},
|
||||||
|
"contextual": {
|
||||||
|
"options": ["copy"]
|
||||||
|
},
|
||||||
|
"navbar": {
|
||||||
|
"links": [
|
||||||
|
{
|
||||||
|
"label": "Sign in",
|
||||||
|
"href": "https://ollama.com/signin"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primary": {
|
||||||
|
"type": "button",
|
||||||
|
"label": "Download",
|
||||||
|
"href": "https://ollama.com/download"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"api": {
|
||||||
|
"playground": {
|
||||||
|
"display": "simple"
|
||||||
|
},
|
||||||
|
"examples": {
|
||||||
|
"languages": ["curl"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"redirects": [
|
||||||
|
{
|
||||||
|
"source": "/openai",
|
||||||
|
"destination": "/api/openai-compatibility"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "/api/openai",
|
||||||
|
"destination": "/api/openai-compatibility"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"navigation": {
|
||||||
|
"tabs": [
|
||||||
|
{
|
||||||
|
"tab": "Documentation",
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"group": "Get started",
|
||||||
|
"pages": [
|
||||||
|
"index",
|
||||||
|
"quickstart",
|
||||||
|
"/cloud"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"group": "Capabilities",
|
||||||
|
"pages": [
|
||||||
|
"/capabilities/streaming",
|
||||||
|
"/capabilities/thinking",
|
||||||
|
"/capabilities/structured-outputs",
|
||||||
|
"/capabilities/vision",
|
||||||
|
"/capabilities/embeddings",
|
||||||
|
"/capabilities/tool-calling",
|
||||||
|
"/capabilities/web-search"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"group": "Integrations",
|
||||||
|
"pages": [
|
||||||
|
"/integrations/vscode",
|
||||||
|
"/integrations/jetbrains",
|
||||||
|
"/integrations/codex",
|
||||||
|
"/integrations/cline",
|
||||||
|
"/integrations/droid",
|
||||||
|
"/integrations/goose",
|
||||||
|
"/integrations/zed",
|
||||||
|
"/integrations/roo-code",
|
||||||
|
"/integrations/n8n",
|
||||||
|
"/integrations/xcode"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"group": "More information",
|
||||||
|
"pages": [
|
||||||
|
"/cli",
|
||||||
|
"/modelfile",
|
||||||
|
"/context-length",
|
||||||
|
"/linux",
|
||||||
|
"/macos",
|
||||||
|
"/windows",
|
||||||
|
"/docker",
|
||||||
|
"/import",
|
||||||
|
"/faq",
|
||||||
|
"/gpu",
|
||||||
|
"/troubleshooting"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tab": "API Reference",
|
||||||
|
"openapi": "/openapi.yaml",
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"group": "API Reference",
|
||||||
|
"pages": [
|
||||||
|
"/api/index",
|
||||||
|
"/api/authentication",
|
||||||
|
"/api/streaming",
|
||||||
|
"/api/usage",
|
||||||
|
"/api/errors",
|
||||||
|
"/api/openai-compatibility"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"group": "Endpoints",
|
||||||
|
"pages": [
|
||||||
|
"POST /api/generate",
|
||||||
|
"POST /api/chat",
|
||||||
|
"POST /api/embed",
|
||||||
|
"GET /api/tags",
|
||||||
|
"GET /api/ps",
|
||||||
|
"POST /api/show",
|
||||||
|
"POST /api/create",
|
||||||
|
"POST /api/copy",
|
||||||
|
"POST /api/pull",
|
||||||
|
"POST /api/push",
|
||||||
|
"DELETE /api/delete",
|
||||||
|
"GET /api/version"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -12,9 +12,3 @@ Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/oll
|
|||||||
|
|
||||||
## OpenAI compatibility examples
|
## OpenAI compatibility examples
|
||||||
Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
|
Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
|
||||||
|
|
||||||
|
|
||||||
## Community examples
|
|
||||||
|
|
||||||
- [LangChain Ollama Python](https://python.langchain.com/docs/integrations/chat/ollama/)
|
|
||||||
- [LangChain Ollama JS](https://js.langchain.com/docs/integrations/chat/ollama/)
|
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
# FAQ
|
---
|
||||||
|
title: FAQ
|
||||||
|
---
|
||||||
|
|
||||||
## How can I upgrade Ollama?
|
## How can I upgrade Ollama?
|
||||||
|
|
||||||
@@ -20,9 +22,9 @@ Please refer to the [GPU docs](./gpu.md).
|
|||||||
|
|
||||||
## How can I specify the context window size?
|
## How can I specify the context window size?
|
||||||
|
|
||||||
By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
|
By default, Ollama uses a context window size of 2048 tokens.
|
||||||
|
|
||||||
This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
|
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
|
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
|
||||||
@@ -46,8 +48,6 @@ curl http://localhost:11434/api/generate -d '{
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Setting the context length higher may cause the model to not be able to fit onto the GPU which make the model run more slowly.
|
|
||||||
|
|
||||||
## How can I tell if my model was loaded onto the GPU?
|
## How can I tell if my model was loaded onto the GPU?
|
||||||
|
|
||||||
Use the `ollama ps` command to see what models are currently loaded into memory.
|
Use the `ollama ps` command to see what models are currently loaded into memory.
|
||||||
@@ -56,17 +56,16 @@ Use the `ollama ps` command to see what models are currently loaded into memory.
|
|||||||
ollama ps
|
ollama ps
|
||||||
```
|
```
|
||||||
|
|
||||||
> **Output**:
|
<Info>
|
||||||
>
|
**Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
|
||||||
> ```
|
100% GPU 4 minutes from now ```
|
||||||
> NAME ID SIZE PROCESSOR CONTEXT UNTIL
|
</Info>
|
||||||
> gpt-oss:20b 05afbac4bad6 16 GB 100% GPU 8192 4 minutes from now
|
|
||||||
> ```
|
|
||||||
|
|
||||||
The `Processor` column will show which memory the model was loaded in to:
|
The `Processor` column will show which memory the model was loaded in to:
|
||||||
* `100% GPU` means the model was loaded entirely into the GPU
|
|
||||||
* `100% CPU` means the model was loaded entirely in system memory
|
- `100% GPU` means the model was loaded entirely into the GPU
|
||||||
* `48%/52% CPU/GPU` means the model was loaded partially onto both the GPU and into system memory
|
- `100% CPU` means the model was loaded entirely in system memory
|
||||||
|
- `48%/52% CPU/GPU` means the model was loaded partially onto both the GPU and into system memory
|
||||||
|
|
||||||
## How do I configure Ollama server?
|
## How do I configure Ollama server?
|
||||||
|
|
||||||
@@ -126,8 +125,10 @@ On Windows, Ollama inherits your user and system environment variables.
|
|||||||
|
|
||||||
Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
|
Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
|
||||||
|
|
||||||
> [!NOTE]
|
<Note>
|
||||||
> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
|
Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only
|
||||||
|
HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
|
||||||
|
</Note>
|
||||||
|
|
||||||
### How do I use Ollama behind a proxy in Docker?
|
### How do I use Ollama behind a proxy in Docker?
|
||||||
|
|
||||||
@@ -150,11 +151,9 @@ docker build -t ollama-with-ca .
|
|||||||
docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
|
docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
|
||||||
```
|
```
|
||||||
|
|
||||||
## Does Ollama send my prompts and responses back to ollama.com?
|
## Does Ollama send my prompts and answers back to ollama.com?
|
||||||
|
|
||||||
If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
|
No. Ollama runs locally, and conversation data does not leave your machine.
|
||||||
|
|
||||||
If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
|
|
||||||
|
|
||||||
## How can I expose Ollama on my network?
|
## How can I expose Ollama on my network?
|
||||||
|
|
||||||
@@ -216,7 +215,9 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
|
|||||||
|
|
||||||
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
|
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
|
||||||
|
|
||||||
> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
|
<Note>
|
||||||
|
On Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
|
||||||
|
</Note>
|
||||||
|
|
||||||
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
|
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
|
||||||
|
|
||||||
@@ -235,7 +236,7 @@ GPU acceleration is not available for Docker Desktop in macOS due to the lack of
|
|||||||
This can impact both installing Ollama, as well as downloading models.
|
This can impact both installing Ollama, as well as downloading models.
|
||||||
|
|
||||||
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`.
|
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`.
|
||||||
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
|
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. _Disable_ both of these
|
||||||
properties.
|
properties.
|
||||||
|
|
||||||
## How can I preload a model into Ollama to get faster response times?
|
## How can I preload a model into Ollama to get faster response times?
|
||||||
@@ -269,10 +270,11 @@ ollama stop llama3.2
|
|||||||
```
|
```
|
||||||
|
|
||||||
If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
|
If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
|
||||||
* a duration string (such as "10m" or "24h")
|
|
||||||
* a number in seconds (such as 3600)
|
- a duration string (such as "10m" or "24h")
|
||||||
* any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
|
- a number in seconds (such as 3600)
|
||||||
* '0' which will unload the model immediately after generating a response
|
- any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
|
||||||
|
- '0' which will unload the model immediately after generating a response
|
||||||
|
|
||||||
For example, to preload a model and leave it in memory use:
|
For example, to preload a model and leave it in memory use:
|
||||||
|
|
||||||
@@ -296,7 +298,7 @@ If too many requests are sent to the server, it will respond with a 503 error in
|
|||||||
|
|
||||||
## How does Ollama handle concurrent requests?
|
## How does Ollama handle concurrent requests?
|
||||||
|
|
||||||
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
|
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
|
||||||
|
|
||||||
If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
|
If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
|
||||||
|
|
||||||
@@ -304,8 +306,8 @@ Parallel request processing for a given model results in increasing the context
|
|||||||
|
|
||||||
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
|
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
|
||||||
|
|
||||||
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference.
|
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference.
|
||||||
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default is 1, and will handle 1 request per model at a time.
|
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
|
||||||
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
|
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
|
||||||
|
|
||||||
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
|
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
|
||||||
@@ -326,7 +328,10 @@ To use quantized K/V cache with Ollama you can set the following environment var
|
|||||||
|
|
||||||
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
|
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
|
||||||
|
|
||||||
> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
|
<Note>
|
||||||
|
Currently this is a global option - meaning all models will run with the
|
||||||
|
specified quantization type.
|
||||||
|
</Note>
|
||||||
|
|
||||||
The currently available K/V cache quantization types are:
|
The currently available K/V cache quantization types are:
|
||||||
|
|
||||||
@@ -338,15 +343,36 @@ How much the cache quantization impacts the model's response quality will depend
|
|||||||
|
|
||||||
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
|
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
|
||||||
|
|
||||||
## How can I stop Ollama from starting when I login to my computer
|
## Where can I find my Ollama Public Key?
|
||||||
|
|
||||||
Ollama for Windows and macOS register as a login item during installation. You can disable this if you prefer not to have Ollama automatically start. Ollama will respect this setting across upgrades, unless you uninstall the application.
|
Your **Ollama Public Key** is the public part of the key pair that lets your local Ollama instance talk to [ollama.com](https://ollama.com).
|
||||||
|
|
||||||
**Windows**
|
You'll need it to:
|
||||||
- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
|
* Push models to Ollama
|
||||||
|
* Pull private models from Ollama to your machine
|
||||||
|
* Run models hosted in [Ollama Cloud](https://ollama.com/cloud)
|
||||||
|
|
||||||
**MacOS Monterey (v12)**
|
### How to Add the Key
|
||||||
- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
|
|
||||||
|
|
||||||
**MacOS Ventura (v13) and later**
|
* **Sign-in via the Settings page** in the **Mac** and **Windows App**
|
||||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
|
|
||||||
|
* **Sign‑in via CLI**
|
||||||
|
|
||||||
|
```shell
|
||||||
|
ollama signin
|
||||||
|
```
|
||||||
|
|
||||||
|
* **Manually copy & paste** the key on the **Ollama Keys** page:
|
||||||
|
[https://ollama.com/settings/keys](https://ollama.com/settings/keys)
|
||||||
|
|
||||||
|
### Where the Ollama Public Key lives
|
||||||
|
|
||||||
|
| OS | Path to `id_ed25519.pub` |
|
||||||
|
| :- | :- |
|
||||||
|
| macOS | `~/.ollama/id_ed25519.pub` |
|
||||||
|
| Linux | `/usr/share/ollama/.ollama/id_ed25519.pub` |
|
||||||
|
| Windows | `C:\Users\<username>\.ollama\id_ed25519.pub` |
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
Replace <username> with your actual Windows user name.
|
||||||
|
</Note>
|
||||||
3
docs/favicon-dark.svg
Normal file
|
After Width: | Height: | Size: 6.7 KiB |
3
docs/favicon.svg
Normal file
|
After Width: | Height: | Size: 6.5 KiB |
@@ -1,14 +1,16 @@
|
|||||||
# GPU
|
---
|
||||||
|
title: Hardware support
|
||||||
|
---
|
||||||
|
|
||||||
## Nvidia
|
## Nvidia
|
||||||
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
|
|
||||||
|
Ollama supports Nvidia GPUs with compute capability 5.0+.
|
||||||
|
|
||||||
Check your compute compatibility to see if your card is supported:
|
Check your compute compatibility to see if your card is supported:
|
||||||
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
|
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
| Compute Capability | Family | Cards |
|
| Compute Capability | Family | Cards |
|
||||||
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| 12.0 | GeForce RTX 50xx | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090` |
|
|
||||||
| | NVIDIA Professioal | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell` |
|
|
||||||
| 9.0 | NVIDIA | `H200` `H100` |
|
| 9.0 | NVIDIA | `H200` `H100` |
|
||||||
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
||||||
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
||||||
@@ -48,9 +50,11 @@ driver bug by reloading the NVIDIA UVM driver with `sudo rmmod nvidia_uvm &&
|
|||||||
sudo modprobe nvidia_uvm`
|
sudo modprobe nvidia_uvm`
|
||||||
|
|
||||||
## AMD Radeon
|
## AMD Radeon
|
||||||
|
|
||||||
Ollama supports the following AMD GPUs:
|
Ollama supports the following AMD GPUs:
|
||||||
|
|
||||||
### Linux Support
|
### Linux Support
|
||||||
|
|
||||||
| Family | Cards and accelerators |
|
| Family | Cards and accelerators |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
|
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
|
||||||
@@ -58,15 +62,16 @@ Ollama supports the following AMD GPUs:
|
|||||||
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
|
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
|
||||||
|
|
||||||
### Windows Support
|
### Windows Support
|
||||||
|
|
||||||
With ROCm v6.1, the following GPUs are supported on Windows.
|
With ROCm v6.1, the following GPUs are supported on Windows.
|
||||||
|
|
||||||
| Family | Cards and accelerators |
|
| Family | Cards and accelerators |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||||
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
|
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
|
||||||
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
|
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
|
||||||
|
|
||||||
|
|
||||||
### Overrides on Linux
|
### Overrides on Linux
|
||||||
|
|
||||||
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
|
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
|
||||||
some cases you can force the system to try to use a similar LLVM target that is
|
some cases you can force the system to try to use a similar LLVM target that is
|
||||||
close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
|
close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
|
||||||
@@ -118,4 +123,5 @@ accessing the AMD GPU devices. On the host system you can run
|
|||||||
`sudo setsebool container_use_devices=1` to allow containers to use devices.
|
`sudo setsebool container_use_devices=1` to allow containers to use devices.
|
||||||
|
|
||||||
### Metal (Apple GPUs)
|
### Metal (Apple GPUs)
|
||||||
|
|
||||||
Ollama supports GPU acceleration on Apple devices via the Metal API.
|
Ollama supports GPU acceleration on Apple devices via the Metal API.
|
||||||
BIN
docs/images/cline-mcp.png
Normal file
|
After Width: | Height: | Size: 556 KiB |
BIN
docs/images/cline-settings.png
Normal file
|
After Width: | Height: | Size: 76 KiB |
BIN
docs/images/codex-mcp.png
Normal file
|
After Width: | Height: | Size: 948 KiB |
BIN
docs/images/favicon.png
Normal file
|
After Width: | Height: | Size: 890 B |
BIN
docs/images/goose-cli.png
Normal file
|
After Width: | Height: | Size: 160 KiB |
BIN
docs/images/goose-mcp-1.png
Normal file
|
After Width: | Height: | Size: 877 KiB |
BIN
docs/images/goose-mcp-2.png
Normal file
|
After Width: | Height: | Size: 911 KiB |
BIN
docs/images/goose-settings.png
Normal file
|
After Width: | Height: | Size: 109 KiB |
BIN
docs/images/intellij-chat-sidebar.png
Normal file
|
After Width: | Height: | Size: 69 KiB |
BIN
docs/images/intellij-current-model.png
Normal file
|
After Width: | Height: | Size: 106 KiB |
BIN
docs/images/intellij-local-models.png
Normal file
|
After Width: | Height: | Size: 79 KiB |
BIN
docs/images/logo-dark.png
Normal file
|
After Width: | Height: | Size: 3.3 KiB |
BIN
docs/images/logo.png
Normal file
|
After Width: | Height: | Size: 2.7 KiB |
BIN
docs/images/n8n-chat-model.png
Normal file
|
After Width: | Height: | Size: 87 KiB |
BIN
docs/images/n8n-chat-node.png
Normal file
|
After Width: | Height: | Size: 70 KiB |
BIN
docs/images/n8n-credential-creation.png
Normal file
|
After Width: | Height: | Size: 43 KiB |
BIN
docs/images/n8n-models.png
Normal file
|
After Width: | Height: | Size: 130 KiB |
BIN
docs/images/n8n-ollama-form.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
docs/images/ollama-settings.png
Normal file
|
After Width: | Height: | Size: 3.6 MiB |
BIN
docs/images/vscode-model-options.png
Normal file
|
After Width: | Height: | Size: 77 KiB |
BIN
docs/images/vscode-models.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
docs/images/vscode-sidebar.png
Normal file
|
After Width: | Height: | Size: 25 KiB |
BIN
docs/images/welcome.png
Normal file
|
After Width: | Height: | Size: 233 KiB |
BIN
docs/images/xcode-chat-icon.png
Normal file
|
After Width: | Height: | Size: 186 KiB |