Mirror of https://github.com/likelovewant/ollama-for-amd.git
Synced 2025-12-22 14:53:56 +00:00

Compare commits: 171 commits (newest first, abbreviated SHAs)

c8d6d9a010, 5804cf1723, bf7ee0f4d4, 504a410f02, d05da29912, 72962c6e08, 7bd7b02712, 8f9ab5e14d,
7717bb6a84, 0ec2915ea7, c9a7541b9c, d81cfd7d6f, b330c830d3, d889c6fd07, 56b9af336a, fda0d3be52,
cd5c8f6471, fef257c5c5, d066d9b8e0, 5a00dc9fc9, c354e87809, 93ac3760cb, abed273de3, 034392624c,
ecab6f1cc5, 7d6900827d, 9246e6dd15, 735a0ca2e4, dddb72e084, 83a9b5271a, 4a8069f9c4, 84b84ce2db,
486ae433ae, bb6a086d63, 30c8f201cc, 06db1f2cf5, 06d4fba851, 108fb6c1d1, da915345d1, 8a027bc401,
5446903fbd, 56318fb365, fe91d7fff1, 608e87bf87, 48685c6ed0, 9565fa64a8, 6719097649, b05c9e83d9,
a60d9b89ce, bf612cd608, ef98e56122, 5f944baac7, 6fc9d22707, f27c00d8c5, c7c845ec52, cf48603943,
6e67be09b6, 0f5f060d2b, b3554778bd, bbe7b96ded, c18ff18b2c, 133770a548, f36ebfb478, 5b55379651,
93eb43d020, 369479cc30, 7d89e48f5c, 27bcce6d9f, 491fc312ae, 5e2653f9fe, f29b167e1a, 037a4d103e,
50c05d57e0, 35159de18a, 94fff5805f, 14d5093cd0, 9df5f0e8e4, ad3eb00bee, bfc2d61549, 741affdfd6,
8a7baa1bbf, 5f7b4a5e30, 1aad838707, a1cef4d0a5, c41f0b9e6c, 142cbb722d, 9468c6824a, 11018196e0,
56346ccfa3, 8e4e509fa4, 47c2b947a9, 5eb77bf976, e4d0a9c325, 7416ced70f, 9cfd2dd3e3, 8e6da3cbc5,
d9d50c43cc, 76feb6c569, 6c1c1ad6a9, 93ea9240ae, 413ae39f3c, 60e47573a6, d13c3daa0b, 1713eddcd0,
4e1c4f6e0b, 397cae7962, 1c70a00f71, eae3af6807, 3eb08377f8, ac80010db8, 47fa0839b9, 0f92b19bec,
69be940bf6, 9638c24c58, bb362caf88, 386af6c1a0, 0c819e167b, 7a1e1c1caf, 0b03b9c32f, 90ca84172c,
6bd8a4b0a1, 77903ab8b4, e22286c9e1, 107f695929, 4ecc70d3b4, f9e1f572c2, 3546bbd08c, beb49eef65,
5a28b9cf5f, a017cf2fea, 19e5a890f7, f91c9e3709, 2df6905ede, d8be22e47d, 652c273f0e, 88e7705079,
f9e31da946, 88bb9e3328, 3b19cdba2a, 927d98a6cd, f6c811b320, 4fe3a556fa, fc3b4cda89, d470ebe78b,
c7bcb00319, 74d45f0102, 9fddef3731, 885cf45087, 9352eeb752, 0ad0e738cd, 3442ca76a9, 4574e557ee,
bdc4308afb, d29cd4c2ed, a84c05cf91, e3d7f32af7, 3a75e74e34, 28832df4bd, 237dccba1e, b3f75fc812,
8200c371ae, 0a8d6ea86d, 8e1050f366, eda8a32a09, a0a40aa20c, 2697d7f5aa, 1f32276178, 4c4fe3f87f,
feedf49c71, 8b00a415ab, 2003d60159
@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .cache
 test_data
+llm/build
+llama/build
.github/workflows/release.yaml (vendored, 249 lines changed)

@@ -31,7 +31,7 @@ jobs:
           security set-keychain-settings -lut 3600 build.keychain
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - name: Build Darwin
         env:
@@ -87,7 +87,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - run: go get ./...
       - run: |
@@ -102,7 +102,8 @@ jobs:
         with:
           name: generate-windows-cpu
           path: |
-            llm/build/**/bin/*
+            build/**/*
+            build/**/*.a
            llm/build/**/*.a
            dist/windows-amd64/**

@@ -141,7 +142,7 @@ jobs:
          write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -176,7 +177,7 @@ jobs:
         with:
           name: generate-windows-rocm
           path: |
-            llm/build/**/bin/*
+            build/**/*
            dist/windows-amd64/**
       - uses: actions/upload-artifact@v4
         with:
@@ -187,6 +188,13 @@ jobs:
   generate-windows-cuda:
     environment: release
     runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
     env:
       KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
     steps:
@@ -218,13 +226,13 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
-      - name: 'Install CUDA'
+      - name: 'Install CUDA ${{ matrix.cuda.version }}'
         run: |
           $ErrorActionPreference = "Stop"
           write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
           write-host "Installing CUDA"
           Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
           write-host "Completed CUDA"
@@ -256,15 +264,16 @@ jobs:
           cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
       - uses: actions/upload-artifact@v4
         with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-${{ matrix.cuda.version }}
           path: |
-            llm/build/**/bin/*
+            build/**/*
            dist/windows-amd64/**
       - uses: actions/upload-artifact@v4
         with:
-          name: windows-cuda-deps
+          name: windows-cuda-deps-${{ matrix.cuda.version }}
           path: dist/deps/*


  # Import the prior generation steps and build the final windows assets
   build-windows:
     environment: release
@@ -306,7 +315,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - run: go get
       - uses: actions/download-artifact@v4
@@ -314,17 +323,23 @@ jobs:
           name: generate-windows-cpu
       - uses: actions/download-artifact@v4
         with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-11
       - uses: actions/download-artifact@v4
         with:
-          name: windows-cuda-deps
+          name: generate-windows-cuda-12
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-11
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-12
       - uses: actions/download-artifact@v4
         with:
           name: windows-rocm-deps
       - uses: actions/download-artifact@v4
         with:
           name: generate-windows-rocm
-      - run: dir llm/build
+      - run: dir build
       - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -345,9 +360,7 @@ jobs:
     environment: release
     runs-on: linux
     env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
-      BUILD_ARCH: amd64
-      PUSH: '1'
+      PLATFORM: linux/amd64
     steps:
       - uses: actions/checkout@v4
         with:
@@ -355,15 +368,8 @@ jobs:
       - name: Set Version
         shell: bash
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
       - run: |
           ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
       - uses: actions/upload-artifact@v4
         with:
           name: dist-linux-amd64
@@ -377,9 +383,7 @@ jobs:
     environment: release
     runs-on: linux-arm64
     env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
-      BUILD_ARCH: arm64
-      PUSH: '1'
+      PLATFORM: linux/arm64
     steps:
       - uses: actions/checkout@v4
         with:
@@ -408,14 +412,8 @@ jobs:
           sudo usermod -aG docker $USER
           sudo apt-get install acl
           sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
       - run: |
           ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
       - uses: actions/upload-artifact@v4
         with:
           name: dist-linux-arm64
@@ -423,6 +421,178 @@ jobs:
           dist/*linux*
           !dist/*-cov

+  # Container image build
+  build-container-image:
+    environment: release
+    strategy:
+      matrix:
+        runner:
+          - linux
+          - linux-arm64
+    runs-on: ${{ matrix.runner }}
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: 'Install Docker'
+        if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt-get update
+          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+          sudo usermod -aG docker $USER
+          sudo apt-get install acl
+          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          platforms: linux/${{ env.ARCH }}
+          build-args: |
+            GOFLAGS
+          outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
+      - name: Export digest
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: digests-${{ env.PLATFORM_PAIR }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+  merge:
+    environment: release
+    runs-on: linux
+    needs:
+      - build-container-image
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp/digests
+          pattern: digests-*
+          merge-multiple: true
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+            $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
+  build-container-image-rocm:
+    environment: release
+    runs-on: linux
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+      ARCH: amd64
+      PLATFORM_PAIR: linux-amd64
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          target: runtime-rocm
+          build-args: |
+            GOFLAGS
+          tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm
+          push: true
+
  # Aggregate all the assets and ship a release
   release:
     needs:
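
Taken together, the build-container-image and merge jobs above replace the old scripts/build_docker.sh step: each runner builds its architecture's image and pushes it by digest only, exports the digest as an artifact, and the merge job stitches the collected digests into a single tagged multi-arch manifest list. A rough shell equivalent of that flow (a sketch, not the workflow itself; REPO and TAG are made-up placeholders, and docker buildx plus jq are assumed to be installed):

    #!/bin/sh
    # Sketch of the push-by-digest + manifest-merge flow used by the two
    # jobs above. REPO and TAG are illustrative placeholders.
    REPO=example/ollama
    TAG=0.0.0

    # Per-arch jobs: push the image by digest only, with no tag attached.
    for ARCH in amd64 arm64; do
      docker buildx build --platform "linux/$ARCH" \
        --output "type=image,name=$REPO,push-by-digest=true,name-canonical=true,push=true" \
        --metadata-file "meta-$ARCH.json" .
    done

    # Merge job: collect the digests and stitch them into one tagged
    # multi-arch manifest list, as "docker buildx imagetools create" does above.
    DIGESTS=$(jq -r '."containerimage.digest"' meta-amd64.json meta-arm64.json)
    docker buildx imagetools create -t "$REPO:$TAG" \
      $(printf "$REPO@%s " $DIGESTS)
    docker buildx imagetools inspect "$REPO:$TAG"
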
@@ -435,8 +605,6 @@ jobs:
     permissions:
       contents: write
     env:
-      OLLAMA_SKIP_IMAGE_BUILD: '1'
-      PUSH: '1'
       GH_TOKEN: ${{ github.token }}
     steps:
       - uses: actions/checkout@v4
@@ -445,12 +613,6 @@ jobs:
         run: |
           echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
           echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - run: ./scripts/build_docker.sh
       - name: Retrieve built artifact
         uses: actions/download-artifact@v4
         with:
@@ -459,7 +621,8 @@ jobs:
           merge-multiple: true
       - run: |
           ls -lh dist/
-          (cd dist; sha256sum * > sha256sum.txt)
+          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
+          mv sha256sum.txt dist/
           cat dist/sha256sum.txt
       - name: Create or update Release
         run: |
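
The checksum step changes because dist/ now contains subdirectories: sha256sum * only sees top-level entries and fails on directories, whereas find . -type f | xargs sha256sum hashes every file in the tree. A minimal before/after illustration (file names here are invented for the demo; assumes a POSIX shell with coreutils):

    # Lay out a dist/ tree with a nested file, as the release job now has.
    mkdir -p dist/windows-amd64
    echo amd64 > dist/top.bin
    echo cuda > dist/windows-amd64/nested.dll

    # Old form: only hashes top-level files and errors on the directory entry.
    (cd dist; sha256sum *)

    # New form: hashes every regular file in the tree, writes the list one
    # level up, then moves it into dist/ so it ships with the release.
    (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
    mv sha256sum.txt dist/
    cat dist/sha256sum.txt
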

.github/workflows/test.yaml (vendored, 53 lines changed)
@@ -63,7 +63,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - run: go get ./...
       - run: |
@@ -81,12 +81,6 @@ jobs:
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
         name: 'Unix Go Generate'
       - run: go build .
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
   generate-cuda:
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
           go generate -x ./...
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
   generate-rocm:
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
           go generate -x ./...
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**

  # ROCm generation step
   generate-windows-rocm:
@@ -163,7 +145,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -189,7 +171,6 @@ jobs:
         name: go generate
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?

  # CUDA generation step
   generate-windows-cuda:
@@ -200,7 +181,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - name: 'Install CUDA'
         run: |
@@ -231,7 +212,6 @@ jobs:
           go generate -x ./...
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?

   lint:
     strategy:
@@ -255,7 +235,7 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: false
       - run: |
           case ${{ matrix.arch }} in
@@ -263,14 +243,6 @@ jobs:
            arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
         shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
       - uses: golangci/golangci-lint-action@v6
         with:
           args: --timeout 8m0s -v
@@ -297,27 +269,14 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version: "stable"
+          go-version-file: go.mod
           cache: true
       - run: |
           case ${{ matrix.arch }} in
-            amd64) echo ARCH=x86_64 ;;
+            amd64) echo ARCH=amd64 ;;
            arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
         shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
-        shell: bash
       - run: go generate ./...
       - run: go build
       - run: go test -v ./...
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-binaries
-          path: ollama
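
A change repeated across both workflow files is go-version: "stable" becoming go-version-file: go.mod, which pins CI to the toolchain version the module itself declares rather than whatever "stable" resolves to on a given day. Roughly, the action now derives the version like this (an illustrative one-liner, not how setup-go is actually implemented):

    # Print the toolchain version pinned by the module's go directive,
    # e.g. a line like "go 1.22.5" in go.mod.
    GO_VERSION=$(awk '/^go / {print $2; exit}' go.mod)
    echo "CI installs Go ${GO_VERSION}"
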

.gitignore (vendored, 3 lines changed)
@@ -14,4 +14,7 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*

.golangci.yaml

@@ -24,7 +24,6 @@ linters:
     - nosprintfhostport
     - staticcheck
     - tenv
-    - testifylint
     - unconvert
     - unused
     - usestdlibvars
@@ -33,6 +32,10 @@ linters:
 linters-settings:
   gci:
     sections: [standard, default, localmodule]
+  staticcheck:
+    checks:
+      - all
+      - -SA1019 # omit Deprecated check
 severity:
   default-severity: error
   rules:

CONTRIBUTING.md (new file, 37 lines)
@@ -0,0 +1,37 @@
+# Contributing to Ollama
+
+Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started.
+
+## Set up
+
+See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally.
+
+## Pull requests
+
+### Ideal issues
+
+* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error.
+* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading.
+* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly.
+
+### Issues that are harder to review
+
+* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
+* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
+* Documentation: small updates to fill in or correct missing documentation are helpful, however large documentation additions can be hard to maintain over time.
+
+### Issues that may not be accepted
+
+* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API)
+* Changes that add significant friction to the user experience
+* Changes that create a large future maintenance burden for maintainers and contributors
+
+### Best practices
+
+* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, leave a short 2-3 sentences that explain more about the change and its impact.
+* Tests: please add test coverage to changes where possible.
+* Minimize dependencies: avoid adding new dependencies unless absolutely necessary.
+
+## Need help?
+
+If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama).

Dockerfile (222 lines changed)
@@ -1,7 +1,9 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION_11=11.3.1
+ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+ARG CUDA_VERSION_12=12.4.0
+ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2

 # Copy the minimal context we need to run the generate scripts
@@ -10,131 +12,243 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm

-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+ARG CUDA_V11_ARCHITECTURES
+ENV GOARCH=amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+ARG CUDA_V12_ARCHITECTURES
+ENV GOARCH=amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
+    CUDA_VARIANT="_v12" \
+    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG CUDA_V11_ARCHITECTURES
+ENV GOARCH=arm64
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG CUDA_V12_ARCHITECTURES
+ENV GOARCH=arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
+    CUDA_VARIANT="_v12" \
+    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
+    bash gen_linux.sh


 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH /opt/amdgpu/lib64
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV LIBRARY_PATH=/opt/amdgpu/lib64
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
-    cp ${dep} /tmp/scratch/ || exit 1 ; \
-    done && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
-    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
+ENV GOARCH=amd64
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )


 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH=amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh

 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH=arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh


-# Intermediate stage used for ./scripts/build_linux.sh
+# Intermediate stages used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
-ENV CGO_ENABLED 1
+ENV CGO_ENABLED=1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz

-# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
-ENV CGO_ENABLED 1
+ENV CGO_ENABLED=1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz

-# Runtime stages
-FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
-RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
-FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
-RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+FROM --platform=linux/amd64 scratch AS dist-amd64
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM --platform=linux/arm64 scratch AS dist-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM dist-$TARGETARCH as dist

-# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
-RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+# Optimized container images do not carry nested payloads
+FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
+
+FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
+
+FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+
+FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+
+# ROCm libraries are larger so we keep them distinct from the CPU/CUDA image
+FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
+# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
+# across releases
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
+ENV OLLAMA_HOST=0.0.0.0

 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]

 FROM runtime-$TARGETARCH
 EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
+ENV OLLAMA_HOST=0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
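
Two patterns recur through the rewritten Dockerfile: RUN --mount=type=cache,target=/root/.ccache keeps compiler caches out of image layers while persisting them across rebuilds (a BuildKit feature), and release payloads are now produced inside the build stages with tar piped into pigz. A small self-contained sketch of both, assuming Docker with BuildKit and pigz are installed; every file name below is illustrative rather than taken from the build:

    # 1. BuildKit cache mount: ccache state persists across builds in a
    #    cache volume instead of being baked into an image layer.
    cat > Dockerfile.ccache-demo <<'EOF'
    # syntax=docker/dockerfile:1
    FROM ubuntu:22.04
    RUN apt-get update && apt-get install -y ccache gcc
    COPY hello.c .
    RUN --mount=type=cache,target=/root/.ccache \
        ccache gcc -o hello hello.c
    EOF
    printf 'int main(void) { return 0; }\n' > hello.c
    DOCKER_BUILDKIT=1 docker build -f Dockerfile.ccache-demo -t ccache-demo .

    # 2. Payload packaging: tar the per-arch dist tree (minus runners) and
    #    compress with pigz, as the build-amd64 stage does above.
    mkdir -p dist/linux-amd64/bin && touch dist/linux-amd64/bin/ollama
    (cd dist/linux-amd64 && tar --exclude runners -cf - . | pigz --best > ../ollama-linux-amd64.tgz)
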

README.md (46 lines changed)
@@ -215,6 +215,18 @@ ollama show llama3.1
 ollama list
 ```

+### List which models are currently loaded
+
+```
+ollama ps
+```
+
+### Stop a model which is currently running
+
+```
+ollama stop llama3.1
+```
+
 ### Start Ollama

 `ollama serve` is used when you want to start ollama without running the desktop application.
@@ -313,13 +325,24 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
+- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
+- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
+- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
+- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
+- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
+- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
+- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
+- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)

 ### Terminal

@@ -344,6 +367,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
+- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
+- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
+
+### Apple Vision Pro
+- [Enchanted](https://github.com/AugustDev/enchanted)

 ### Database

@@ -353,23 +381,28 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Package managers

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
+- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
+- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
+- [Flox](https://flox.dev/blog/ollama-part-one)

 ### Libraries

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
+- [crewAI](https://github.com/crewAIInc/crewAI)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
+- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
 - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
-- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
+- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
||||||
@@ -386,11 +419,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||||||
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
|
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
|
||||||
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
|
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
|
||||||
- [LlamaScript](https://github.com/Project-Llama/llamascript)
|
- [LlamaScript](https://github.com/Project-Llama/llamascript)
|
||||||
|
- [Gollm](https://docs.gollm.co/examples/ollama-example)
|
||||||
|
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
|
||||||
|
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
|
||||||
|
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
|
||||||
|
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
|
||||||
|
|
||||||
### Mobile
|
### Mobile
|
||||||
|
|
||||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
- [Enchanted](https://github.com/AugustDev/enchanted)
|
||||||
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
||||||
|
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
||||||
|
|
||||||
### Extensions & Plugins
|
### Extensions & Plugins
|
||||||
|
|
||||||
@@ -415,11 +454,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||||||
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
|
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
|
||||||
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
|
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
|
||||||
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
||||||
|
- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama model)
|
||||||
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
|
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
|
||||||
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
|
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
|
||||||
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
||||||
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
|
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
|
||||||
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
||||||
|
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
|
||||||
|
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
|
||||||
|
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
|
||||||
|
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
|
||||||
|
|
||||||
### Supported backends
|
### Supported backends
|
||||||
|
|
||||||
@@ -298,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	return &lr, nil
 }

-// List running models.
+// ListRunning lists running models.
 func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) {
 	var lr ProcessResponse
 	if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
@@ -333,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
 	return &resp, nil
 }

-// Hearbeat checks if the server has started and is responsive; if yes, it
+// Heartbeat checks if the server has started and is responsive; if yes, it
 // returns nil, otherwise an error.
 func (c *Client) Heartbeat(ctx context.Context) error {
 	if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
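The two fixes above are comment-only (`ListRunning` gets a doc comment matching its name, and the `Hearbeat` typo becomes `Heartbeat`), but both methods are easy to exercise. Below is a minimal sketch, assuming the `github.com/ollama/ollama/api` import path and that `ProcessResponse` exposes a `Models` slice, as the process-list handler later in this diff suggests:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	ctx := context.Background()

	// Heartbeat returns nil once the server is up and responsive.
	if err := client.Heartbeat(ctx); err != nil {
		log.Fatalf("server not responding: %v", err)
	}

	// ListRunning wraps GET /api/ps and reports currently loaded models.
	ps, err := client.ListRunning(ctx)
	if err != nil {
		log.Fatal(err)
	}
	for _, m := range ps.Models { // Models field assumed from the handler code below
		fmt.Println(m.Name, m.Size, m.ExpiresAt)
	}
}
```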
api/types.go (16 lines changed)

@@ -296,15 +296,17 @@ type EmbeddingResponse struct {
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
 	Model     string `json:"model"`
-	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
 	Quantize  string `json:"quantize,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`

-	// Quantization is deprecated, see Quantize
+	// Deprecated: set the file content with Modelfile instead
+	Path string `json:"path"`
+
+	// Deprecated: use Quantize instead
 	Quantization string `json:"quantization,omitempty"`
 }

@@ -312,7 +314,7 @@ type CreateRequest struct {
 type DeleteRequest struct {
 	Model string `json:"model"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -327,7 +329,7 @@ type ShowRequest struct {
 	Options map[string]interface{} `json:"options"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -359,7 +361,7 @@ type PullRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@@ -380,7 +382,7 @@ type PushRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }
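The `Deprecated:` comment form above is the convention that godoc and tools such as gopls and staticcheck recognize, so uses of `Name`, `Path`, and `Quantization` now get flagged automatically. A short sketch of the preferred request shape, assuming the `github.com/ollama/ollama/api` import path; the model name is a placeholder:

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Set Model rather than the deprecated Name field; the wire format
	// is unchanged, only the preferred field differs.
	req := &api.PullRequest{Model: "llama3"}
	err = client.Pull(context.Background(), req, func(p api.ProgressResponse) error {
		log.Println(p.Status)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```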
@@ -88,19 +88,10 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
+Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs

 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -11,8 +11,8 @@ import (
 )

 const (
-	updatAvailableMenuID = 1
-	updateMenuID         = updatAvailableMenuID + 1
+	updateAvailableMenuID = 1
+	updateMenuID          = updateAvailableMenuID + 1
 	separatorMenuID       = updateMenuID + 1
 	diagLogsMenuID        = separatorMenuID + 1
 	diagSeparatorMenuID   = diagLogsMenuID + 1
@@ -35,7 +35,7 @@ func (t *winTray) initMenus() error {
 func (t *winTray) UpdateAvailable(ver string) error {
 	if !t.updateNotified {
 		slog.Debug("updating menu and sending notification for new update")
-		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
+		if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
+	"syscall"
 	"unsafe"

 	"golang.org/x/sys/windows"
@@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON
+	t.nid.Flags |= NIF_ICON | NIF_TIP
+	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+		copy(t.nid.Tip[:], toolTipUTF16)
+	} else {
+		return err
+	}
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

 	return t.nid.modify()
@@ -61,6 +61,7 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
+	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0
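For context on the `NIF_TIP` flag and the `copy` into `t.nid.Tip` above: `NOTIFYICONDATA` stores its tooltip in a fixed-size UTF-16 array (`szTip`, 128 elements), so the Go string has to be converted and copied into place, truncating if too long. A small Windows-only sketch of that copy pattern; the local buffer here is a stand-in for the real struct field:

```go
//go:build windows

package main

import (
	"fmt"
	"syscall"
)

func main() {
	var tip [128]uint16 // stand-in for NOTIFYICONDATA.szTip

	// UTF16FromString appends a NUL terminator; it fails only if the
	// input itself contains a NUL byte.
	s, err := syscall.UTF16FromString("Ollama")
	if err != nil {
		panic(err)
	}
	copy(tip[:], s) // truncates silently if the tooltip is too long

	fmt.Println(syscall.UTF16ToString(tip[:]))
}
```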
build/darwin/amd64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

build/darwin/arm64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

build/embed_darwin_amd64.go (new file, 8 lines)

@@ -0,0 +1,8 @@
package build

import "embed"

// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling

//go:embed darwin/amd64/*
var EmbedFS embed.FS

build/embed_darwin_arm64.go (new file, 8 lines)

@@ -0,0 +1,8 @@
package build

import "embed"

// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling

//go:embed darwin/arm64/*
var EmbedFS embed.FS

build/embed_linux.go (new file, 6 lines)

@@ -0,0 +1,6 @@
package build

import "embed"

//go:embed linux/*
var EmbedFS embed.FS

build/embed_unused.go (new file, 8 lines)

@@ -0,0 +1,8 @@
//go:build !linux && !darwin

package build

import "embed"

// unused on windows
var EmbedFS embed.FS

build/linux/amd64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

build/linux/arm64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command
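The placeholder files above exist because `//go:embed` refuses to compile when a pattern matches no files, and git does not track empty directories; committing a one-line placeholder keeps the build green before any payloads are generated. A minimal sketch of the same pattern; the `ListPayloads` helper is hypothetical and not part of the upstream package:

```go
package build

import (
	"embed"
	"io/fs"
)

// Without at least one file under linux/, this directive would fail to
// compile; the committed placeholder guarantees a match.
//go:embed linux/*
var EmbedFS embed.FS

// ListPayloads walks the embedded tree; with only placeholders present
// it yields just the placeholder paths. (Illustrative helper.)
func ListPayloads() ([]string, error) {
	var paths []string
	err := fs.WalkDir(EmbedFS, ".", func(p string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() {
			paths = append(paths, p)
		}
		return nil
	})
	return paths, err
}
```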
cmd/cmd.go (250 lines changed)

@@ -2,6 +2,7 @@ package cmd

 import (
 	"archive/zip"
+	"bufio"
 	"bytes"
 	"context"
 	"crypto/ed25519"
@@ -21,6 +22,7 @@ import (
 	"regexp"
 	"runtime"
 	"slices"
+	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -204,6 +206,12 @@ func tempZipFiles(path string) (string, error) {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapters.safetensors
+		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapter_model.safetensors
+		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -223,6 +231,14 @@ func tempZipFiles(path string) (string, error) {
 	}
 	files = append(files, js...)

+	// bert models require a nested config.json
+	// TODO(mxyng): merge this with the glob above
+	js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
+	if err != nil {
+		return "", err
+	}
+	files = append(files, js...)
+
 	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
 		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
 		// tokenizer.model might be a unresolved git lfs reference; error if it is
@@ -252,6 +268,11 @@ func tempZipFiles(path string) (string, error) {
 			return "", err
 		}

+		zfi.Name, err = filepath.Rel(path, file)
+		if err != nil {
+			return "", err
+		}
+
 		zf, err := zipfile.CreateHeader(zfi)
 		if err != nil {
 			return "", err
@@ -325,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
 	return len(p), nil
 }

+func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
+	p := progress.NewProgress(os.Stderr)
+	defer p.StopAndClear()
+
+	spinner := progress.NewSpinner("")
+	p.Add("", spinner)
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	req := &api.GenerateRequest{
+		Model:     opts.Model,
+		KeepAlive: opts.KeepAlive,
+	}
+
+	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
+}
+
+func StopHandler(cmd *cobra.Command, args []string) error {
+	opts := &runOptions{
+		Model:     args[0],
+		KeepAlive: &api.Duration{Duration: 0},
+	}
+	if err := loadOrUnloadModel(cmd, opts); err != nil {
+		if strings.Contains(err.Error(), "not found") {
+			return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
+		}
+	}
+	return nil
+}
+
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true

@@ -403,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel

 	if interactive {
-		if err := loadModel(cmd, &opts); err != nil {
+		if err := loadOrUnloadModel(cmd, &opts); err != nil {
 			return err
 		}

@@ -559,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
+	table.SetTablePadding(" ")
 	table.AppendBulk(data)
 	table.Render()

@@ -594,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 			cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
 			procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
 		}
-		data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+
+		var until string
+		delta := time.Since(m.ExpiresAt)
+		if delta > 0 {
+			until = "Stopping..."
+		} else {
+			until = format.HumanTime(m.ExpiresAt, "Never")
+		}
+		data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
 	}
 }

@@ -605,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
+	table.SetTablePadding(" ")
 	table.AppendBulk(data)
 	table.Render()

@@ -701,122 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	showInfo(resp)
-
-	return nil
+	return showInfo(resp, os.Stdout)
 }

-func showInfo(resp *api.ShowResponse) {
-	arch := resp.ModelInfo["general.architecture"].(string)
-
-	modelData := [][]string{
-		{"arch", arch},
-		{"parameters", resp.Details.ParameterSize},
-		{"quantization", resp.Details.QuantizationLevel},
-		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
-	}
-
-	mainTableData := [][]string{
-		{"Model"},
-		{renderSubTable(modelData, false)},
-	}
-
-	if resp.ProjectorInfo != nil {
-		projectorData := [][]string{
-			{"arch", "clip"},
-			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-		}
-
-		if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
-			projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
-		}
-
-		projectorData = append(projectorData,
-			[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-			[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
-		)
-
-		mainTableData = append(mainTableData,
-			[]string{"Projector"},
-			[]string{renderSubTable(projectorData, false)},
-		)
-	}
-
-	if resp.Parameters != "" {
-		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
-	}
-
-	if resp.System != "" {
-		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
-	}
-
-	if resp.License != "" {
-		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
-	}
-
-	table := tablewriter.NewWriter(os.Stdout)
-	table.SetAutoWrapText(false)
-	table.SetBorder(false)
-	table.SetAlignment(tablewriter.ALIGN_LEFT)
-
-	for _, v := range mainTableData {
-		table.Append(v)
-	}
-
-	table.Render()
-}
-
-func renderSubTable(data [][]string, file bool) string {
-	var buf bytes.Buffer
-	table := tablewriter.NewWriter(&buf)
-	table.SetAutoWrapText(!file)
-	table.SetBorder(false)
-	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
-	table.SetAlignment(tablewriter.ALIGN_LEFT)
-
-	for _, v := range data {
-		table.Append(v)
-	}
-
-	table.Render()
-
-	renderedTable := buf.String()
-	lines := strings.Split(renderedTable, "\n")
-	for i, line := range lines {
-		lines[i] = "\t" + line
-	}
-
-	return strings.Join(lines, "\n")
-}
-
-func twoLines(s string) [][]string {
-	lines := strings.Split(s, "\n")
-	res := [][]string{}
-
-	count := 0
-	for _, line := range lines {
-		line = strings.TrimSpace(line)
-		if line != "" {
-			count++
-			res = append(res, []string{line})
-			if count == 2 {
-				return res
-			}
-		}
-	}
-	return res
-}
-
-func formatParams(s string) string {
-	lines := strings.Split(s, "\n")
-	table := [][]string{}
-
-	for _, line := range lines {
-		table = append(table, strings.Fields(line))
-	}
-	return renderSubTable(table, false)
-}
+func showInfo(resp *api.ShowResponse, w io.Writer) error {
+	tableRender := func(header string, rows func() [][]string) {
+		fmt.Fprintln(w, " ", header)
+		table := tablewriter.NewWriter(w)
+		table.SetAlignment(tablewriter.ALIGN_LEFT)
+		table.SetBorder(false)
+		table.SetNoWhiteSpace(true)
+		table.SetTablePadding(" ")
+
+		switch header {
+		case "Template", "System", "License":
+			table.SetColWidth(100)
+		}
+
+		table.AppendBulk(rows())
+		table.Render()
+		fmt.Fprintln(w)
+	}
+
+	tableRender("Model", func() (rows [][]string) {
+		if resp.ModelInfo != nil {
+			arch := resp.ModelInfo["general.architecture"].(string)
+			rows = append(rows, []string{"", "architecture", arch})
+			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
+			rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
+			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+		} else {
+			rows = append(rows, []string{"", "architecture", resp.Details.Family})
+			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
+		}
+		rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
+		return
+	})
+
+	if resp.ProjectorInfo != nil {
+		tableRender("Projector", func() (rows [][]string) {
+			arch := resp.ProjectorInfo["general.architecture"].(string)
+			rows = append(rows, []string{"", "architecture", arch})
+			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
+			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
+			rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
+			return
+		})
+	}
+
+	if resp.Parameters != "" {
+		tableRender("Parameters", func() (rows [][]string) {
+			scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
+			for scanner.Scan() {
+				if text := scanner.Text(); text != "" {
+					rows = append(rows, append([]string{""}, strings.Fields(text)...))
+				}
+			}
+			return
+		})
+	}
+
+	head := func(s string, n int) (rows [][]string) {
+		scanner := bufio.NewScanner(strings.NewReader(s))
+		for scanner.Scan() && (len(rows) < n || n < 0) {
+			if text := scanner.Text(); text != "" {
+				rows = append(rows, []string{"", strings.TrimSpace(text)})
+			}
+		}
+		return
+	}
+
+	if resp.System != "" {
+		tableRender("System", func() [][]string {
+			return head(resp.System, 2)
+		})
+	}
+
+	if resp.License != "" {
+		tableRender("License", func() [][]string {
+			return head(resp.License, 2)
+		})
+	}
+
+	return nil
+}

 func CopyHandler(cmd *cobra.Command, args []string) error {
@@ -1125,7 +1154,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	return nil
 }

-func RunServer(cmd *cobra.Command, _ []string) error {
+func RunServer(_ *cobra.Command, _ []string) error {
 	if err := initializeKeypair(); err != nil {
 		return err
 	}
@@ -1306,6 +1335,15 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")

+	stopCmd := &cobra.Command{
+		Use:     "stop MODEL",
+		Short:   "Stop a running model",
+		Args:    cobra.ExactArgs(1),
+		PreRunE: checkServerHeartbeat,
+		RunE:    StopHandler,
+	}
+
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
@@ -1373,6 +1411,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
+		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
@@ -1399,6 +1438,8 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
+				envVars["OLLAMA_LOAD_TIMEOUT"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
@@ -1410,6 +1451,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
+		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
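The new `stop` subcommand is thin sugar over the API: `StopHandler` reuses `loadOrUnloadModel` with a zero `KeepAlive`, which tells the server to unload the model immediately. The same trick works directly against the API; a sketch, with a placeholder model name:

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// A generate request with KeepAlive of zero loads nothing and
	// unloads the named model if it is currently running.
	req := &api.GenerateRequest{
		Model:     "llama3",
		KeepAlive: &api.Duration{Duration: 0},
	}
	if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error {
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}
```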
cmd/cmd_test.go (new file, 206 lines; column whitespace in the expected outputs below is approximate, since the original rendering collapsed it)

@@ -0,0 +1,206 @@
package cmd

import (
	"bytes"
	"os"
	"path/filepath"
	"testing"

	"github.com/google/go-cmp/cmp"

	"github.com/ollama/ollama/api"
)

func TestShowInfo(t *testing.T) {
	t.Run("bare details", func(t *testing.T) {
		var b bytes.Buffer
		if err := showInfo(&api.ShowResponse{
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture    test
    parameters      7B
    quantization    FP16

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})

	t.Run("bare model info", func(t *testing.T) {
		var b bytes.Buffer
		if err := showInfo(&api.ShowResponse{
			ModelInfo: map[string]any{
				"general.architecture":    "test",
				"general.parameter_count": float64(7_000_000_000),
				"test.context_length":     float64(0),
				"test.embedding_length":   float64(0),
			},
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture        test
    parameters          7B
    context length      0
    embedding length    0
    quantization        FP16

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})

	t.Run("parameters", func(t *testing.T) {
		var b bytes.Buffer
		if err := showInfo(&api.ShowResponse{
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
			Parameters: `
stop never
stop gonna
stop give
stop you
stop up
temperature 99`,
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture    test
    parameters      7B
    quantization    FP16

  Parameters
    stop           never
    stop           gonna
    stop           give
    stop           you
    stop           up
    temperature    99

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})

	t.Run("project info", func(t *testing.T) {
		var b bytes.Buffer
		if err := showInfo(&api.ShowResponse{
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
			ProjectorInfo: map[string]any{
				"general.architecture":         "clip",
				"general.parameter_count":      float64(133_700_000),
				"clip.vision.embedding_length": float64(0),
				"clip.vision.projection_dim":   float64(0),
			},
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture    test
    parameters      7B
    quantization    FP16

  Projector
    architecture        clip
    parameters          133.70M
    embedding length    0
    dimensions          0

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})

	t.Run("system", func(t *testing.T) {
		var b bytes.Buffer
		if err := showInfo(&api.ShowResponse{
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
			System: `You are a pirate!
Ahoy, matey!
Weigh anchor!
`,
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture    test
    parameters      7B
    quantization    FP16

  System
    You are a pirate!
    Ahoy, matey!

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})

	t.Run("license", func(t *testing.T) {
		var b bytes.Buffer
		license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
		if err != nil {
			t.Fatal(err)
		}

		if err := showInfo(&api.ShowResponse{
			Details: api.ModelDetails{
				Family:            "test",
				ParameterSize:     "7B",
				QuantizationLevel: "FP16",
			},
			License: string(license),
		}, &b); err != nil {
			t.Fatal(err)
		}

		expect := `  Model
    architecture    test
    parameters      7B
    quantization    FP16

  License
    MIT License
    Copyright (c) Ollama

`

		if diff := cmp.Diff(expect, b.String()); diff != "" {
			t.Errorf("unexpected output (-want +got):\n%s", diff)
		}
	})
}
@@ -18,7 +18,6 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 )
@@ -31,26 +30,6 @@ const (
 	MultilineSystem
 )

-func loadModel(cmd *cobra.Command, opts *runOptions) error {
-	p := progress.NewProgress(os.Stderr)
-	defer p.StopAndClear()
-
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	chatReq := &api.ChatRequest{
-		Model:     opts.Model,
-		KeepAlive: opts.KeepAlive,
-	}
-
-	return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
-}
-
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
@@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
-			if err := loadModel(cmd, &opts); err != nil {
+			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				return err
 			}
 			continue
@@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 		switch args[1] {
 		case "info":
-			showInfo(resp)
+			_ = showInfo(resp, os.Stderr)
 		case "license":
 			if resp.License == "" {
 				fmt.Println("No license was specified for this model.")
@@ -7,16 +7,27 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"strings"

 	"github.com/ollama/ollama/llm"
 )

-type Parameters struct {
+type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 }

-func (Parameters) KV(t *Tokenizer) llm.KV {
+type AdapterParameters struct {
+	Alpha          uint32 `json:"lora_alpha"`
+	LoraLayers     uint32 `json:"lora_layers"`
+	LoraParameters struct {
+		Rank  uint32  `json:"rank"`
+		Alpha float32 `json:"alpha"`
+		Scale float32 `json:"scale"`
+	} `json:"lora_parameters"`
+}
+
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
@@ -43,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (Parameters) specialTokenTypes() []string {
+func (p AdapterParameters) KV() llm.KV {
+	var alpha float32
+	if p.LoraParameters.Alpha == 0 {
+		alpha = float32(p.Alpha)
+	} else {
+		alpha = p.LoraParameters.Alpha
+	}
+
+	kv := llm.KV{
+		"adapter.lora.alpha": alpha,
+		"adapter.type":       "lora",
+		"general.file_type":  uint32(1),
+		"general.type":       "adapter",
+		"general.version":    "v0.2",
+	}
+
+	return kv
+}
+
+func (ModelParameters) specialTokenTypes() []string {
 	return []string{
 		"bos", "eos", "unk", "sep", "pad", "cls", "mask",
 	}
 }

-func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
 	return llm.WriteGGUF(ws, kv, ts)
 }

-type Converter interface {
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
+}
+
+type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
 	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string

-	// tensorName returns the LLM tensor name for a specific input name
-	tensorName(string) string
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
+	// writeFile writes the model to the provided io.WriteSeeker
 	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

+type moreParser interface {
+	parseMore(fs.FS) error
+}
+
+type AdapterConverter interface {
+	// KV maps parameters to LLM key-values
+	KV(llm.KV) llm.KV
+	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
+	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string
+
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+}
+
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+	bts, err := fs.ReadFile(fsys, "adapter_config.json")
+	if err != nil {
+		return err
+	}
+
+	var p AdapterParameters
+	if err := json.Unmarshal(bts, &p); err != nil {
+		return err
+	}
+
+	arch, ok := baseKV["general.architecture"]
+	if !ok {
+		return errors.New("architecture not set for the base model")
+	}
+
+	var conv AdapterConverter
+	switch arch {
+	case "llama":
+		conv = &llamaAdapter{}
+	case "gemma2":
+		conv = &gemma2Adapter{}
+	default:
+		return errors.New("unsupported architecture")
+	}
+
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+	if err != nil {
+		return err
+	}
+
+	if err := json.Unmarshal(bts, conv); err != nil {
+		return err
+	}
+
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+}
+
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func Convert(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
 	}

-	var p Parameters
+	var p ModelParameters
 	if err := json.Unmarshal(bts, &p); err != nil {
 		return err
 	}
@@ -85,16 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return errors.New("unknown architecture")
 	}

-	var conv Converter
+	var conv ModelConverter
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llama{}
+		conv = &llamaModel{}
 	case "MixtralForCausalLM":
-		conv = &mixtral{}
+		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
-		conv = &gemma{}
+		conv = &gemmaModel{}
+	case "Gemma2ForCausalLM":
+		conv = &gemma2Model{}
 	case "Phi3ForCausalLM":
-		conv = &phi3{}
+		conv = &phi3Model{}
+	case "BertModel":
+		conv = &bertModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
@@ -103,23 +197,33 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

+	if t, ok := conv.(moreParser); ok {
+		if err := t.parseMore(fsys); err != nil {
+			return err
+		}
+	}
+
 	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
 	if err != nil {
 		return err
 	}

-	if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
+	vocabSize := int(p.VocabSize)
+	switch {
+	case vocabSize > len(t.Vocabulary.Tokens):
+		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
-	} else {
+	case vocabSize < len(t.Vocabulary.Tokens):
+		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}

-	ts, err := parseTensors(fsys)
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
 	if err != nil {
 		return err
 	}
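With the renames above the package now has two entry points: `ConvertModel` for full checkpoints and `ConvertAdapter` for LoRA adapters, both writing GGUF to an `io.WriteSeeker`. A hedged sketch of the model path, assuming the package import path `github.com/ollama/ollama/convert`; file paths are placeholders:

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/convert"
)

func main() {
	// *os.File satisfies the io.WriteSeeker the converter requires.
	out, err := os.Create("model.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// ConvertModel reads config.json, the tokenizer, and safetensors
	// weights from the provided fs.FS and writes a GGUF file.
	if err := convert.ConvertModel(os.DirFS("/path/to/hf-checkpoint"), out); err != nil {
		log.Fatal(err)
	}
}
```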
convert/convert_bert.go (new file, 174 lines; listing truncated at the end of this page)

@@ -0,0 +1,174 @@
package convert

import (
	"cmp"
	"encoding/json"
	"io/fs"
	"path/filepath"
	"slices"
	"strings"

	"github.com/ollama/ollama/llm"
)

type bertModel struct {
	ModelParameters
	NLayers               uint32  `json:"n_layers"`
	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
	NLayer                uint32  `json:"n_layer"`
	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
	NCtx                  uint32  `json:"n_ctx"`
	HiddenSize            uint32  `json:"hidden_size"`
	NEmbd                 uint32  `json:"n_embd"`
	IntermediateSize      uint32  `json:"intermediate_size"`
	NInner                uint32  `json:"n_inner"`
	NumAttentionHeads     uint32  `json:"num_attention_heads"`
	NHead                 uint32  `json:"n_head"`
	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
	LayerNormEPS          float32 `json:"layer_norm_eps"`
	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
	NormEpsilon           float32 `json:"norm_epsilon"`

	PoolingType uint32
}

var (
	_ ModelConverter = (*bertModel)(nil)
	_ moreParser     = (*bertModel)(nil)
)

func (p *bertModel) parseMore(fsys fs.FS) error {
	bts, err := fs.ReadFile(fsys, "modules.json")
	if err != nil {
		return err
	}

	var modules []struct {
		Type string `json:"type"`
		Path string `json:"path"`
	}

	if err := json.Unmarshal(bts, &modules); err != nil {
		return err
	}

	var pooling string
	for _, m := range modules {
		if m.Type == "sentence_transformers.models.Pooling" {
			pooling = m.Path
			break
		}
	}

	if pooling != "" {
		bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
		if err != nil {
			return err
		}

		var pc struct {
			PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
			PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
		}

		if err := json.Unmarshal(bts, &pc); err != nil {
			return err
		}

		if pc.PoolingModeMeanTokens {
			p.PoolingType = 1
		} else if pc.PoolingModeCLSToken {
			p.PoolingType = 2
		}
	}

	return nil
}

func (p *bertModel) KV(t *Tokenizer) llm.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "bert"
	kv["bert.attention.causal"] = false
	kv["bert.pooling_type"] = p.PoolingType

	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)

	if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
		kv["bert.context_length"] = contextLength
	}

	if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
		kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
	}

	if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
		kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
	}

	if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
		kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
	}

	if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
		kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
	}

	kv["tokenizer.ggml.model"] = "bert"
	kv["tokenizer.ggml.token_type_count"] = uint32(2)

	// convert to phantom space tokens
	for i, e := range t.Tokens {
		if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
			// noop
		} else if strings.HasPrefix(e, "##") {
			t.Tokens[i] = e[2:]
		} else {
			t.Tokens[i] = "\u2581" + e
		}
	}

	kv["tokenizer.ggml.tokens"] = t.Tokens

	return kv
}

func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
	var out []llm.Tensor
	for _, t := range ts {
		if slices.Contains([]string{
			"embeddings.position_ids",
			"pooler.dense.weight",
			"pooler.dense.bias",
		}, t.Name()) {
			continue
		}

		out = append(out, llm.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}

	return out
}

func (bertModel) Replacements() []string {
	return []string{
		"encoder.layer", "blk",
		"encoder.layers", "blk",
|
||||||
|
"embeddings.word_embeddings", "token_embd",
|
||||||
|
"embeddings.token_type_embeddings", "token_types",
|
||||||
|
"embeddings.LayerNorm", "token_embd_norm",
|
||||||
|
"embeddings.position_embeddings", "position_embd",
|
||||||
|
"attention.self.query", "attn_q",
|
||||||
|
"attention.self.key", "attn_k",
|
||||||
|
"attention.self.value", "attn_v",
|
||||||
|
"attention.output.dense", "attn_output",
|
||||||
|
"attention.output.LayerNorm", "attn_output_norm",
|
||||||
|
"intermediate.dense", "ffn_up",
|
||||||
|
"output.dense", "ffn_down",
|
||||||
|
"output.LayerNorm", "layer_output_norm",
|
||||||
|
}
|
||||||
|
}
|
||||||
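Note: the "convert to phantom space tokens" loop in bertModel.KV rewrites WordPiece vocabulary conventions into the SentencePiece-style form the GGUF tokenizer expects: bracketed special tokens like [CLS] pass through untouched, "##" continuation pieces lose their prefix, and word-initial pieces gain the U+2581 word-boundary marker. A self-contained sketch of the same rule:

package main

import (
	"fmt"
	"strings"
)

// toPhantomSpace applies the same three-way rewrite as the loop above.
func toPhantomSpace(tok string) string {
	switch {
	case strings.HasPrefix(tok, "[") && strings.HasSuffix(tok, "]"):
		return tok // special token, e.g. [CLS], [SEP]
	case strings.HasPrefix(tok, "##"):
		return tok[2:] // continuation piece: drop the WordPiece marker
	default:
		return "\u2581" + tok // word-initial piece: add the phantom space
	}
}

func main() {
	for _, tok := range []string{"[CLS]", "play", "##ing"} {
		fmt.Printf("%q -> %q\n", tok, toPhantomSpace(tok))
	}
}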
@@ -9,8 +9,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
-type gemma struct {
-	Parameters
+type gemmaModel struct {
+	ModelParameters
 	MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
 	HiddenSize            uint32 `json:"hidden_size"`
 	HiddenLayers          uint32 `json:"num_hidden_layers"`
@@ -21,12 +21,11 @@ type gemma struct {
 	HeadDim uint32 `json:"head_dim"`
 }
 
-var _ Converter = (*gemma)(nil)
+var _ ModelConverter = (*gemmaModel)(nil)
 
-func (p *gemma) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
-	kv["general.name"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma.embedding_length"] = p.HiddenSize
 	kv["gemma.block_count"] = p.HiddenLayers
@@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "_norm.weight") {
+		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}
 
 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }
 
-func (p *gemma) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *gemmaModel) Replacements() []string {
+	return []string{
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		"model.layers", "blk",
@@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		"block_sparse_moe.gate", "ffn_inp",
-	).Replace(n)
+	}
 }
 
-func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))
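Note: the addOne repacker is truncated above, but its purpose is worth a line: Gemma checkpoints store the RMSNorm weights offset by one (the norm is applied as (1 + w) at inference), so the converter adds a ones tensor to every *_norm.weight before writing. A plain-Go sketch of the same adjustment without the tensor library (my reading of the repacker, assuming the stored values are w-1):

package main

import "fmt"

// addOne recovers the effective norm weight from the stored w-1 values.
func addOne(data []float32) []float32 {
	out := make([]float32, len(data))
	for i, v := range data {
		out[i] = v + 1
	}
	return out
}

func main() {
	fmt.Println(addOne([]float32{-0.25, 0, 0.5})) // [0.75 1 1.5]
}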
53 convert/convert_gemma2.go Normal file
@@ -0,0 +1,53 @@
+package convert
+
+import (
+	"github.com/ollama/ollama/llm"
+)
+
+type gemma2Model struct {
+	gemmaModel
+	SlidingWindow         uint32  `json:"sliding_window"`
+	AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
+	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
+}
+
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "gemma2"
+	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
+	kv["gemma2.embedding_length"] = p.HiddenSize
+	kv["gemma2.block_count"] = p.HiddenLayers
+	kv["gemma2.feed_forward_length"] = p.IntermediateSize
+	kv["gemma2.attention.head_count"] = p.NumAttentionHeads
+	kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+	kv["gemma2.attention.key_length"] = p.HeadDim
+	kv["gemma2.attention.value_length"] = p.HeadDim
+	kv["gemma2.attention.sliding_window"] = p.SlidingWindow
+	kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
+	kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
+	kv["tokenizer.ggml.eot_token_id"] = uint32(107)
+	kv["tokenizer.ggml.middle_token_id"] = uint32(68)
+	kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
+	kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
+	return kv
+}
+
+func (p *gemma2Model) Replacements() []string {
+	return []string{
+		"model.embed_tokens", "token_embd",
+		"model.norm", "output_norm",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"post_attention_layernorm", "post_attention_norm",
+		"pre_feedforward_layernorm", "ffn_norm",
+		"post_feedforward_layernorm", "post_ffw_norm",
+	}
+}
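Note: gemma2Model embeds gemmaModel, so it inherits Tensors and the addOne repacker and only overrides KV and Replacements. A tiny sketch of the Go embedding pattern at work (illustrative types, not the repo's):

package main

import "fmt"

type base struct{}

func (base) Tensors() string { return "shared tensor handling" }
func (base) KV() string      { return "base keys" }

// derived embeds base: Tensors is inherited, KV is overridden,
// mirroring gemma2Model over gemmaModel.
type derived struct{ base }

func (derived) KV() string { return "gemma2 keys" }

func main() {
	d := derived{}
	fmt.Println(d.KV())      // gemma2 keys
	fmt.Println(d.Tensors()) // shared tensor handling
}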
91 convert/convert_gemma2_adapter.go Normal file
@@ -0,0 +1,91 @@
+package convert
+
+import (
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type gemma2Adapter struct {
+	AdapterParameters
+}
+
+var _ AdapterConverter = (*gemma2Adapter)(nil)
+
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "gemma2"
+	return kv
+}
+
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *gemma2Adapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
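Note on gemma2Adapter.Tensors/repack: safetensors adapters may store lora_a/lora_b transposed relative to what GGUF expects, so when the shape betrays that (lora_a wider than tall, or lora_b taller than wide) the dims are swapped and the data is transposed on write. Functionally the repacker amounts to a 2-D row-major transpose; a plain-Go sketch without the tensor library (assuming rows*cols == len(data)):

package main

import "fmt"

// transpose returns the row-major transpose of a rows x cols matrix.
func transpose(data []float32, rows, cols int) []float32 {
	out := make([]float32, len(data))
	for r := 0; r < rows; r++ {
		for c := 0; c < cols; c++ {
			out[c*rows+r] = data[r*cols+c]
		}
	}
	return out
}

func main() {
	// 2x3 matrix [[1 2 3] [4 5 6]] becomes 3x2 [[1 4] [2 5] [3 6]].
	fmt.Println(transpose([]float32{1, 2, 3, 4, 5, 6}, 2, 3))
}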
@@ -3,6 +3,7 @@ package convert
 import (
 	"cmp"
 	"fmt"
+	"math"
 	"strings"
 
 	"github.com/pdevine/tensor"
@@ -11,8 +12,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
-type llama struct {
-	Parameters
+type llamaModel struct {
+	ModelParameters
 	NLayers         uint32 `json:"n_layers"`
 	NumHiddenLayers uint32 `json:"num_hidden_layers"`
 	NLayer          uint32 `json:"n_layer"`
@@ -28,7 +29,13 @@ type llama struct {
 	RopeTheta   float32 `json:"rope_theta"`
 	RopeScaling struct {
 		Type                            string  `json:"type"`
+		RopeType                        string  `json:"rope_type"`
 		Factor                          float32 `json:"factor"`
+		LowFrequencyFactor              float32 `json:"low_freq_factor"`
+		HighFrequencyFactor             float32 `json:"high_freq_factor"`
+		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
+
+		factors ropeFactor
 	} `json:"rope_scaling"`
 	RMSNormEPS   float32 `json:"rms_norm_eps"`
 	LayerNormEPS float32 `json:"layer_norm_eps"`
@@ -37,12 +44,11 @@ type llama struct {
 	HeadDim uint32 `json:"head_dim"`
 }
 
-var _ Converter = (*llama)(nil)
+var _ ModelConverter = (*llamaModel)(nil)
 
-func (p *llama) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
-	kv["general.name"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
 
 	kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	if p.RopeScaling.Type == "linear" {
 		kv["llama.rope.scaling.type"] = p.RopeScaling.Type
 		kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
+	} else if p.RopeScaling.RopeType == "llama3" {
+		dim := p.HiddenSize / p.NumAttentionHeads
+		for i := uint32(0); i < dim; i += 2 {
+			factor := cmp.Or(p.RopeScaling.Factor, 8.0)
+			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
+			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+
+			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
+			lambdaLow := float32(original) / factorLow
+			lambdaHigh := float32(original) / factorHigh
+
+			lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
+			if lambda < float64(lambdaHigh) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
+			} else if lambda > float64(lambdaLow) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
+			} else {
+				smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
+			}
+		}
 	}
 
 	if p.NumKeyValueHeads > 0 {
@@ -93,17 +120,26 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 
+	if p.RopeScaling.factors != nil {
+		out = append(out, llm.Tensor{
+			Name:     "rope_freqs.weight",
+			Kind:     0,
+			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
+			WriterTo: p.RopeScaling.factors,
+		})
+	}
+
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "attn_q.weight") ||
-			strings.HasSuffix(name, "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
+			strings.HasSuffix(t.Name(), "attn_k.weight") {
 			t.SetRepacker(p.repack)
 		}
 
 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }
 
-func (p *llama) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *llamaModel) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@@ -128,21 +164,19 @@ func (p *llama) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		// mixtral
-		"block_sparse_moe.gate", "ffn_gate_inp",
-	).Replace(n)
+	}
 }
 
-func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
 	}
 
 	var heads uint32
-	if strings.HasSuffix(name, "q_proj.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "k_proj.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
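Note on the llama3 rope-scaling branch above: for each even rotary dimension i the loop derives a wavelength and picks a per-frequency scale factor, emitted later as the rope_freqs.weight tensor in Tensors. In math form (my reading of the loop, with d the head dimension, L = original_max_positional_embeddings, alpha the scale factor, beta_low/beta_high the low/high frequency factors):

\lambda_i = 2\pi\,\theta^{i/d},\qquad
s_i = \frac{L/\lambda_i - \beta_{\mathrm{low}}}{\beta_{\mathrm{high}} - \beta_{\mathrm{low}}},\qquad
f_i =
\begin{cases}
1 & \lambda_i < L/\beta_{\mathrm{high}} \\
\alpha & \lambda_i > L/\beta_{\mathrm{low}} \\
\bigl((1-s_i)/\alpha + s_i\bigr)^{-1} & \text{otherwise}
\end{cases}

Short wavelengths (high frequencies) are left unscaled, long wavelengths get the full factor, and the band in between is smoothly interpolated.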
169 convert/convert_llama_adapter.go Normal file
@@ -0,0 +1,169 @@
+package convert
+
+import (
+	"cmp"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type llamaAdapter struct {
+	AdapterParameters
+	NumAttentionHeads uint32 `json:"num_attention_heads"`
+	NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
+}
+
+var _ AdapterConverter = (*llamaAdapter)(nil)
+
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "llama"
+	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
+	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
+
+	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
+
+	return kv
+}
+
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repackAndTranspose)
+		} else {
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    shape,
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *llamaAdapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		return data, nil
+	}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
+
+func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	}
+
+	if heads > 0 {
+		if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+			return nil, err
+		}
+
+		if err := n.T(0, 2, 1, 3); err != nil {
+			return nil, err
+		}
+
+		if err := n.Reshape(dims...); err != nil {
+			return nil, err
+		}
+
+		if err := n.Transpose(); err != nil {
+			return nil, err
+		}
+	}
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
@@ -9,16 +9,14 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
-type mixtral struct {
-	llama
+type mixtralModel struct {
+	llamaModel
 	NumLocalExperts    uint32 `json:"num_local_experts"`
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }
 
-var _ Converter = (*mixtral)(nil)
-
-func (p *mixtral) KV(t *Tokenizer) llm.KV {
-	kv := p.llama.KV(t)
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+	kv := p.llamaModel.KV(t)
 
 	if p.NumLocalExperts > 0 {
 		kv["llama.expert_count"] = p.NumLocalExperts
@@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
 		})
 	}
 
-	return append(out, p.llama.Tensors(ts)...)
+	return append(out, p.llamaModel.Tensors(ts)...)
+}
+
+func (p *mixtralModel) Replacements() []string {
+	return append(
+		p.llamaModel.Replacements(),
+		"block_sparse_moe.gate", "ffn_gate_inp",
+	)
 }
 
 type experts []Tensor
@@ -11,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
-type phi3 struct {
-	Parameters
+type phi3Model struct {
+	ModelParameters
 	NumHiddenLayers uint32 `json:"num_hidden_layers"`
 	NLayers         uint32 `json:"n_layers"`
 	HiddenSize      uint32 `json:"hidden_size"`
@@ -35,12 +35,11 @@ type phi3 struct {
 	SlidingWindow uint32 `json:"sliding_window"`
 }
 
-var _ Converter = (*phi3)(nil)
+var _ ModelConverter = (*phi3Model)(nil)
 
-func (p *phi3) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
-	kv["general.name"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
 	kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
 	kv["phi3.feed_forward_length"] = p.IntermediateSize
@@ -69,13 +68,12 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once
 
 	out := make([]llm.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasPrefix(name, "blk.0.") {
+		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
 				out = append(out, llm.Tensor{
 					Name: "rope_factors_long.weight",
@@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 		}
 
 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }
 
-func (p *phi3) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *phi3Model) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-	).Replace(n)
+	}
 }
 
 type ropeFactor []float32
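Note: phi3Model.Tensors uses sync.Once so the rope-factor tensors (rope_factors_long.weight and, in the elided lines, presumably its short counterpart) are emitted exactly once, just before the first "blk.0." tensor, no matter how many layer-0 tensors appear. A minimal sketch of that idiom:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var once sync.Once
	for _, name := range []string{"blk.0.attn_q.weight", "blk.0.attn_k.weight"} {
		// The closure runs only on the first call; later calls are no-ops.
		once.Do(func() { fmt.Println("emit rope factor tensors before", name) })
		fmt.Println("emit", name)
	}
}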
@@ -1,7 +1,9 @@
 package convert
 
 import (
+	"bytes"
 	"crypto/sha256"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"flag"
@@ -13,6 +15,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strings"
 	"testing"
 
 	"golang.org/x/exp/maps"
@@ -20,6 +23,12 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
+type tensorData struct {
+	Offsets []int  `json:"data_offsets"`
+	Type    string `json:"dtype"`
+	Shape   []int  `json:"shape"`
+}
+
 func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	t.Helper()
 
@@ -29,7 +38,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	}
 	defer f.Close()
 
-	if err := Convert(fsys, f); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
 		t.Fatal(err)
 	}
 
@@ -51,37 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }
 
-func TestMain(m *testing.M) {
-	var level slog.Level
-	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
-	flag.Parse()
-	slog.SetLogLoggerLevel(level)
-	os.Exit(m.Run())
-}
-
-func TestConvertFull(t *testing.T) {
-	cases := []string{
-		"Meta-Llama-3-8B-Instruct",
-		"Mistral-7B-Instruct-v0.2",
-		"Mixtral-8x7B-Instruct-v0.1",
-		"gemma-2b-it",
-		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
-		"Phi-3-mini-128k-instruct",
-	}
-
-	for i := range cases {
-		tt := cases[i]
-		t.Run(tt, func(t *testing.T) {
-			t.Parallel()
-
-			p := filepath.Join("testdata", tt)
-			if testing.Short() {
-				t.Skip("skipping in short mode")
-			} else if _, err := os.Stat(p); err != nil {
-				t.Skipf("%s not found", p)
-			}
-
-			f, kv, tensors := convertFull(t, os.DirFS(p))
-
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -106,6 +85,46 @@ func TestConvertFull(t *testing.T) {
 		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
 	}
 
+	return actual
+}
+
+func TestMain(m *testing.M) {
+	var level slog.Level
+	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
+	flag.Parse()
+	slog.SetLogLoggerLevel(level)
+	os.Exit(m.Run())
+}
+
+func TestConvertModel(t *testing.T) {
+	cases := []string{
+		"Meta-Llama-3-8B-Instruct",
+		"Meta-Llama-3.1-8B-Instruct",
+		"Mistral-7B-Instruct-v0.2",
+		"Mixtral-8x7B-Instruct-v0.1",
+		"gemma-2b-it",
+		"gemma-2-2b-it",
+		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
+		"Phi-3-mini-128k-instruct",
+		"all-MiniLM-L6-v2",
+		"gemma-2-9b-it",
+	}
+
+	for i := range cases {
+		tt := cases[i]
+		t.Run(tt, func(t *testing.T) {
+			t.Parallel()
+
+			p := filepath.Join("testdata", tt)
+			if testing.Short() {
+				t.Skip("skipping in short mode")
+			} else if _, err := os.Stat(p); err != nil {
+				t.Skipf("%s not found", p)
+			}
+
+			f, kv, tensors := convertFull(t, os.DirFS(p))
+			actual := generateResultsJSON(t, f, kv, tensors)
+
 	expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 	if err != nil {
 		t.Fatal(err)
@@ -128,3 +147,330 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertInvalidTensorNames(t *testing.T) {
+	f, err := os.CreateTemp(t.TempDir(), "testmodel")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	tempDir := t.TempDir()
+
+	td := map[string]*tensorData{}
+	offset := 4096
+
+	td["model.layers.0.self_attn.q_proj.weight"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "F32",
+		Shape:   []int{4096, 4096},
+	}
+	td["blk.0.attn_q.weight"] = &tensorData{
+		Offsets: []int{offset, offset * 2},
+		Type:    "F32",
+		Shape:   []int{4096, 4096},
+	}
+	generateSafetensorTestData(t, tempDir, td)
+
+	err = ConvertModel(os.DirFS(tempDir), f)
+	if err == nil || !strings.HasPrefix(err.Error(), "duplicate tensor name") {
+		t.Errorf("expected error but didn't get one")
+	}
+}
+
+func TestConvertInvalidDatatype(t *testing.T) {
+	f, err := os.CreateTemp(t.TempDir(), "testmodel")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	tempDir := t.TempDir()
+
+	td := map[string]*tensorData{}
+	offset := 4096 * 14336
+
+	td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "I8",
+		Shape:   []int{4096, 14336},
+	}
+	td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
+		Offsets: []int{offset, offset},
+		Type:    "U8",
+		Shape:   []int{},
+	}
+	generateSafetensorTestData(t, tempDir, td)
+
+	err = ConvertModel(os.DirFS(tempDir), f)
+	if err == nil || err.Error() != "unsupported safetensors model" {
+		t.Errorf("expected error but didn't get one")
+	}
+}
+
+func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[string]*tensorData) {
+	data, err := json.Marshal(tensorData)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var buf bytes.Buffer
+
+	l := int64(len(data))
+	err = binary.Write(&buf, binary.LittleEndian, l)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = buf.Write(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fdata.Close()
+
+	_, err = fdata.Write(buf.Bytes())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ]
+}
+`
+
+	f, err := os.Create(filepath.Join(tempDir, "config.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(configData)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tokenizerData := `
+{
+}
+`
+
+	f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(tokenizerData)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestConvertAdapter(t *testing.T) {
+	type AdapterCase struct {
+		Name     string
+		BaseKV   map[string]any
+		Expected map[string]string
+	}
+
+	cases := []AdapterCase{
+		{
+			Name: "discollama",
+			BaseKV: map[string]any{
+				"general.architecture":          "llama",
+				"llama.attention.head_count":    uint32(32),
+				"llama.attention.head_count_kv": uint32(8),
+			},
+			Expected: map[string]string{
+				"general.architecture":          "llama",
+				"general.file_type":             "1",
+				"general.parameter_count":       "106496",
+				"general.type":                  "adapter",
+				"general.version":               "v0.2",
+				"adapter.lora.alpha":            "16",
+				"adapter.type":                  "lora",
+				"llama.attention.head_count":    "32",
+				"llama.attention.head_count_kv": "8",
+				"blk.31.attn_q.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_q.weight.lora_b":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_b":   "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
+			},
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.Name, func(t *testing.T) {
+			t.Parallel()
+
+			f, err := os.CreateTemp(t.TempDir(), "f16")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer f.Close()
+
+			tempDir := t.TempDir()
+			generateLoraTestData(t, tempDir)
+
+			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
+				t.Fatal(err)
+			}
+
+			r, err := os.Open(f.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+
+			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if _, err := r.Seek(0, io.SeekStart); err != nil {
+				t.Fatal(err)
+			}
+
+			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
+
+			keys := maps.Keys(c.Expected)
+			slices.Sort(keys)
+			for _, k := range keys {
+				if v, ok := actual[k]; !ok {
+					t.Errorf("missing %s", k)
+				} else if v != c.Expected[k] {
+					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
+				}
+			}
+		})
+	}
+}
+
+func generateLoraTestData(t *testing.T, tempDir string) {
+	offset := 4096 * 8 * 4
+
+	td := map[string]*tensorData{"__metadata__": nil}
+	td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset, offset * 2},
+		Type:    "F32",
+		Shape:   []int{8, 4096},
+	}
+	td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
+		Offsets: []int{offset * 2, offset * 3},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset * 3, offset*3 + 8*1024*4},
+		Type:    "F32",
+		Shape:   []int{8, 1024},
+	}
+
+	data, err := json.Marshal(td)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var buf bytes.Buffer
+
+	l := int64(len(data))
+	err = binary.Write(&buf, binary.LittleEndian, l)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = buf.Write(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// write some data for the tensors
+	ones := make([]float32, 4096*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	for range 3 {
+		err = binary.Write(&buf, binary.LittleEndian, ones)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	ones = make([]float32, 1024*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	err = binary.Write(&buf, binary.LittleEndian, ones)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fdata.Close()
+
+	_, err = fdata.Write(buf.Bytes())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+    "adapter_path": "adapters-test",
+    "batch_size": 8,
+    "config": "config-tiny.json",
+    "data": "../discollama-completion",
+    "grad_checkpoint": null,
+    "iters": 1000,
+    "learning_rate": 1e-05,
+    "lora_layers": 1,
+    "lora_parameters": {
+        "rank": 8,
+        "alpha": 16,
+        "dropout": 0.0,
+        "scale": 2.0
+    },
+    "lr_schedule": null,
+    "max_seq_length": 2048,
+    "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
+    "resume_adapter_file": null,
+    "save_every": 100,
+    "seed": 0,
+    "steps_per_eval": 200,
+    "steps_per_report": 10,
+    "test": false,
+    "test_batches": 500,
+    "train": true,
+    "use_dora": false,
+    "val_batches": 25
+}
+`
+	f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(configData)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
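Note: generateSafetensorTestData and generateLoraTestData write the minimal safetensors layout by hand: an 8-byte little-endian length prefix, a JSON header of exactly that length, then the raw tensor bytes at the offsets the header declares. A sketch of reading such a header back (assumes the header fits in memory):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

func main() {
	// A toy file: length-prefixed JSON header, no tensor data after it.
	hdr := []byte(`{}`)
	var file bytes.Buffer
	binary.Write(&file, binary.LittleEndian, uint64(len(hdr)))
	file.Write(hdr)

	// Read it back the same way a parser would.
	var n uint64
	binary.Read(&file, binary.LittleEndian, &n)
	out := make([]byte, n)
	io.ReadFull(&file, out)
	fmt.Printf("header (%d bytes): %s\n", n, out)
}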
@@ -35,7 +35,9 @@ const (
 )
 
 func (t tensorBase) Kind() uint32 {
-	if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
+	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
+		t.name == "token_types.weight" {
+		// these tensors are always F32
 		return 0
 	}
 
@@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {
 
 type repacker func(string, []float32, []uint64) ([]float32, error)
 
-func parseTensors(fsys fs.FS) ([]Tensor, error) {
+func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
-		Func    func(fs.FS, ...string) ([]Tensor, error)
+		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
+		{"adapters.safetensors", parseSafetensors},
+		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
@@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
 		}
 
 		if len(matches) > 0 {
-			return pattern.Func(fsys, matches...)
+			return pattern.Func(fsys, replacer, matches...)
 		}
 	}
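Note: parseTensors tries its glob patterns in order and dispatches on the first pattern that has any matches, so a directory holding both safetensors and pytorch files is read as safetensors. A sketch of the dispatch idiom over an in-memory filesystem (the pattern list is abbreviated):

package main

import (
	"fmt"
	"io/fs"
	"testing/fstest"
)

func main() {
	fsys := fstest.MapFS{
		"model.safetensors": &fstest.MapFile{},
		"pytorch_model.bin": &fstest.MapFile{},
	}

	// First pattern with matches wins, as in parseTensors.
	for _, pattern := range []string{"model-*-of-*.safetensors", "model.safetensors", "pytorch_model.bin"} {
		matches, err := fs.Glob(fsys, pattern)
		if err != nil {
			panic(err)
		}
		if len(matches) > 0 {
			fmt.Println("dispatching on", pattern, "->", matches)
			break
		}
	}
}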
@@ -4,10 +4,12 @@ import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"io/fs"
 	"slices"
+	"strings"
 
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
@@ -20,7 +22,7 @@ type safetensorMetadata struct {
 	Offsets []int64 `json:"data_offsets"`
 }
 
-func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		f, err := fsys.Open(p)
@@ -47,8 +49,19 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 		keys := maps.Keys(headers)
 		slices.Sort(keys)
 
+		names := make(map[string]struct{}, len(keys))
+
 		for _, key := range keys {
 			if value := headers[key]; value.Type != "" {
+				// bitsandbytes quantized models are unsupported
+				if len(value.Shape) == 0 {
+					return nil, errors.New("unsupported safetensors model")
+				}
+				ggufName := replacer.Replace(key)
+				if _, ok := names[ggufName]; ok {
+					return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
+				}
+				names[ggufName] = struct{}{}
 				ts = append(ts, safetensor{
 					fs:   fsys,
 					path: p,
@@ -56,7 +69,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 					offset: safetensorsPad(n, value.Offsets[0]),
 					size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 					tensorBase: &tensorBase{
-						name:  key,
+						name:  ggufName,
 						shape: value.Shape,
 					},
 				})
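Note: the duplicate check runs on the post-replacement name, which is what TestConvertInvalidTensorNames exercises: two distinct source names can collapse to one GGUF name. A standalone illustration (abbreviated replacement table):

package main

import (
	"fmt"
	"strings"
)

func main() {
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
	)

	seen := map[string]struct{}{}
	for _, key := range []string{
		"model.layers.0.self_attn.q_proj.weight", // renames to blk.0.attn_q.weight
		"blk.0.attn_q.weight",                    // already in GGUF form
	} {
		name := r.Replace(key)
		if _, ok := seen[name]; ok {
			fmt.Println("duplicate tensor name:", name)
			continue
		}
		seen[name] = struct{}{}
	}
}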
@@ -3,12 +3,13 @@ package convert
 import (
 	"io"
 	"io/fs"
+	"strings"
 
 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 )
 
-func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		pt, err := pytorch.Load(p)
@@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
 		ts = append(ts, torch{
 			storage: t.(*pytorch.Tensor).Source,
 			tensorBase: &tensorBase{
-				name:  k.(string),
+				name:  replacer.Replace(k.(string)),
 				shape: shape,
 			},
 		})
3 convert/testdata/Meta-Llama-3.1-8B-Instruct.json vendored Normal file
@@ -0,0 +1,3 @@
+{
+  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
+}
124 convert/testdata/all-MiniLM-L6-v2.json vendored Normal file
@@ -0,0 +1,124 @@
+{
+  "general.architecture": "bert",
+  "general.file_type": "1",
+  "general.quantization_version": "2",
+  "bert.attention.causal": "false",
+  "bert.attention.head_count": "12",
+  "bert.attention.layer_norm_epsilon": "1e-12",
+  "bert.block_count": "6",
+  "bert.context_length": "512",
+  "bert.embedding_length": "384",
+  "bert.feed_forward_length": "1536",
+  "bert.pooling_type": "1",
+  "tokenizer.ggml.model": "bert",
+  "tokenizer.ggml.padding_token_id": "0",
+  "tokenizer.ggml.unknown_token_id": "100",
+  "tokenizer.ggml.cls_token_id": "101",
+  "tokenizer.ggml.seperator_token_id": "102",
+  "tokenizer.ggml.mask_token_id": "103",
+  "tokenizer.ggml.token_type_count": "2",
+  "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
+  "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
+  "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
+  "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
+  "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
+  "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
+  "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
+  "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
+  "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
+  "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
+  "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
+  "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
+  "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
+  "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
+  "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
+  "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
+  "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
+  "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
+  "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
+  "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
+  "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
+  "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
+  "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
+  "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
+  "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
+  "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
+  "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
+  "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
+  "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
+  "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
+  "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
+  "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
+  "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
+  "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
+  "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
+  "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
+  "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
+  "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
+  "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
+  "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
+  "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
+  "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
+  "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
+  "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
+  "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
+  "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
+  "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
+  "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
+  "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
+  "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
+  "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
+  "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
+  "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
+  "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
+  "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
+  "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
+  "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
+  "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
+  "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
+  "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
+  "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
+  "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
+  "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
+  "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
+  "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
+  "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
+  "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
+  "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
+  "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
+  "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
|
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
|
||||||
|
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
|
||||||
|
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
|
||||||
|
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
|
||||||
|
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
|
||||||
|
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
|
||||||
|
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
|
||||||
|
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
|
||||||
|
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
|
||||||
|
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
|
||||||
|
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
|
||||||
|
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
|
||||||
|
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
|
||||||
|
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
|
||||||
|
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
|
||||||
|
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
|
||||||
|
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
|
||||||
|
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
|
||||||
|
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
|
||||||
|
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
|
||||||
|
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
|
||||||
|
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
|
||||||
|
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
|
||||||
|
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
|
||||||
|
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
|
||||||
|
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
|
||||||
|
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
|
||||||
|
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
|
||||||
|
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
|
||||||
|
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
|
||||||
|
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
|
||||||
|
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
|
||||||
|
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
|
||||||
|
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
|
||||||
|
}
convert/testdata/gemma-2-2b-it.json (vendored, new file, 312 lines)
@@ -0,0 +1,312 @@
{
  "general.architecture": "gemma2",
  "general.file_type": "1",
  "general.quantization_version": "2",
  "gemma2.block_count": "26",
  "gemma2.context_length": "8192",
  "gemma2.embedding_length": "2304",
  "gemma2.feed_forward_length": "9216",
  "gemma2.attention.head_count": "8",
  "gemma2.attention.head_count_kv": "4",
  "gemma2.attention.key_length": "256",
  "gemma2.attention.value_length": "256",
  "gemma2.attention.layer_norm_rms_epsilon": "1e-06",
  "tokenizer.ggml.model": "llama",
  "tokenizer.ggml.add_bos_token": "true",
  "tokenizer.ggml.add_eos_token": "false",
  "tokenizer.ggml.bos_token_id": "2",
  "tokenizer.ggml.eos_token_id": "1",
  "tokenizer.ggml.padding_token_id": "0",
  "tokenizer.ggml.unknown_token_id": "3",
  "tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8",
  "tokenizer.ggml.token_type": "8d40143b3477df77beea4139420335ede458bf5e14102f01b0170197b55da8d8",
  "tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda",
  "token_embd.weight": "64a9d30707e659e2e673656d71f5aef7a9fb9fd83bb9a77558dfc5abbe218a05",
  "blk.0.attn_k.weight": "d8b4437c5edb3cddf6af9987038e1bb2b191c4f0fce0e160d2abace717f5d5d7",
  "blk.0.attn_norm.weight": "1eb73e3f7aa8e502f6ca31cd19efbb8e4fd9a89692e13e48ac8205545a7fa7e8",
  "blk.0.attn_output.weight": "39e7b78e57d356a22dd89ce1c4d7163b970712ba756545e1703f97866cd2192e",
  "blk.0.attn_q.weight": "795058e23b6109febd9d55c89e1eebe6af0714ec8c56fd86a160876a6135ffe8",
  "blk.0.attn_v.weight": "0cd6e583d1887c020472e961bbb113fe5a0d23ae2f1c2c876fc366cdb7692b52",
  "blk.0.ffn_down.weight": "51eb4d962189e945a84e94e0dc1aad3f8f90cc1a11e18029670afcd0ea0acb1b",
  "blk.0.ffn_gate.weight": "9811a29b8ad48432925897ab21dfcb13c5cbd372aeccbbefca9b7866883b4ce3",
  "blk.0.ffn_norm.weight": "92cbf4652ef503c1de5b10f2be00b3fcf00100980cb3baa8f3013a8d8bf3d851",
  "blk.0.ffn_up.weight": "af87de21746879483ed1b374cdd76b19ba11ca2b6dbb1beba98efdf3be3e8077",
  "blk.0.post_attention_norm.weight": "32e135f1f258ffe407018899e39af1725d59d66d60022b9a21575ba160e0357a",
  "blk.0.post_ffw_norm.weight": "ba286f5ac11b07fbc986173708c66f1920427be5a6d108af38fa0a837c1c8eb6",
  "blk.1.attn_k.weight": "51584435552051f7fade76beca582b3f7190cf7fc07adcf527c2774d4b1c3901",
  "blk.1.attn_norm.weight": "6833104c7fbf35a7e799ae56c262b97fffa14789642aee14381b25acd21ed80a",
  "blk.1.attn_output.weight": "14c39481369087bf292ac9a3ab2ef166f9fe376a9f90c246653213ef264febdc",
  "blk.1.attn_q.weight": "443f64ae2229f857c69d6bebb7800b685786cb77884c3ae19d4286aeed081325",
  "blk.1.attn_v.weight": "0df482de2038f1e4c8a7733ac0ddb69ad90759dab5968b942af0155588de4c4a",
  "blk.1.ffn_down.weight": "66f30763a8bbbcaea609a0087ed75fadb5e771c06378dd2cea94cf17e492e8cf",
  "blk.1.ffn_gate.weight": "a7151bff00a545fa18b2c92dcd2a14572ccf9beb957a6c494f1374e8ebe174c9",
  "blk.1.ffn_norm.weight": "e197d71ea11b5276bc0167d2663b88089b3ff42b47ba91e85f6c5d95f6306435",
  "blk.1.ffn_up.weight": "57c182e0b14cccd1350d388f0c616991702e74281db54637451b70f4ccc24f9b",
  "blk.1.post_attention_norm.weight": "3c56f837168d784c2d8bac247c130bdca6610c095c8da4558c536ccad7605609",
  "blk.1.post_ffw_norm.weight": "d2a51d320fd01069dd7ccaa7082f16a7faeb671885607d7900b10a89c354d0fa",
  "blk.2.attn_k.weight": "bc103c818192de7ce36caaf89dc117be4df13fb902e0bd9a23c64edace5df9b6",
  "blk.2.attn_norm.weight": "0f2503aa126083a5d6ac72481be1ef66c6014705b573682b35bd864e4749a3d5",
  "blk.2.attn_output.weight": "05fcd4a1226e482f91803a266f72caca887a93e63c2d2ba5611ab3c68d38743a",
  "blk.2.attn_q.weight": "6a10b5c2fd423d1e4c4fd60fa8c154a0159b6b2501ea79cae2ef19f45a674e5e",
  "blk.2.attn_v.weight": "3cf891945a1f8ae7cc908a5c6b729ff5b70f4436c5ffdbf245cc0ed4cc19cd1b",
  "blk.2.ffn_down.weight": "ea204fd04e0d2fc728a9861a459216bbfec629c152004ba625f52cd8837bd51e",
  "blk.2.ffn_gate.weight": "3a3518729f1b8b64a82b8792f33987db5418fdb094be0263c68f146a5c38de54",
  "blk.2.ffn_norm.weight": "754ede678b725de41a34b82f0edf7688b5c065be7c0d46df6f7ad9430d986884",
  "blk.2.ffn_up.weight": "ffdcb88439f5828ffbd9fc844b03ff91637b790b9838097258cc3ae75935720c",
  "blk.2.post_attention_norm.weight": "4b3f53b7ba26e8c36b2dfda3b7e5fc4b1065257cefdea235fc7df9af130ac2fd",
  "blk.2.post_ffw_norm.weight": "e550369e26b8485e2b54ad34b34bc98af5494287dcc513c2c39cf1eaa5b89d07",
  "blk.3.attn_k.weight": "89f24ea450e37d9e95757651a83205c085d81b354ee9489dd6310a391d8409f3",
  "blk.3.attn_norm.weight": "24e2ea662b7cb822b4ca5cd61bc17f2709f406d990ec3b4a0dac1cc112db45cf",
  "blk.3.attn_output.weight": "ac4dad69473c6e3fac56669212cadd8c34ecc5973d945972e974d94805334967",
  "blk.3.attn_q.weight": "b6a9c9a7d4722b9096631c65de62228dfddca6e26edfe6af7fce01e116ef0f4c",
  "blk.3.attn_v.weight": "f272a960a40093942309bc342a379984cbacec2d7bc64428db3f64e6b1887ed4",
  "blk.3.ffn_down.weight": "c0188ba50d8228805982029c277fc0e87aa57473b8363037c648f6d006ff828a",
  "blk.3.ffn_gate.weight": "a04aec1561ee6c0fbb18c3db49dc62fb533619cf697fd548cbf2279761aaec3b",
  "blk.3.ffn_norm.weight": "bc053837d44087ec05eb5d9458357b2a5be787789b19cdbbdc694b57697f99a6",
  "blk.3.ffn_up.weight": "b3ce8b274f20796d3b1a7c08ba27a919066f9de89a782faa544c4a8d6bea1382",
  "blk.3.post_attention_norm.weight": "9c922dee7a7df5667289e2788e60170238239cee2dfdbbd9e435763f9f416718",
  "blk.3.post_ffw_norm.weight": "b682544ac953ad2e0b49027ed8916f2e9d1aba5d1587bb4127ac703570c7a03a",
  "blk.4.attn_k.weight": "143b0cbb4b787b95c2b6212374410e32173ccef2adb914908a2f89a7916de512",
  "blk.4.attn_norm.weight": "5668f60491b780273745192662d02c9a92a4f692b29d16aa0bbc7413fec4f85b",
  "blk.4.attn_output.weight": "b9f2bdb68be1e0cf66dd19f8fa2afb105910ad2ef394864cb32cea8f8944e0d5",
  "blk.4.attn_q.weight": "ddcf1343dafbc2dfcd0b8741225af22fe4b54b2becce29240bd01c34265d126c",
  "blk.4.attn_v.weight": "6dc7074366e7ed52d9f48c594dcc85bef738e096276cb99d28228c89eecc5b9c",
  "blk.4.ffn_down.weight": "30334ffc59ce343cf2a1b973174acb7722823463adc07e19a99bd0f404bc9906",
  "blk.4.ffn_gate.weight": "890f7c8af208d63b28db52c4b8c16c2288a382d87ff5a6a6d6b0a5b3bf27e6cd",
  "blk.4.ffn_norm.weight": "ff0316cc7847221eb86a90c1ab441d4ee61553d410c66414a7755021b3b12448",
  "blk.4.ffn_up.weight": "6af97d113f91564c636734f215e25ee602d48eb045458f300b3ec7582be0f41d",
  "blk.4.post_attention_norm.weight": "69438f231e105e68216b078bdeb35a7cdc8b12c4e2845e18ecf4c8d361d6a321",
  "blk.4.post_ffw_norm.weight": "0fd535da78bcf2b32c95b05b2b83dc49817393765be90d8cc1ed3d56f47b68ec",
  "blk.5.attn_k.weight": "0166eb3c6d20dcf3d3c169e94caa8dee057535bb525e29f698fb6f8844f18a6c",
  "blk.5.attn_norm.weight": "a7808f27f164023d5cde2be00fc23cac6c71aa0ddeb60bc23e12411b80087672",
  "blk.5.attn_output.weight": "8b65b2027a0842b68c5308f91d6a31de9599d794157d77df8418b19f9e0d9334",
  "blk.5.attn_q.weight": "966bc626ef2c2394d872087a41c126bb1b67d1d5f6de920204ef5e5b16c34003",
  "blk.5.attn_v.weight": "9a362aef3f4437fbf0ef6e1ba785f3329c3db2960f93fe36547d2795e9c254ea",
  "blk.5.ffn_down.weight": "63e53541d34197720c06f297aa8142ac6b6eec002c7987b296f26e8b1400f931",
  "blk.5.ffn_gate.weight": "d9591fdd32f783e0fc26e20d5d587ee8971ac8ae2e4c818c6eac1c125c7c7f37",
  "blk.5.ffn_norm.weight": "677334cc60ecce3a7f4ab3acda15d359353d7358872f614ad8914e3780e9fc6e",
  "blk.5.ffn_up.weight": "a63764110e1c655ffbd55af0669b2dfe4cc29d0e198d33a8e5426461b08a85f7",
  "blk.5.post_attention_norm.weight": "c55499f859b2c0a7f5cabceaae47309a5ad38bc29d0f4a8db81f1357023162a9",
  "blk.5.post_ffw_norm.weight": "82752754665f842418f3e302cb5f43d1e0504dcd124c4b8ddb77018b2c793837",
  "blk.6.attn_k.weight": "e20a5f0d6c807273c8d491439566b428497ac02097cf0aa55e33748c28e14be6",
  "blk.6.attn_norm.weight": "2c6ba42fd3c73d72073ced03a32dd28d70a89ed9bbbc8fea1ba03a7ade951e6c",
  "blk.6.attn_output.weight": "4de7c5c2f4a133a266e17ed8c14c52959466b54cc7ab9e19f789a33b4850f284",
  "blk.6.attn_q.weight": "56462d921800e6b8cd2213fef04c4ff16d728905cb2f4c58e966d0a053a3b0ae",
  "blk.6.attn_v.weight": "b758dcbff769d6240c2245ede1dbc62c4170a67c77458e866312589220fe29af",
  "blk.6.ffn_down.weight": "582247fb3c2bf687cbe9413fe18d18ad47bef4b65df7d78905e10335c6134764",
  "blk.6.ffn_gate.weight": "3035444d5286aefb7a6d04e55bc27e1fac7cf895cd5be02319a431b8e047b4ae",
  "blk.6.ffn_norm.weight": "e582d24c66e01b96faa20ce6adfda3d8583b11e809bff89969927398175e369a",
  "blk.6.ffn_up.weight": "6f4b7bbfedeacf61a4866ae0616c4ba6c9e856662e8f00ae6aaec7f52c53e7b4",
  "blk.6.post_attention_norm.weight": "8fe51b50bd677d21586aecab0b565c4bf9fa68ad50bfe366f45e8fea3c657ca8",
  "blk.6.post_ffw_norm.weight": "81ba3cb4c2bf5c546b86855b7a885d3fafededc67eb3a35cd3598b03c9e26e65",
  "blk.7.attn_k.weight": "2e044179cdcae0946708c86bfea7aa0391e1f7e2a09b33fca035d384cc3ca758",
  "blk.7.attn_norm.weight": "94b48c546b046803c60e75a3acb17a356b710735989938021b565f68df9b4985",
  "blk.7.attn_output.weight": "65709b4ad7a581f4d75793d39d4032a359f6bcc0c3835205242a0b99e5b66824",
  "blk.7.attn_q.weight": "8ded993c95d1f7caf201ceb6fa035cd6ed6d351b50b999fa9355dfee9486cb5b",
  "blk.7.attn_v.weight": "c92d5e2d2d48397542bc03bea25bf39154075e66c5bb1ead85188505aa04ae91",
  "blk.7.ffn_down.weight": "e8ba8fb57208805ef1dc23cd7c86e9a2d1fb7c52c3940d292cd5bb2eb24b3fac",
  "blk.7.ffn_gate.weight": "f0f06d6a2e06c5ac252083bc61d05c814e6289d3f4e4a87d2f06918254c02c36",
  "blk.7.ffn_norm.weight": "ebf8ef775f72624148e09d68a4332187a7a5020c521fe0623da1cd3485ad33e0",
  "blk.7.ffn_up.weight": "a554adc4fc7122c247c77670e169916ba1794c787b5be30a2b36705138f1f746",
  "blk.7.post_attention_norm.weight": "3aa6bc21d85c3a0c12b964e82b12feaedfdd13130c3cd2229228e24e0967ebdf",
  "blk.7.post_ffw_norm.weight": "508bc7b19ee8ff08f0007c890133a462fc57c7e72b16ee8f6dd64def264ef876",
  "blk.8.attn_k.weight": "363c8e74056642fe9e7c2f3f9769d57319cd3fa0a6022810189ab8d894322885",
  "blk.8.attn_norm.weight": "685b49a1f1acb169f4df0bdd8e3de6943f3033cebad14b898a72000595610d92",
  "blk.8.attn_output.weight": "7bde571e4efef1c6a6143f0526721dfb59e0a0ea0e1a3616a322b2eb937efa48",
  "blk.8.attn_q.weight": "fc993dbc1074c28a0e1d85e5ab2f4ea6a9c6c1affe7ee56027000a275daed9b6",
  "blk.8.attn_v.weight": "281e8791d3aef9b3864f1cb054da0ae0c2fef4ce0a58b1bad8bc136b2fa0f62b",
  "blk.8.ffn_down.weight": "b1164a2578a7f87ed99c2bbc76c5dfbbbc6a1a803605391acc3f320fc989ffd7",
  "blk.8.ffn_gate.weight": "6b39a3b3aaaa79aee61416b54d62160b9258042650e61c6b47bc77c2dd17daf3",
  "blk.8.ffn_norm.weight": "17ea1362c72da27f12bc936500492035bdef3fd8f940cb12b57f37d42ba8ecb1",
  "blk.8.ffn_up.weight": "bc3a7c47afc440d2bdf8fbe9ddf2c9220467472c60c8b4ded8c0f181470ec96c",
  "blk.8.post_attention_norm.weight": "5c506204e00411ef9c8b4134d40eedcc19fffe68dd0af7d7cc49dcabf2dfac7e",
  "blk.8.post_ffw_norm.weight": "002faec235c3678864e2901eed275ce4e9dc229164a91c9cd4c965142ba62305",
  "blk.9.attn_k.weight": "0bab39d8c237f1b6d0010db40467142625a9e6f2e0e4c49a56c12b41e4e0b1fa",
  "blk.9.attn_norm.weight": "de5f38e873b17f07aa7598831b89cc1cae2c9bc3eb2e042ee9af059d2563e84e",
  "blk.9.attn_output.weight": "8a8184702c25a62df9ff309c0c7badc8587208523b2be3e8fa90ce7080573e6f",
  "blk.9.attn_q.weight": "7c961b2431b09ddf95377acd07201cb91bf13d9cd3ae0f2c25c7d6a0358d9f50",
  "blk.9.attn_v.weight": "e22d240cb4743067033e659cbf210ebe2ebbab3e1dea6ccbe5eaa982382ca038",
  "blk.9.ffn_down.weight": "a426f81210f03d6ad53277416e1fdcdf37d8065e4817613edaf6c67a343426be",
  "blk.9.ffn_gate.weight": "a82eba825cb77b8e64f85ff99ede2fc71bc9b01751eeb17e9e6c246ee12ea62e",
  "blk.9.ffn_norm.weight": "1a97f9b1302a3a326d534c5c3fed2db6db0ae45fd0edd381a3e4fc1c75d81030",
  "blk.9.ffn_up.weight": "5f20bac2bbf03bb42adb92fbf99561651e1edda57e0b61935ac7f6c08c0ed7cb",
  "blk.9.post_attention_norm.weight": "9f9866d13988e1946b1e1c80d9374a92a6e3be33748f8eaed3e126d1e1a4c796",
  "blk.9.post_ffw_norm.weight": "a6896dbf698db4dbbe5dbf12417d4fd80e9cad0c539c858892ec0aa5b046bb58",
  "blk.10.attn_k.weight": "ca8446e5d21ecd4e6a70dca8d321be480be4fba94d70cba065205436feb44270",
  "blk.10.attn_norm.weight": "4f41fe290e8f21f63b82151b6cce94bf7318d121468816b0c58af0ff7c1658ab",
  "blk.10.attn_output.weight": "c626d2e9681c5c941bbde43dddfae1a8d4986bf2be4470857bc8e8bd7f869044",
  "blk.10.attn_q.weight": "1e61b210a13a429977325cf15d781ab77d604cfa862f4270329cbd94237d5835",
  "blk.10.attn_v.weight": "8ff8d3e3f058ec3b35ada1057f2ed59c06494d0e0be6a8dc3ff9edf9f0e1a115",
  "blk.10.ffn_down.weight": "bcebc04219f8081a5f483e58103c0ddbbbc631a0a54fd6dd9d55778e041f70ee",
  "blk.10.ffn_gate.weight": "7a23a1e620ef871384ddf9611ccdcfb893fbf013cc203ac8e72f745420f1eea0",
  "blk.10.ffn_norm.weight": "e3a375e43c349a1c6c66c22328e513cc1af3137fe839e43dc8e9be2f65914fd7",
  "blk.10.ffn_up.weight": "5d182e7c94369194fca5f19cbbe668a999911e57f3d363bc7fb6088428700cb9",
  "blk.10.post_attention_norm.weight": "b841c6308296e8984f3c5f549c6e3a242f4b3e19141e1f54cc08de9c46759c09",
  "blk.10.post_ffw_norm.weight": "9d66fa05b5c940208f634f5053d809094c99a2a10a1d1e8847c8281fbd99fb49",
  "blk.11.attn_k.weight": "14adf24ebb2bb17b336ca81cec3e690fd854782f4440ca6c66cc1d7e7bf1c850",
  "blk.11.attn_norm.weight": "2d2213f311f50414702b5b34f22aafb9d9a0b6787243e7578562583dc40ad195",
  "blk.11.attn_output.weight": "de1f14cc2a7fff00cf11b229f0576999205f17b9536e97abc9d6de3cc79a7884",
  "blk.11.attn_q.weight": "2bcc5c147524003109ece0be08b89ac8b25baa71416ffa76573c6c052ffc6eea",
  "blk.11.attn_v.weight": "2e6ab8573070c22dc1e0d7aebe4d52123226dacf7822dcce06fadbb38fb036a4",
  "blk.11.ffn_down.weight": "1b86902f4e36868421e5228b9445051f8290b292df22a6d1af836dcecc1f25c3",
  "blk.11.ffn_gate.weight": "e756e8081bd0a16aea4a9ef5076ad102113524f7a3d50a3a77aaa7f7938b63e8",
  "blk.11.ffn_norm.weight": "6913887267be227cf9d1991a3dd8db2e7e74bb9b5fbdfcb9ac954fd7d7b95b3b",
  "blk.11.ffn_up.weight": "619a3ac0609ebdf42c3fb2b6e4b1db48df79e6dd8418d7ab8f1bbff13d8a6a50",
  "blk.11.post_attention_norm.weight": "e4b4ba92cef7b6a78407e8ab1b0307d47dac6c3df7b6817e28038317ff662d7e",
  "blk.11.post_ffw_norm.weight": "40aceeec58cb855f0c158c9cc217168fcd5d0e735567d587217b1d78df17bc5f",
  "blk.12.attn_k.weight": "c54c5a4d4892522022d1aa2204cfc624f0b4042caa536e678967316293fe5cb1",
  "blk.12.attn_norm.weight": "7cd2ef58298569ffdf244d9b390f3917245276c8206e5780af5f96d8c0bbb446",
  "blk.12.attn_output.weight": "85495ef9cc8b3deb21f741bde463ff6493acae2be51f02ecdeef952cbdec3375",
  "blk.12.attn_q.weight": "d19383f83fd119bfb8c0280c9515705c11d8e7d502019fcf8f49efeef0d106d0",
  "blk.12.attn_v.weight": "869ac669ba49531d9128892a0e27cef15de508ff40cdf80cc1681dde50d09204",
  "blk.12.ffn_down.weight": "578f39f8f9fc2f09138afc884a952d7cc3a9a31de4216acd10e88e19e0b75f8c",
  "blk.12.ffn_gate.weight": "e29a0186bc6c4a0720246306e922d3a83f777dadcf4ac80bad468287031cc8b5",
  "blk.12.ffn_norm.weight": "e1ee95c6584b5cb57fcf1db8ce2bcc03aff91eb389238c094a61c00dde93d1f2",
  "blk.12.ffn_up.weight": "2a826f06d7cdfb3edc6ae250ff44363ef77a2a9cdf96313e23a331b99ebfa17d",
  "blk.12.post_attention_norm.weight": "4bafc7699b948d5cbc0d3e09b418b06c6abc4651a61ada9609d9a2f21c7e5607",
  "blk.12.post_ffw_norm.weight": "bbb8c34a7176bb1a49f9fe2bacca0bd26b673d52c0835b2e90fa11f2962f077f",
  "blk.13.attn_k.weight": "ffeefccfe8255d1b694382012ff4134eee5fec9d9491c8d0ff0a13832d1a37e8",
  "blk.13.attn_norm.weight": "35713726529e3887c4135a88e86e8a4d7270ba5b9f2d1ab462622fbf40a7cdce",
  "blk.13.attn_output.weight": "0d60b7c5cd71190a9ef4b873b0f516be15447c32d83914db2794b14592b0b460",
  "blk.13.attn_q.weight": "8296069e65bef794cefc61257fc65789b3cb22955e30f3df129205e5041b2222",
  "blk.13.attn_v.weight": "ca0f4ab9d16a748fc643a5c0c7a19826a811bf2a4e7316a8c935d4bf0ce8abc6",
  "blk.13.ffn_down.weight": "d5514e0c8e7b3ed1cbcc1605eb5be1733b6ab3514cf8a0508fc72f7d05ed8bcb",
  "blk.13.ffn_gate.weight": "8108e517a82e08a3aefbbd267bfa50a1668f92a76273280ce8a6bc1f6dd61521",
  "blk.13.ffn_norm.weight": "5fcb6132d2134bf1f835b904a99820fa501dbc57d2224129f7098bf3cabc1d36",
  "blk.13.ffn_up.weight": "6d744b7cd390a3cae3aa350dd379b81246acd056a2259996b6aaadece8465ccc",
  "blk.13.post_attention_norm.weight": "e08b14698912509790e9575b8676971fbb0a4d82d719367e3756c0d0c4ab8cc0",
  "blk.13.post_ffw_norm.weight": "2b196e4450fc5f1e7367b2cf7fe33a15fe919fbcdd861d11002346f16e980535",
  "blk.14.attn_k.weight": "120e5f48d7268dfd9ab5f4bc9cc57a7cec63ea9635f56b80d435eb22936e9483",
  "blk.14.attn_norm.weight": "146367bcce4db72cc894419a2e0145a6f533507dd68e4739c10ee480308c401f",
  "blk.14.attn_output.weight": "720fa0165e756876c5cb6ad9e2780dd910390933f3f8849e5add5da04266650b",
  "blk.14.attn_q.weight": "f5183466f56219ca1aca52d8b82c2d966a4198fea40fdd6b39f4d8b06ca2a6dd",
  "blk.14.attn_v.weight": "24f8ea3d5512cd37c43c8329cb0da0c90d1895aef763ac2dcee3fe5157ec50a2",
  "blk.14.ffn_down.weight": "e29960965b384ae5ab3d898a4dbaa8fddd28fa0e477ac28bcac49dec12a5ac67",
  "blk.14.ffn_gate.weight": "6d0d6a74bfe9692e8f8eedff0fc34fc4fa1c8687794f35f2e2b033ab2d7510b8",
  "blk.14.ffn_norm.weight": "f7036c1a9a71e046c9d2af16e9218fda5dbb0f7241ab44747abed1f0f9d602ca",
  "blk.14.ffn_up.weight": "7d69ea1424007ffc9c12247dd0308c616e93ac02a59ec341cfa48f92d6ce3b10",
  "blk.14.post_attention_norm.weight": "65b9712834d9445d4236bec362f3fb795c20d60c541b3dc6dbb7914d9b493e41",
  "blk.14.post_ffw_norm.weight": "9c6a8da2e4e437d5cfdf3b9097e9f8b64bf07946a048badec20f4d374613f38f",
  "blk.15.attn_k.weight": "864bc618303a0e4ee67fb1d5e751de61e936cd51e96669dd86f8cd08f2305045",
  "blk.15.attn_norm.weight": "f9f4187da6eeadc2fc5921d8fe669741697d16c13d71e4aaeb73b82f50dc577e",
  "blk.15.attn_output.weight": "ce2419a0b097036b2a31f2f4ad731d5814bcc2ef4c511786e24471e5eefd273b",
  "blk.15.attn_q.weight": "9539db5a970d11ebe99722d1e13fcd635e250033630811efe583d2f97778e4a9",
  "blk.15.attn_v.weight": "1c834b48ccd88adaeabb7d8bcb6be0bcd6d5ac1354ce88fc28f19a1a96b81ab3",
  "blk.15.ffn_down.weight": "bc1f97a65dde6fa2c1e5397afb612266944b343f2eaa868b635ddd25829f8a42",
  "blk.15.ffn_gate.weight": "1b14529d57056b79037f6cb5008132e62cc35992353b38dda59572274623103b",
  "blk.15.ffn_norm.weight": "9af77458de9ee55c66f93865759f9c2c398557f94f3fa8fa6af30543d7339cde",
  "blk.15.ffn_up.weight": "41d524a26b61a9595816b4fd53cf57ef50a702e4ef32933ff6136dca9136a267",
  "blk.15.post_attention_norm.weight": "c60a03cd0e63a7db5c80015e58e9b97ba2208caa19f66a6fef5c4447eca900ce",
  "blk.15.post_ffw_norm.weight": "34f7f9f96769215bbc3d17084df091864aef96a6645b7d0b3b7d9bd92f1a4b0b",
  "blk.16.attn_k.weight": "7e27240d9f3a8c6cf0f4a980113d43234f514eadc3e3e1792b86efb29ffb1a6d",
  "blk.16.attn_norm.weight": "af798acc0899282a30448edec48223b3e8efda177090273e612d8eca5e377301",
  "blk.16.attn_output.weight": "79df39a3709d3d53e84146291e0944a7a653d06705293d9ccb5648dceadb432c",
  "blk.16.attn_q.weight": "db58a1c3b83ad294804e5fd7321005719e200659173466df5a52a182b80b7165",
  "blk.16.attn_v.weight": "2af6d48cbaeb225b5c1a704f76abd89c8ab1521417695b112b4dcc2cbd39b74d",
  "blk.16.ffn_down.weight": "fc1c813eb5e7da3d6194569d6cb21602fc6eff2dc8e1b0eb753f2d5df148189c",
  "blk.16.ffn_gate.weight": "7a80bcbc42464bd55df4814a6edbd7b5c153e0428323bbe49de55e2d2add33e7",
  "blk.16.ffn_norm.weight": "2041685ee926d30f3f2ae4ec35b5688f1cd834167a6359a7d4057eac804c58b2",
  "blk.16.ffn_up.weight": "8da4b718973ac1d43b928829bc45e062fd101984d6c98dd825bd7c5d08ebfbe3",
  "blk.16.post_attention_norm.weight": "975c48fe680a6167438a106140a8872eee7765191f152d80e3b8ddf47693e095",
  "blk.16.post_ffw_norm.weight": "4de2d4d483acfe4fc77860ea929025df2f4e15c10729413f36a18c94eaa6d689",
  "blk.17.attn_k.weight": "f937e61f0af8c4cd98ee742648eb60e02e579683e21d421071295a3b70aebaad",
  "blk.17.attn_norm.weight": "c3270583ed28b7e423f5b170c59113234f258169b93a867d9274f4c10b7cb115",
  "blk.17.attn_output.weight": "b8c1150e81e685e539a5dcf2c19047a24eba2b281fabe166674b1d71ef4612ea",
  "blk.17.attn_q.weight": "c255100ae2011e7dc7e3bf3bc3ccd96d859fbb98581cae993d7b82c1ba8e8b39",
  "blk.17.attn_v.weight": "5830bb0a555984c6485348067f70b5d22ae337c011aa9248dac2ff4c95944551",
  "blk.17.ffn_down.weight": "8ff9a7cccaa3776434a9d895aae4fb5c36c736bf2ec98784226b4c234940fbb0",
  "blk.17.ffn_gate.weight": "1b52876739712831c272911533da206f407b46034a1a4ae8a88c1f96b6bd5747",
  "blk.17.ffn_norm.weight": "d0e16ba5e87c91b545334e022058c7d03849665c3b1a6298771b656531366b66",
  "blk.17.ffn_up.weight": "4dd6211d01dbebbe21052708eddc242b082a58b5f18ed16479e17987c1d3432e",
  "blk.17.post_attention_norm.weight": "6f49c775c7417dade77ba8268a0f8441c1e5ec28b5d7e4dc5ed07a04d04600c8",
  "blk.17.post_ffw_norm.weight": "b91a0bb2e6679e9c9be06ad323adae441d00a3d673efb19d7c4954be2aa84b27",
  "blk.18.attn_k.weight": "22b565ace1b4da8b33865a58625be1d90beea9891f29686a69fa9cf7c93217db",
  "blk.18.attn_norm.weight": "3e0160d7063c8753de65d2356a66648e47d921efdc5c917efb8209892120f8db",
  "blk.18.attn_output.weight": "e3180f0bb4ca90b31e9b08158db38e332de62dfbaefe34aa94cc316409331e09",
  "blk.18.attn_q.weight": "f3a5a83614c3ba7ea41cdd5b1b0819a241ee2a951a381ce4a9e001d3f700ed8f",
  "blk.18.attn_v.weight": "f3350a5984fb951fc738adcf78147e6d812ff1c576670c460cafc99c253c1654",
  "blk.18.ffn_down.weight": "9e9d09b13a33525e14bdaee6efc65c551ac7cf7680e534b940ab122a3a7c1ac9",
  "blk.18.ffn_gate.weight": "ebaec8b4b578a2e8d815baac12f1675c208f80c68074d5a18288a2e1a60680ee",
  "blk.18.ffn_norm.weight": "33e7687c53a242f2f8dc7093a491c97b18d4a5a8c14d183f02bd586a770f05aa",
  "blk.18.ffn_up.weight": "78a1816662378ce56cc870e705174492781897b3afd2d4d97a51f10f2f2987c1",
  "blk.18.post_attention_norm.weight": "a58dde3f12df3e94cbc27d87c8ea86f89af8a388a506446ff6758f05399b05fc",
  "blk.18.post_ffw_norm.weight": "cebf90cc143577d483cca27b032dfd82031ee59bdf17c0e2cf60a0a3ad5bf996",
  "blk.19.attn_k.weight": "4683375d0599ac9e2232196aae1e90af13a14cae26e865465de5c8e257bb2055",
  "blk.19.attn_norm.weight": "f3eba936bfb1814bbcb0a1d62739eb66daac839df8c9c836fe0e94860df88525",
  "blk.19.attn_output.weight": "51c0f01d38a9dcfe9bdbc4643576fab164c1d9e4b7168b7695c0ee55e6965667",
  "blk.19.attn_q.weight": "28d15b69b8416f2e7ddc88fe381cb1e2ef2ad705fb1c268139ba96498cc74848",
  "blk.19.attn_v.weight": "6860f1cd720638e63a981fa2c0b4db900129826bcb9823c9ddf9fb8b1b9f3383",
  "blk.19.ffn_down.weight": "bc7f2d7827ee01c2dd41401c7b3b1700ad3a4ff620e8bb734f92630d342dcc7f",
  "blk.19.ffn_gate.weight": "54d03ef69ba373fc410fbca8f1e34a565d58e4296d9a035ff7e48340b9c848e7",
  "blk.19.ffn_norm.weight": "9178fc796a340ee6e8128ca74c0cb6203d1adbed6927af4e5ac7863da57affc7",
  "blk.19.ffn_up.weight": "a77bd708026c6e83ad5c79c223278e74621bcf74a9641c7818d96b595daaad20",
  "blk.19.post_attention_norm.weight": "ae94aa26f4c411bf9496a6fd4a6df64ee589ee1ae9a04b531d45acc95721e582",
  "blk.19.post_ffw_norm.weight": "9ad210700edeef12133bdcff04bf1c7f62b49f6f4a9ba483c7cdc59857c24a5c",
  "blk.20.attn_k.weight": "e35bce1e9f4a7a09ef34721f57ea38cfca68c272f52d923fe50af8308f66cfaa",
  "blk.20.attn_norm.weight": "644800f6926fd34f233795c4dec1151a295d2138ca8cac33e3e48167d26f8b41",
  "blk.20.attn_output.weight": "8d3758cd236471741e1ad66c0710cb79077dc8c7a3a292d35bc551c0c5abe627",
  "blk.20.attn_q.weight": "c333b1f0f6f956b5d73891df10b1a0321e55fc31c40d623a24e1f52caa6a998b",
  "blk.20.attn_v.weight": "8562b418d0c4868a050fb19fa3fcaf50a8cf1c669f537d666c80c7b3a04714e1",
  "blk.20.ffn_down.weight": "97efb608ac44cc804198faec3ee66eafe56ced6b7ca5359700c6f1df75b7205e",
  "blk.20.ffn_gate.weight": "5c61151d86f28415c73c73d90ec088c646cbe5c1640197caf58eb501ba7db293",
  "blk.20.ffn_norm.weight": "24bbe0a701afd4bbeea65b3edde712b3cbb2281043bbc43dbf250582453116ed",
  "blk.20.ffn_up.weight": "e170cf68e249566aa99eb6f6b265679bf9a5a6b76830ba24e7e130c2515910c4",
  "blk.20.post_attention_norm.weight": "e092d751cfe20dbf2d348358f3b38397bd83e4ed94d6bbaa6bbaddcd902b2ac4",
  "blk.20.post_ffw_norm.weight": "219a18a47dcba76e669e4322223a5a9227bd3db1de3fbd3d3cfb22e54a783c5a",
  "blk.21.attn_k.weight": "c3a095ebddb42c63824f1c98da65263dc88e4d790a26aa1632840b44f5cc7cb1",
  "blk.21.attn_norm.weight": "ef8bbaded5fbc45ad9cf3985ae02174524e7090fe6362811124f942ef643bec7",
  "blk.21.attn_output.weight": "668f018aba72baac6252aa3ad58569ddd55ab751a0dd8d7bcc9fb9b6efb4bf53",
  "blk.21.attn_q.weight": "e759c65663089f3bbbd51847934c185e680c82f1249065d5d487da638e519e6d",
  "blk.21.attn_v.weight": "2ff57762686cf9ba1f5a6be76503454b97556ce67f4ac98254bd0562231197ba",
  "blk.21.ffn_down.weight": "3fd106556fb721b1c28ae3f4026bc83eb1b08ed910f2ba5f466c6b5f327d91cb",
  "blk.21.ffn_gate.weight": "338022d882f4b6619e8054a6fb909696fa3eef3013cf69b65c3cacdfc5b9e42c",
  "blk.21.ffn_norm.weight": "1e77660c23a3f9653ee721a863d1960f773d87437cabc4dc0a6e17ee3d4e5e44",
  "blk.21.ffn_up.weight": "7d31b20fbc2e6eba8f350f170069dc36f0cb12f68fbc4206ec5022a74085ebcb",
  "blk.21.post_attention_norm.weight": "9638bae8d8bdcd7ed68da282979cd84a07c41ff9cabcaea94ebc846a1803db23",
  "blk.21.post_ffw_norm.weight": "d622ef11115fe0cbe04b727d5a3b6371e7f39bf08c8d5eb9bc6da52e3f3cfb9d",
  "blk.22.attn_k.weight": "5c321cb29deffbe57de200dd206a62005f1e80acb86c4fd2349dd44c8d3594fd",
  "blk.22.attn_norm.weight": "198d949705d7170a331d75889d8c7500c3635254dac2cc6aa4dc35d556584536",
  "blk.22.attn_output.weight": "19805cd5d7025b457e5d41d70db8b3fd63c2dd0e4a94d3ef1704d50ef4e749e8",
  "blk.22.attn_q.weight": "177836cd583fc87405975ddc21ebfebdaa090a0363799664c72caa3da851ae2c",
  "blk.22.attn_v.weight": "fea255692483e30d0108f9e4e250eb3ed7dbda8d83f499b06519b8c223ae6096",
  "blk.22.ffn_down.weight": "00cb8939f03e5817d6d412de8cf2c923c9568d5493e382cec7faf5718fb034eb",
  "blk.22.ffn_gate.weight": "b0591065b91281b2fbd8a9567f3568d40479f680e1f0a29e27ae213f37642489",
  "blk.22.ffn_norm.weight": "96b5c5d0737c2ceb8fc869f54adb9e5f46e28cb7b177c40f49fa926b923c00f8",
  "blk.22.ffn_up.weight": "81f472185b24344ab0594ea8246cc6e200e0dc1cab4943e74fbe4ca19d5a9701",
  "blk.22.post_attention_norm.weight": "27fa9aa6260aa3071e0391e1a1d49322dcb6e8072315b8a9b7064087108dbd06",
  "blk.22.post_ffw_norm.weight": "f37e1dcd7f643d9545675ffe9dc527a11eba86eb204989c2f44f636b266d896a",
  "blk.23.attn_k.weight": "5d82f36658a56c3f94d0bb2d61f65509c966fa6568f81812e0d3e338b380ef8c",
  "blk.23.attn_norm.weight": "b7983f88d9cad88bc88a528923e6da592ad20e699965b223ebc10840fe1f4fec",
  "blk.23.attn_output.weight": "59f97f80f430d71606aab0158a195aed29ccd3405e6c0a5c41c809be8eb01898",
  "blk.23.attn_q.weight": "53ac4789fe958919cc02ea4222bcd64c0ea1b4baa54304bff46635bdf42f7490",
  "blk.23.attn_v.weight": "ec8abe09b9e84dbb52c7a068094657c6d3c62fe551ba8d7c3a3f23da622e9756",
  "blk.23.ffn_down.weight": "3cf547eccb1b82aa64f208cee9682d7f558ca84e0aead7d9d3d1420d90f3d992",
  "blk.23.ffn_gate.weight": "366aa2486d911ba81eb519119e13807deacf7e9908bc1975a2a63e00d6b10124",
  "blk.23.ffn_norm.weight": "6d1d4a4af34bb7dc090ac87d6457d398c3e0fb68bd2e2b60b099dc318b6cfac3",
  "blk.23.ffn_up.weight": "53f76692e253f5d2420b3f200c731b9f3b7a83e379920b4a067c729b4674aa4d",
  "blk.23.post_attention_norm.weight": "7c952fa0efa76b3f048c8c4c9e8dcb5e3724d231327eda6423a34d3f3d3367de",
  "blk.23.post_ffw_norm.weight": "7ab188cfe61f0a91b40309a0ab6bfa99f19d0ff2a37b6ac10e5f0c7f44eb5270",
  "blk.24.attn_k.weight": "225798792f9bfdd10eff0505ebe61e0aad0209c17b431f6044ee7968ffe8c198",
  "blk.24.attn_norm.weight": "635e3c1ebf5219bbebfc40ef164bc32d2b726ef595a94da64ac524ae878e2915",
  "blk.24.attn_output.weight": "482f5bb2db8d9ed22b253d9a3296333b239efe698e5992e5d77e7e12dc2a5cf5",
  "blk.24.attn_q.weight": "43805bbccddb65d58fffc4be9b5c374d4e1df1395ec1e1ffb4bcff03e98d5adb",
  "blk.24.attn_v.weight": "fa741af54b4a3b1775d32f59134756090c5df2e7345a12a2d8db94fe289667a7",
  "blk.24.ffn_down.weight": "83c6351e3162626b276f524a57836144625c2556dbe321b57cbd8fd486a68fab",
  "blk.24.ffn_gate.weight": "fbe66be0d84d12cea5176cc7eaef64382ffc7324cd9d6266a3342dc43442f2ac",
  "blk.24.ffn_norm.weight": "77c1445a8639ad24938bdf0280233eea2362d47391421833dfa72ec756dfc1e8",
  "blk.24.ffn_up.weight": "78235ac729ee23c1cf1ae543751e3af32776d8808cee6e529c2a625a1f027654",
  "blk.24.post_attention_norm.weight": "161f71b6d07628d43e4ae51a4c9088ec6ca2db123a17986a14505d83fdd04dad",
  "blk.24.post_ffw_norm.weight": "cf1ba692aa683368b02ac413e69b2521b98c69a5274eacbb54165b53bf38a8b2",
  "blk.25.attn_k.weight": "057a56bd8c8d2b41608d1f71faa3052902152ddf85e47669ad950c1c3e77c33f",
  "blk.25.attn_norm.weight": "b7179fe02c334da556ddcf6c1b502245639a728c4cbba8b552d8e1df4565ee9d",
  "blk.25.attn_output.weight": "4fed8b05b08a0ff75ffd022701bbeb52f17b23d09332a1ddcba737244bd0d3b0",
  "blk.25.attn_q.weight": "c52e99f5d38bf7538d6106a0bbf38ac6dc6296bca9a3f849afa384ea67b4af01",
  "blk.25.attn_v.weight": "c49c23d8e1cfa6a8eb971eb69942204890c6d7d830dc8774c84b108a80598912",
  "blk.25.ffn_down.weight": "c08d4dc8412b19fdc870c164b83c341b236ec6fe7bb4a9bcfe0dc100faa20286",
  "blk.25.ffn_gate.weight": "1a4cb3f36735d59181721471452807903006539e5e1b5ceb4f72d1d7ae134127",
  "blk.25.ffn_norm.weight": "8fd6bd0dcec5198761525a36992a57c9ec5e9da60a22092839a84ae8c4e87f26",
  "blk.25.ffn_up.weight": "3a00f39bdd5f31dc5e3b281d2002e1ac4f2475d49a0ac1d7720a25b377dcd04a",
  "blk.25.post_attention_norm.weight": "e5f31a648612c859b6d21c9ee426e87a86cb1973dfdd86276c767371d9cef5ad",
  "blk.25.post_ffw_norm.weight": "553c3bd774922c99c2384380a142d019881d30dbf0fe3bf9430dabfb3f6cbd33",
  "output_norm.weight": "49445c4585ab0a8135717a0bdb1cda4a062a030177d0119561d91542aec5744b"
}
convert/testdata/gemma-2-9b-it.json (vendored, new file, 6 lines)
@@ -0,0 +1,6 @@
{
  "general.architecture": "gemma2",
  "gemma2.attention.sliding_window": "4096",
  "gemma2.attn_logit_softcapping": "50",
  "gemma2.final_logit_softcapping": "30"
}
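(An aside, not part of the diff.) The two softcapping keys above record Gemma 2's logit soft-capping parameters, and `sliding_window` records the local-attention window the architecture alternates with global attention. A minimal Go sketch of the usual soft-cap formula, assuming the standard definition `capped = cap * tanh(logit / cap)` with the cap values from this test file:

```go
package main

import (
    "fmt"
    "math"
)

// softcap smoothly squashes a logit into (-c, +c): small values pass through
// almost unchanged, large values saturate near the cap. The metadata keys
// gemma2.attn_logit_softcapping (50) and gemma2.final_logit_softcapping (30)
// supply c for attention and final logits respectively.
func softcap(logit, c float64) float64 {
    return c * math.Tanh(logit / c)
}

func main() {
    fmt.Printf("%.2f\n", softcap(120, 50)) // ~49.18: large logits saturate
    fmt.Printf("%.2f\n", softcap(1, 50))   // ~1.00: small logits pass through
}
```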
@@ -1,7 +1,6 @@
 package convert
 
 import (
-    "cmp"
     "crypto/sha256"
     "encoding/hex"
     "encoding/json"
@@ -11,6 +10,8 @@ import (
     "log/slog"
     "os"
     "slices"
+
+    "golang.org/x/exp/maps"
 )
 
 const (
@@ -99,8 +100,21 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
     }
 
     if template, ok := p["chat_template"]; ok {
-        if err := json.Unmarshal(template, &t.Template); err != nil {
-            return nil, err
+        var s []struct {
+            Name     string `json:"name"`
+            Template string `json:"template"`
+        }
+        if err := json.Unmarshal(template, &t.Template); err == nil {
+            // noop
+        } else if err := json.Unmarshal(template, &s); err == nil {
+            for _, e := range s {
+                if e.Name == "default" {
+                    t.Template = e.Template
+                    break
+                }
+            }
+        } else {
+            return nil, fmt.Errorf("invalid chat_template: %w", err)
         }
     }
 
@@ -140,7 +154,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
     }
 
     type tokenizer struct {
-        Version     string  `json:"version"`
         AddedTokens []token `json:"added_tokens"`
         Model       struct {
             Type string `json:"type"`
@@ -184,32 +197,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
         return nil, err
     }
 
-    var tokens []token
+    tokens := make(map[int]token, len(t.Model.Vocab))
     for k, v := range t.Model.Vocab {
-        tokens = append(tokens, token{
+        tokens[v] = token{
             ID:      v,
             Content: k,
-        })
+        }
     }
 
-    for _, t := range t.AddedTokens {
-        t.UserDefined = true
-        tokens = append(tokens, t)
+    for _, token := range t.AddedTokens {
+        token.UserDefined = true
+        tokens[token.ID] = token
     }
 
-    slices.SortFunc(tokens, func(i, j token) int {
-        return cmp.Compare(i.ID, j.ID)
-    })
+    keys := maps.Keys(tokens)
+    slices.Sort(keys)
 
     v := Vocabulary{Model: "gpt2"}
-    for _, t := range tokens {
-        v.Tokens = append(v.Tokens, t.Content)
-        v.Scores = append(v.Scores, float32(t.ID))
+    for _, k := range keys {
+        token := tokens[k]
+        v.Tokens = append(v.Tokens, token.Content)
+        v.Scores = append(v.Scores, float32(token.ID))
 
         switch {
-        case t.Special:
+        case token.Special:
             v.Types = append(v.Types, tokenTypeControl)
-        case t.UserDefined:
+        case token.UserDefined:
             v.Types = append(v.Types, tokenTypeUserDefined)
         default:
             v.Types = append(v.Types, tokenTypeNormal)
@@ -238,7 +251,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
         return pattern.Func(fsys)
     }
 
-    return nil, errors.New("unknown tensor format")
+    return nil, errors.New("unknown tokenizer format")
 }
 
 type SpecialVocabulary struct {
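(An aside, not part of the diff.) The rewritten `chat_template` handling above accepts either a plain string or a list of named templates, falling back to the entry named "default". A standalone sketch of the same unmarshal-twice pattern, using a hypothetical helper name:

```go
package main

import (
    "encoding/json"
    "fmt"
)

// pickTemplate mirrors the parseTokenizer change: try the string form first,
// then the list form, and select the "default" entry from the list.
func pickTemplate(raw json.RawMessage) (string, error) {
    var tmpl string
    if err := json.Unmarshal(raw, &tmpl); err == nil {
        return tmpl, nil
    }

    var entries []struct {
        Name     string `json:"name"`
        Template string `json:"template"`
    }
    if err := json.Unmarshal(raw, &entries); err != nil {
        return "", fmt.Errorf("invalid chat_template: %w", err)
    }
    for _, e := range entries {
        if e.Name == "default" {
            return e.Template, nil
        }
    }
    return "", nil
}

func main() {
    s, _ := pickTemplate(json.RawMessage(`"<default template>"`))
    fmt.Println(s) // <default template>

    l, _ := pickTemplate(json.RawMessage(`[{"name": "default", "template": "<default template>"}, {"name": "tools", "template": "<tools template>"}]`))
    fmt.Println(l) // <default template>
}
```

The switch from a token slice to a map keyed by ID in `parseVocabularyFromTokenizer` also means an added token that shares an ID with a vocab entry now replaces it rather than producing a duplicate, which the new "added tokens overlap vocab" test below exercises.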
@@ -15,6 +15,11 @@ import (
 )
 
 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
+    ast, err := parseAdditionalSpecialTokens(fsys)
+    if err != nil {
+        return nil, err
+    }
+
     bts, err := fs.ReadFile(fsys, "tokenizer.model")
     if err != nil {
         return nil, err
@@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
             sentencepiece.ModelProto_SentencePiece_BYTE:
             v.Types = append(v.Types, int32(t))
         default:
-            v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
+            tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
+            if slices.Contains(ast, piece.GetPiece()) {
+                tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
+            }
+
+            v.Types = append(v.Types, tt)
         }
     }
 
@@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 
     return &v, nil
 }
+
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
+    f, err := fsys.Open("special_tokens_map.json")
+    if errors.Is(err, os.ErrNotExist) {
+        return nil, nil
+    } else if err != nil {
+        return nil, err
+    }
+    defer f.Close()
+
+    var m struct {
+        AdditionalSpecialTokens []string `json:"additional_special_tokens"`
+    }
+
+    if err := json.NewDecoder(f).Decode(&m); err != nil {
+        return nil, err
+    }
+
+    return m.AdditionalSpecialTokens, nil
+}
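(An aside, not part of the diff.) The new `parseAdditionalSpecialTokens` helper reads only the `additional_special_tokens` field of `special_tokens_map.json`; pieces listed there are re-typed from NORMAL to CONTROL by the `parseSentencePiece` change above. A minimal sketch of the decode step, with illustrative file contents (the token names shown are assumptions, not taken from the diff):

```go
package main

import (
    "encoding/json"
    "fmt"
    "strings"
)

func main() {
    // Hypothetical special_tokens_map.json fragment; only this field is read.
    doc := `{"additional_special_tokens": ["<start_of_turn>", "<end_of_turn>"]}`

    var m struct {
        AdditionalSpecialTokens []string `json:"additional_special_tokens"`
    }
    if err := json.NewDecoder(strings.NewReader(doc)).Decode(&m); err != nil {
        panic(err)
    }

    fmt.Println(m.AdditionalSpecialTokens) // [<start_of_turn> <end_of_turn>]
}
```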
convert/tokenizer_test.go (new file, 208 lines)
@@ -0,0 +1,208 @@
package convert

import (
    "io"
    "io/fs"
    "os"
    "path/filepath"
    "strings"
    "testing"

    "github.com/google/go-cmp/cmp"
)

func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
    t.Helper()

    for k, v := range files {
        if err := func() error {
            f, err := os.Create(filepath.Join(dir, k))
            if err != nil {
                return err
            }
            defer f.Close()

            if _, err := io.Copy(f, v); err != nil {
                return err
            }

            return nil
        }(); err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    }

    return os.DirFS(dir)
}

func TestParseTokenizer(t *testing.T) {
    cases := []struct {
        name              string
        fsys              fs.FS
        specialTokenTypes []string
        want              *Tokenizer
    }{
        {
            name: "string chat template",
            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
                "tokenizer.json": strings.NewReader(`{}`),
                "tokenizer_config.json": strings.NewReader(`{
                    "chat_template": "<default template>"
                }`),
            }),
            want: &Tokenizer{
                Vocabulary: &Vocabulary{Model: "gpt2"},
                Pre:        "default",
                Template:   "<default template>",
            },
        },
        {
            name: "list chat template",
            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
                "tokenizer.json": strings.NewReader(`{}`),
                "tokenizer_config.json": strings.NewReader(`{
                    "chat_template": [
                        {
                            "name": "default",
                            "template": "<default template>"
                        },
                        {
                            "name": "tools",
                            "template": "<tools template>"
                        }
                    ]
                }`),
            }),
            want: &Tokenizer{
                Vocabulary: &Vocabulary{Model: "gpt2"},
                Pre:        "default",
                Template:   "<default template>",
            },
        },
        {
            name: "added tokens",
            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
                "tokenizer.json": strings.NewReader(`{
                    "added_tokens": [
                        {
                            "id": 999,
                            "content": "<unused999>",
                            "special": false
                        }
                    ]
                }`),
            }),
            want: &Tokenizer{
                Vocabulary: &Vocabulary{
                    Model:  "gpt2",
                    Tokens: []string{"<unused999>"},
                    Scores: []float32{999},
                    Types:  []int32{4},
                },
                Pre: "default",
            },
        },
        {
            name: "added tokens overlap vocab",
            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
                "tokenizer.json": strings.NewReader(`{
                    "added_tokens": [
                        {
                            "id": 0,
                            "content": "<pad>",
                            "special": true
                        }
                    ],
                    "model": {
                        "vocab": {
                            "<pad>": 0
                        }
                    }
                }`),
            }),
            want: &Tokenizer{
                Vocabulary: &Vocabulary{
                    Model:  "gpt2",
                    Tokens: []string{"<pad>"},
                    Scores: []float32{0},
                    Types:  []int32{3},
                },
                Pre: "default",
            },
        },
        {
            name: "special token types",
            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
                "tokenizer.json": strings.NewReader(`{
                    "added_tokens": [
                        {
                            "id": 0,
                            "content": "<pad>",
                            "special": true
                        },
                        {
                            "id": 1,
                            "content": "<eos>",
                            "special": true
                        },
                        {
                            "id": 2,
                            "content": "<bos>",
                            "special": true
                        },
                        {
                            "id": 3,
                            "content": "<unk>",
                            "special": true
                        }
                    ],
                    "model": {
                        "vocab": {
                            "<pad>": 0,
                            "<eos>": 1,
                            "<bos>": 2,
                            "<unk>": 3
                        }
                    }
                }`),
                "tokenizer_config.json": strings.NewReader(`{
                    "add_bos_token": true,
                    "add_eos_token": false,
                    "bos_token": "<bos>",
                    "eos_token": "<eos>",
                    "pad_token": "<pad>",
                    "unk_token": "<unk>"
                }`),
            }),
            specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
            want: &Tokenizer{
                Vocabulary: &Vocabulary{
                    Model:  "gpt2",
                    Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
                    Scores: []float32{0, 1, 2, 3},
                    Types:  []int32{3, 3, 3, 3},
                },
                SpecialVocabulary: []*SpecialVocabulary{
                    {Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
                    {Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
                    {Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
                    {Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
                },
                Pre: "default",
            },
        },
    }

    for _, tt := range cases {
        t.Run(tt.name, func(t *testing.T) {
            tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes)
            if err != nil {
                t.Fatalf("unexpected error: %v", err)
            }

            if diff := cmp.Diff(tt.want, tokenizer); diff != "" {
                t.Errorf("unexpected tokenizer (-want +got):\n%s", diff)
            }
        })
    }
}
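(An aside, not part of the diff.) The numeric `Types` expectations in these cases line up with the token-type enumeration commonly used by GGUF/llama.cpp; the converter's `tokenTypeControl` and `tokenTypeUserDefined` constants are assumed to carry these values, which is what `Types: []int32{3}` (control) and `Types: []int32{4}` (user-defined) imply:

```go
package main

import "fmt"

// Token type values as commonly defined by GGUF/llama.cpp; shown for
// orientation only, as an assumption rather than a quote of the converter.
const (
    tokenTypeNormal      = 1
    tokenTypeUnknown     = 2
    tokenTypeControl     = 3
    tokenTypeUserDefined = 4
    tokenTypeUnused      = 5
    tokenTypeByte        = 6
)

func main() {
    fmt.Println(tokenTypeControl, tokenTypeUserDefined) // 3 4
}
```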
docs/api.md (133 changed lines)
@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "Why is the sky blue?"
 }'
 ```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "response": "The",
   "done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "",
   "done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "Why is the sky blue?",
   "stream": false
 }'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "The sky is blue because it is the color of the sky.",
   "done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "What color is the sky at different times of the day? Respond using JSON",
   "format": "json",
   "stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-11-09T21:07:55.186497Z",
   "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
   "done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "Why is the sky blue?",
   "stream": false,
   "options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "The sky is blue because it is the color of the sky.",
   "done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3"
+  "model": "llama3.1"
 }'
 ```
 
@@ -400,13 +400,40 @@ A single JSON object is returned:
 
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-12-18T19:52:07.071755Z",
   "response": "",
   "done": true
 }
 ```
 
+#### Unload a model
+
+If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.1",
+  "keep_alive": 0
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.1",
+  "created_at": "2024-09-12T03:54:03.516566Z",
+  "response": "",
+  "done": true,
+  "done_reason": "unload"
+}
+```
+
 ## Generate a chat completion
 
 ```shell
@@ -445,7 +472,7 @@ Send a chat message with a streaming response.
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -461,7 +488,7 @@ A stream of JSON objects is returned:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||||
"message": {
|
"message": {
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
@@ -476,7 +503,7 @@ Final response:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"total_duration": 4883583458,
|
"total_duration": 4883583458,
|
||||||
@@ -494,7 +521,7 @@ Final response:
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -509,7 +536,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "registry.ollama.ai/library/llama3:latest",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||||
"message": {
|
"message": {
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
@@ -533,7 +560,7 @@ Send a chat message with a conversation history. You can use this same approach
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -557,7 +584,7 @@ A stream of JSON objects is returned:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||||
"message": {
|
"message": {
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
@@ -571,7 +598,7 @@ Final response:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"total_duration": 8113331500,
|
"total_duration": 8113331500,
|
||||||
@@ -629,7 +656,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -647,7 +674,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "registry.ollama.ai/library/llama3:latest",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||||
"message": {
|
"message": {
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
@@ -736,6 +763,64 @@ curl http://localhost:11434/api/chat -d '{
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Load a model
|
||||||
|
|
||||||
|
If the messages array is empty, the model will be loaded into memory.
|
||||||
|
|
||||||
|
##### Request
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:11434/api/chat -d '{
|
||||||
|
"model": "llama3.1",
|
||||||
|
"messages": []
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Response
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "llama3.1",
|
||||||
|
"created_at":"2024-09-12T21:17:29.110811Z",
|
||||||
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": ""
|
||||||
|
},
|
||||||
|
"done_reason": "load",
|
||||||
|
"done": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Unload a model
|
||||||
|
|
||||||
|
If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
|
||||||
|
|
||||||
|
##### Request
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:11434/api/chat -d '{
|
||||||
|
"model": "llama3.1",
|
||||||
|
"messages": [],
|
||||||
|
"keep_alive": 0
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Response
|
||||||
|
|
||||||
|
A single JSON object is returned:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "llama3.1",
|
||||||
|
"created_at":"2024-09-12T21:33:17.547535Z",
|
||||||
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": ""
|
||||||
|
},
|
||||||
|
"done_reason": "unload",
|
||||||
|
"done": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Create a Model
|
## Create a Model
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
@@ -904,7 +989,7 @@ Show information about a model including details, modelfile, template, parameter
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/show -d '{
|
curl http://localhost:11434/api/show -d '{
|
||||||
"name": "llama3"
|
"name": "llama3.1"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -965,7 +1050,7 @@ Copy a model. Creates a model with another name from an existing model.
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/copy -d '{
|
curl http://localhost:11434/api/copy -d '{
|
||||||
"source": "llama3",
|
"source": "llama3.1",
|
||||||
"destination": "llama3-backup"
|
"destination": "llama3-backup"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
@@ -1020,7 +1105,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/pull -d '{
|
curl http://localhost:11434/api/pull -d '{
|
||||||
"name": "llama3"
|
"name": "llama3.1"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
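The load/unload hunks added above can be exercised end to end from the shell. A minimal sketch, assuming the server is running on the default port and `llama3.1` has already been pulled:

```shell
# Load the model into memory with an empty prompt ("done": true, empty response)...
curl -s http://localhost:11434/api/generate -d '{"model": "llama3.1"}'

# ...then unload it again; the response should report "done_reason": "unload".
curl -s http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
```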
docs/faq.md (25 changed lines)
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "Why is the sky blue?",
   "options": {
     "num_ctx": 4096

@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.
 ## How do I use Ollama behind a proxy?
 
-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
 
 ### How do I use Ollama behind a proxy in Docker?

@@ -191,6 +194,8 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
 
+> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
+
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 
 ## How can I use Ollama in Visual Studio Code?

@@ -232,9 +237,13 @@ ollama run llama3.1 ""
 ## How do I keep a model loaded in memory or make it unload immediately?
 
-By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you are making numerous requests to the LLM. You may, however, want to free up the memory before the 5 minutes have elapsed or keep the model loaded indefinitely. Use the `keep_alive` parameter with either the `/api/generate` and `/api/chat` API endpoints to control how long the model is left in memory.
+By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command:
 
-The `keep_alive` parameter can be set to:
+```shell
+ollama stop llama3.1
+```
+
+If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
 * a duration string (such as "10m" or "24h")
 * a number in seconds (such as 3600)
 * any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")

@@ -242,17 +251,17 @@ The `keep_alive` parameter can be set to:
 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
 ```
 
 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
 ```
 
-Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
+Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
 
-If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+The `keep_alive` API parameter with the `/api/generate` and `/api/chat` API endpoints will override the `OLLAMA_KEEP_ALIVE` setting.
 
 ## How do I manage the maximum number of requests the Ollama server can queue?
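On Linux with the systemd service, the proxy guidance above is typically applied through a drop-in override rather than a shell export. A sketch, where the proxy host and port are placeholders:

```shell
# Open a drop-in override for the ollama unit and add the HTTPS_PROXY setting.
sudo systemctl edit ollama.service
# In the editor, add:
#   [Service]
#   Environment="HTTPS_PROXY=https://proxy.example.com:3128"
sudo systemctl restart ollama
```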
docs/gpu.md

@@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
 | 9.0 | NVIDIA | `H100` |
 | 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
 | | NVIDIA Professional | `L4` `L40` `RTX 6000` |
-| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
+| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` |
 | | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
 | 8.0 | NVIDIA | `A100` `A30` |
 | 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |
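To check which compute-capability row a local card falls in, `nvidia-smi` can report it directly. A sketch, assuming a reasonably recent NVIDIA driver (the `compute_cap` query field is not available on very old drivers):

```shell
# Print each detected GPU's name and CUDA compute capability as CSV.
nvidia-smi --query-gpu=name,compute_cap --format=csv
```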
BIN docs/images/ollama-keys.png (new file, 150 KiB; binary file not shown)
BIN docs/images/signup.png (new file, 80 KiB; binary file not shown)
docs/import.md (188 changed lines)

@@ -1,44 +1,129 @@
-# Import
-
-GGUF models and select Safetensors models can be imported directly into Ollama.
-
-## Import GGUF
-
-A binary GGUF file can be imported directly into Ollama through a Modelfile.
-
-```dockerfile
-FROM /path/to/file.gguf
-```
-
-## Import Safetensors
-
-If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
-
-- LlamaForCausalLM
-- MistralForCausalLM
-- MixtralForCausalLM
-- GemmaForCausalLM
-- Phi3ForCausalLM
-
-```dockerfile
-FROM /path/to/safetensors/directory
-```
-
-For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
-
-## Automatic Quantization
-
-> [!NOTE]
-> Automatic quantization requires v0.1.35 or higher.
-
-Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
+# Importing a model
+
+## Table of Contents
+
+ * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
+ * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
+ * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
+ * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)
+
+## Importing a fine tuned adapter from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:
+
+```dockerfile
+FROM <base model name>
+ADAPTER /path/to/safetensors/adapter/directory
+```
+
+Make sure that you use the same base model in the `FROM` command as you used to create the adapter otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.
+
+Now run `ollama create` from the directory where the `Modelfile` was created:
+
+```bash
+ollama create my-model
+```
+
+Lastly, test the model:
+
+```bash
+ollama run my-model
+```
+
+Ollama supports importing adapters based on several different model architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
+  * Gemma (including Gemma 1 and Gemma 2)
+
+You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
+
+  * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
+  * [Unsloth](https://github.com/unslothai/unsloth)
+  * [MLX](https://github.com/ml-explore/mlx)
+
+## Importing a model from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:
+
+```dockerfile
+FROM /path/to/safetensors/directory
+```
+
+If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
+
+Now run the `ollama create` command from the directory where you created the `Modelfile`:
+
+```shell
+ollama create my-model
+```
+
+Lastly, test the model:
+
+```shell
+ollama run my-model
+```
+
+Ollama supports importing models for several different architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
+  * Gemma (including Gemma 1 and Gemma 2); and
+  * Phi3
+
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
+
+## Importing a GGUF based model or adapter
+
+If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
+
+  * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp;
+  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
+  * downloading a model or adapter from a place such as HuggingFace
+
+To import a GGUF model, create a `Modelfile` containing:
+
+```dockerfile
+FROM /path/to/file.gguf
+```
+
+For a GGUF adapter, create the `Modelfile` with:
+
+```dockerfile
+FROM <model name>
+ADAPTER /path/to/file.gguf
+```
+
+When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
+
+  * a model from Ollama
+  * a GGUF file
+  * a Safetensors based model
+
+Once you have created your `Modelfile`, use the `ollama create` command to build the model.
+
+```shell
+ollama create my-model
+```
+
+## Quantizing a Model
+
+Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
+
+Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
+
+First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.
+
 ```dockerfile
 FROM /path/to/my/gemma/f16/model
 ```
 
+Use `ollama create` to then create the quantized model.
+
 ```shell
-$ ollama create -q Q4_K_M mymodel
+$ ollama create --quantize q4_K_M mymodel
 transferring model data
 quantizing F16 model to Q4_K_M
 creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd

@@ -49,42 +134,53 @@ success
 
 ### Supported Quantizations
 
-- `Q4_0`
-- `Q4_1`
-- `Q5_0`
-- `Q5_1`
-- `Q8_0`
+- `q4_0`
+- `q4_1`
+- `q5_0`
+- `q5_1`
+- `q8_0`
 
 #### K-means Quantizations
 
-- `Q3_K_S`
-- `Q3_K_M`
-- `Q3_K_L`
-- `Q4_K_S`
-- `Q4_K_M`
-- `Q5_K_S`
-- `Q5_K_M`
-- `Q6_K`
+- `q3_K_S`
+- `q3_K_M`
+- `q3_K_L`
+- `q4_K_S`
+- `q4_K_M`
+- `q5_K_S`
+- `q5_K_M`
+- `q6_K`
 
-## Template Detection
-
-> [!NOTE]
-> Template detection requires v0.1.42 or higher.
-
-Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
-
-```dockerfile
-FROM /path/to/my/gemma/model
-```
-
-```shell
-$ ollama create mymodel
-transferring model data
-using autodetected template gemma-instruct
-creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
-creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
-writing manifest
-success
-```
-
-Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.
+## Sharing your model on ollama.com
+
+You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.
+
+First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
+
+<img src="images/signup.png" alt="Sign-Up" width="40%">
+
+The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
+
+Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
+
+Follow the directions on the page to determine where your Ollama Public Key is located.
+
+<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
+
+Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
+
+To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).
+
+```shell
+ollama cp mymodel myuser/mymodel
+ollama push myuser/mymodel
+```
+
+Once your model has been pushed, other users can pull and run it by using the command:
+
+```shell
+ollama run myuser/mymodel
+```
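The adapter workflow above condenses to a few commands. A minimal sketch; the base model name and adapter path are placeholders, and the base model must match the one the adapter was tuned from:

```shell
# Write a Modelfile pairing a base model with a Safetensors adapter directory.
cat > Modelfile <<'EOF'
FROM llama3.1
ADAPTER /path/to/safetensors/adapter/directory
EOF

ollama create my-tuned-model   # build the model from the Modelfile
ollama run my-tuned-model      # sanity-check the result interactively
```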
docs/linux.md (111 changed lines)

@@ -1,40 +1,59 @@
-# Ollama on Linux
+# Linux
 
 ## Install
 
-Install Ollama running this one-liner:
-
->
-
-```bash
+To install Ollama, run the following command:
+
+```shell
 curl -fsSL https://ollama.com/install.sh | sh
 ```
 
-## AMD Radeon GPU support
-
-While AMD has contributed the `amdgpu` driver upstream to the official linux
-kernel source, the version is older and may not support all ROCm features. We
-recommend you install the latest driver from
-https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-GPU.
-
 ## Manual install
 
-### Download the `ollama` binary
-
-Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
-
-```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+Download and extract the package:
+
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
+```
+
+Start Ollama:
+
+```shell
+ollama serve
+```
+
+In another terminal, verify that Ollama is running:
+
+```shell
+ollama -v
+```
+
+### AMD GPU install
+
+If you have an AMD GPU, also download and extract the additional ROCm package:
+
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
+```
+
+### ARM64 install
+
+Download and extract the ARM64-specific package:
+
+```shell
+curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
+sudo tar -C /usr -xzf ollama-linux-arm64.tgz
 ```
 
 ### Adding Ollama as a startup service (recommended)
 
-Create a user for Ollama:
+Create a user and group for Ollama:
 
-```bash
-sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
+```shell
+sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
+sudo usermod -a -G ollama $(whoami)
 ```
 
 Create a service file in `/etc/systemd/system/ollama.service`:

@@ -50,6 +69,7 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
+Environment="PATH=$PATH"
 
 [Install]
 WantedBy=default.target

@@ -57,47 +77,54 @@ WantedBy=default.target
 
 Then start the service:
 
-```bash
+```shell
 sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```
 
-### Install CUDA drivers (optional – for Nvidia GPUs)
+### Install CUDA drivers (optional)
 
 [Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
 
 Verify that the drivers are installed by running the following command, which should print details about your GPU:
 
-```bash
+```shell
 nvidia-smi
 ```
 
-### Install ROCm (optional - for Radeon GPUs)
-[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
-
-Make sure to install ROCm v6
+### Install AMD ROCm drivers (optional)
+
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
 
 ### Start Ollama
 
-Start Ollama using `systemd`:
+Start Ollama and verify it is running:
 
-```bash
+```shell
 sudo systemctl start ollama
+sudo systemctl status ollama
 ```
 
+> [!NOTE]
+> While AMD has contributed the `amdgpu` driver upstream to the official linux
+> kernel source, the version is older and may not support all ROCm features. We
+> recommend you install the latest driver from
+> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+> GPU.
 
-## Update
+## Updating
 
-Update ollama by running the install script again:
+Update Ollama by running the install script again:
 
-```bash
+```shell
 curl -fsSL https://ollama.com/install.sh | sh
 ```
 
-Or by downloading the ollama binary:
+Or by re-downloading Ollama:
 
-```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```
 
 ## Installing specific versions

@@ -106,15 +133,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s
 
 For example:
 
-```
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
+```shell
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
 ```
 
 ## Viewing logs
 
 To view logs of Ollama running as a startup service, run:
 
-```bash
+```shell
 journalctl -e -u ollama
 ```

@@ -122,7 +149,7 @@ journalctl -e -u ollama
 
 Remove the ollama service:
 
-```bash
+```shell
 sudo systemctl stop ollama
 sudo systemctl disable ollama
 sudo rm /etc/systemd/system/ollama.service

@@ -130,13 +157,13 @@ sudo rm /etc/systemd/system/ollama.service
 
 Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
 
-```bash
+```shell
 sudo rm $(which ollama)
 ```
 
 Remove the downloaded models and Ollama service user and group:
 
-```bash
+```shell
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
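After either install path above, it is worth confirming the binary, the service, and the HTTP API all respond. A sketch, assuming the systemd service was set up as described:

```shell
ollama -v                                   # binary on PATH and its version
sudo systemctl status ollama --no-pager     # service state
curl -s http://127.0.0.1:11434/api/version  # API reachable on the default port
```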
docs/modelfile.md

@@ -11,8 +11,9 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
 - [FROM (Required)](#from-required)
-- [Build from llama3](#build-from-llama3)
-- [Build from a bin file](#build-from-a-bin-file)
+- [Build from existing model](#build-from-existing-model)
+- [Build from a Safetensors model](#build-from-a-safetensors-model)
+- [Build from a GGUF file](#build-from-a-gguf-file)
 - [PARAMETER](#parameter)
 - [Valid Parameters and Values](#valid-parameters-and-values)
 - [TEMPLATE](#template)

@@ -49,7 +50,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
 
 ```modelfile
-FROM llama3
+FROM llama3.1
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token

@@ -71,10 +72,10 @@ More examples are available in the [examples directory](../examples).
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.
 
 ```bash
-> ollama show --modelfile llama3
+> ollama show --modelfile llama3.1
 # Modelfile generated by "ollama show"
 # To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama3:latest
+# FROM llama3.1:latest
 FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

@@ -99,22 +100,39 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```
 
-#### Build from llama3
+#### Build from existing model
 
 ```modelfile
-FROM llama3
+FROM llama3.1
 ```
 
 A list of available base models:
 <https://github.com/ollama/ollama#model-library>
+Additional models can be found at:
+<https://ollama.com/library>
 
-#### Build from a `bin` file
+#### Build from a Safetensors model
 
 ```modelfile
-FROM ./ollama-model.bin
+FROM <model directory>
 ```
 
-This bin file location should be specified as an absolute path or relative to the `Modelfile` location.
+The model directory should contain the Safetensors weights for a supported architecture.
+
+Currently supported model architectures:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+  * Phi3
+
+#### Build from a GGUF file
+
+```modelfile
+FROM ./ollama-model.gguf
+```
+
+The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
 
 ### PARAMETER

@@ -174,10 +192,23 @@ SYSTEM """<system message>"""
 
 ### ADAPTER
 
-The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from the behaviour will be erratic.
+
+#### Safetensor adapter
 
 ```modelfile
-ADAPTER ./ollama-lora.bin
+ADAPTER <path to safetensor adapter>
+```
+
+Currently supported Safetensor adapters:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+
+#### GGUF adapter
+
+```modelfile
+ADAPTER ./ollama-lora.gguf
 ```
 
 ### LICENSE
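To go from the `FROM`/`PARAMETER` reference above to a running model, the Modelfile can be written and built in one short session. A sketch; the name `mario` and the parameter values are illustrative:

```shell
# Create a Modelfile, build a model from it, then chat with the result.
cat > Modelfile <<'EOF'
FROM llama3.1
PARAMETER temperature 1
PARAMETER num_ctx 4096
SYSTEM You are Mario from super mario bros, acting as an assistant.
EOF

ollama create mario -f Modelfile
ollama run mario
```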
docs/openai.md

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
             'content': 'Say this is a test',
         }
     ],
-    model='llama3',
+    model='llama3.1',
 )
 
 response = client.chat.completions.create(

@@ -46,13 +46,13 @@ response = client.chat.completions.create(
 )
 
 completion = client.completions.create(
-    model="llama3",
+    model="llama3.1",
     prompt="Say this is a test",
 )
 
 list_completion = client.models.list()
 
-model = client.models.retrieve("llama3")
+model = client.models.retrieve("llama3.1")
 
 embeddings = client.embeddings.create(
     model="all-minilm",

@@ -74,7 +74,7 @@ const openai = new OpenAI({
 
 const chatCompletion = await openai.chat.completions.create({
     messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'llama3',
+    model: 'llama3.1',
 })
 
 const response = await openai.chat.completions.create({

@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
 })
 
 const completion = await openai.completions.create({
-    model: "llama3",
+    model: "llama3.1",
     prompt: "Say this is a test.",
 })
 
 const listCompletion = await openai.models.list()
 
-const model = await openai.models.retrieve("llama3")
+const model = await openai.models.retrieve("llama3.1")
 
 const embedding = await openai.embeddings.create({
     model: "all-minilm",

@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
 curl http://localhost:11434/v1/chat/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "llama3",
+        "model": "llama3.1",
         "messages": [
             {
                 "role": "system",

@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
 curl http://localhost:11434/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "llama3",
+        "model": "llama3.1",
         "prompt": "Say this is a test"
     }'
 
 curl http://localhost:11434/v1/models
 
-curl http://localhost:11434/v1/models/llama3
+curl http://localhost:11434/v1/models/llama3.1
 
 curl http://localhost:11434/v1/embeddings \
     -H "Content-Type: application/json" \

@@ -182,7 +182,6 @@ curl http://localhost:11434/v1/embeddings \
 - [x] Reproducible outputs
 - [x] Vision
 - [x] Tools (streaming support coming soon)
-- [ ] Vision
 - [ ] Logprobs
 
 #### Supported request fields

@@ -275,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
 Before using a model, pull it locally `ollama pull`:
 
 ```shell
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 ### Default model names

@@ -283,7 +282,7 @@ ollama pull llama3
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
 
 ```
-ollama cp llama3 gpt-3.5-turbo
+ollama cp llama3.1 gpt-3.5-turbo
 ```
 
 Afterwards, this new model name can be specified in the `model` field:

@@ -301,3 +300,28 @@ curl http://localhost:11434/v1/chat/completions \
     ]
 }'
 ```
+
+### Setting the context size
+
+The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
+
+```modelfile
+FROM <some model>
+PARAMETER num_ctx <context size>
+```
+
+Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
+
+```shell
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "mymodel",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
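The embeddings endpoint is truncated in the hunks above; the full request follows the OpenAI shape. A sketch, assuming `all-minilm` has already been pulled:

```shell
curl http://localhost:11434/v1/embeddings \
    -H "Content-Type: application/json" \
    -d '{
        "model": "all-minilm",
        "input": "why is the sky blue?"
    }'
```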
docs/template.md

@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
 To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
 
 ```dockerfile
-FROM llama3
+FROM llama3.1
 
 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

@@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template
 ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2.
 
 ```gotmpl
-{{- if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}
 {{- range .Messages }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
-{{ else }}
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
 ```
 
 ### Example Tools
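Before overriding a template as shown above, it helps to inspect the one a model already ships with; `ollama show` exposes it directly (the flag mirrors the `--modelfile` usage shown elsewhere in these docs):

```shell
# Print the Go template bundled with the model's metadata.
ollama show --template llama3.1
```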
docs/troubleshooting.md

@@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
 
+
+## AMD GPU Discovery
+
+On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
+
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+
+If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
+- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
+- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
+- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
+
 ## Windows Terminal Errors
 
 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer.
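The container advice in the new section translates into device and group flags on the container runtime. A sketch for Docker; the group names vary by distribution, so check the `ls -ld` output first:

```shell
# On the host: see which groups own the AMD device nodes.
ls -ld /dev/kfd /dev/dri /dev/dri/*

# Pass the devices and matching groups through to the ROCm image.
docker run --device /dev/kfd --device /dev/dri \
  --group-add video --group-add render \
  -v ollama:/root/.ollama -p 11434:11434 ollama/ollama:rocm
```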
docs/windows.md

@@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
 
 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```
 
 ## Troubleshooting

@@ -48,6 +48,9 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
 
+## Uninstall
+
+The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
+
 ## Standalone CLI
|
|||||||
@@ -30,9 +30,7 @@ func Host() *url.URL {
|
|||||||
defaultPort = "443"
|
defaultPort = "443"
|
||||||
}
|
}
|
||||||
|
|
||||||
// trim trailing slashes
|
hostport, path, _ := strings.Cut(hostport, "/")
|
||||||
hostport = strings.TrimRight(hostport, "/")
|
|
||||||
|
|
||||||
host, port, err := net.SplitHostPort(hostport)
|
host, port, err := net.SplitHostPort(hostport)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
host, port = "127.0.0.1", defaultPort
|
host, port = "127.0.0.1", defaultPort
|
||||||
@@ -45,15 +43,13 @@ func Host() *url.URL {
|
|||||||
|
|
||||||
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
|
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
|
||||||
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
|
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
|
||||||
return &url.URL{
|
port = defaultPort
|
||||||
Scheme: scheme,
|
|
||||||
Host: net.JoinHostPort(host, defaultPort),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &url.URL{
|
return &url.URL{
|
||||||
Scheme: scheme,
|
Scheme: scheme,
|
||||||
Host: net.JoinHostPort(host, port),
|
Host: net.JoinHostPort(host, port),
|
||||||
|
Path: path,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -116,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
|
|||||||
return keepAlive
|
return keepAlive
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
|
||||||
|
// Zero or Negative values are treated as infinite.
|
||||||
|
// Default is 5 minutes.
|
||||||
|
func LoadTimeout() (loadTimeout time.Duration) {
|
||||||
|
loadTimeout = 5 * time.Minute
|
||||||
|
if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
|
||||||
|
if d, err := time.ParseDuration(s); err == nil {
|
||||||
|
loadTimeout = d
|
||||||
|
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
||||||
|
loadTimeout = time.Duration(n) * time.Second
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if loadTimeout <= 0 {
|
||||||
|
return time.Duration(math.MaxInt64)
|
||||||
|
}
|
||||||
|
|
||||||
|
return loadTimeout
|
||||||
|
}
|
||||||
|
|
||||||
func Bool(k string) func() bool {
|
func Bool(k string) func() bool {
|
||||||
return func() bool {
|
return func() bool {
|
||||||
if s := Var(k); s != "" {
|
if s := Var(k); s != "" {
|
||||||
@@ -163,53 +179,6 @@ var (
|
|||||||
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
|
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
|
||||||
)
|
)
|
||||||
|
|
||||||
func RunnersDir() (p string) {
|
|
||||||
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
|
|
||||||
return p
|
|
||||||
}
|
|
||||||
|
|
||||||
if runtime.GOOS != "windows" {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
defer func() {
|
|
||||||
if p == "" {
|
|
||||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// On Windows we do not carry the payloads inside the main executable
|
|
||||||
exe, err := os.Executable()
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
cwd, err := os.Getwd()
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var paths []string
|
|
||||||
for _, root := range []string{filepath.Dir(exe), cwd} {
|
|
||||||
paths = append(paths,
|
|
||||||
root,
|
|
||||||
filepath.Join(root, "windows-"+runtime.GOARCH),
|
|
||||||
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try a few variations to improve developer experience when building from source in the local tree
|
|
||||||
for _, path := range paths {
|
|
||||||
candidate := filepath.Join(path, "ollama_runners")
|
|
||||||
if _, err := os.Stat(candidate); err == nil {
|
|
||||||
p = candidate
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return p
|
|
||||||
}
|
|
||||||
|
|
||||||
func Uint(key string, defaultValue uint) func() uint {
|
func Uint(key string, defaultValue uint) func() uint {
|
||||||
return func() uint {
|
return func() uint {
|
||||||
if s := Var(key); s != "" {
|
if s := Var(key); s != "" {
|
||||||
@@ -235,6 +204,23 @@ var (
|
|||||||
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func Uint64(key string, defaultValue uint64) func() uint64 {
|
||||||
|
return func() uint64 {
|
||||||
|
if s := Var(key); s != "" {
|
||||||
|
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
|
||||||
|
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
||||||
|
} else {
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set aside VRAM per GPU
|
||||||
|
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
||||||
|
|
||||||
type EnvVar struct {
|
type EnvVar struct {
|
||||||
Name string
|
Name string
|
||||||
Value any
|
Value any
|
||||||
@@ -245,9 +231,11 @@ func AsMap() map[string]EnvVar {
|
|||||||
ret := map[string]EnvVar{
|
ret := map[string]EnvVar{
|
||||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
||||||
|
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
||||||
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
||||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
||||||
|
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
|
||||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
||||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
||||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
||||||
@@ -255,10 +243,22 @@ func AsMap() map[string]EnvVar {
|
|||||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
|
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
|
||||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
|
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
|
||||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
||||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
|
|
||||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
||||||
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
|
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
|
||||||
|
|
||||||
|
// Informational
|
||||||
|
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
|
||||||
|
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
|
||||||
|
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if runtime.GOOS != "windows" {
|
||||||
|
// Windows environment variables are case-insensitive so there's no need to duplicate them
|
||||||
|
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
|
||||||
|
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
|
||||||
|
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
|
||||||
|
}
|
||||||
|
|
||||||
if runtime.GOOS != "darwin" {
|
if runtime.GOOS != "darwin" {
|
||||||
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
||||||
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
||||||
@@ -267,6 +267,7 @@ func AsMap() map[string]EnvVar {
|
|||||||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
||||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -282,3 +283,12 @@ func Values() map[string]string {
|
|||||||
func Var(key string) string {
|
func Var(key string) string {
|
||||||
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
|
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// On windows, we keep the binary at the top directory, but
|
||||||
|
// other platforms use a "bin" directory, so this returns ".."
|
||||||
|
func LibRelativeToExe() string {
|
||||||
|
if runtime.GOOS == "windows" {
|
||||||
|
return "."
|
||||||
|
}
|
||||||
|
return ".."
|
||||||
|
}
|
||||||
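Not part of the diff itself: a minimal usage sketch of the accessor pattern these hunks extend. The import path is taken from the go.mod shown later in this compare, and the behavior follows the Uint64 code above.

```go
package main

import (
    "fmt"
    "os"

    "github.com/ollama/ollama/envconfig"
)

func main() {
    // Accessors are funcs, so the environment is re-read on every call.
    os.Setenv("OLLAMA_GPU_OVERHEAD", "1073741824") // reserve 1 GiB per GPU, in bytes
    fmt.Println(envconfig.GpuOverhead())           // 1073741824

    // A malformed value logs a warning and falls back to the default (0).
    os.Setenv("OLLAMA_GPU_OVERHEAD", "lots")
    fmt.Println(envconfig.GpuOverhead()) // 0
}
```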
@@ -13,34 +13,35 @@ func TestHost(t *testing.T) {
 		value  string
 		expect string
 	}{
-		"empty":               {"", "127.0.0.1:11434"},
+		"empty":               {"", "http://127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
+		"only address":        {"1.2.3.4", "http://1.2.3.4:11434"},
-		"only port":           {":1234", ":1234"},
+		"only port":           {":1234", "http://:1234"},
-		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
+		"address and port":    {"1.2.3.4:1234", "http://1.2.3.4:1234"},
-		"hostname":            {"example.com", "example.com:11434"},
+		"hostname":            {"example.com", "http://example.com:11434"},
-		"hostname and port":   {"example.com:1234", "example.com:1234"},
+		"hostname and port":   {"example.com:1234", "http://example.com:1234"},
-		"zero port":           {":0", ":0"},
+		"zero port":           {":0", "http://:0"},
-		"too large port":      {":66000", ":11434"},
+		"too large port":      {":66000", "http://:11434"},
-		"too small port":      {":-1", ":11434"},
+		"too small port":      {":-1", "http://:11434"},
-		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
+		"ipv6 localhost":      {"[::1]", "http://[::1]:11434"},
-		"ipv6 world open":     {"[::]", "[::]:11434"},
+		"ipv6 world open":     {"[::]", "http://[::]:11434"},
-		"ipv6 no brackets":    {"::1", "[::1]:11434"},
+		"ipv6 no brackets":    {"::1", "http://[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
+		"ipv6 + port":         {"[::1]:1337", "http://[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
+		"extra space":         {" 1.2.3.4 ", "http://1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
+		"extra quotes":        {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
+		"extra space+quotes":  {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
+		"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
+		"http":                {"http://1.2.3.4", "http://1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
+		"http port":           {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
+		"https":               {"https://1.2.3.4", "https://1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
+		"https port":          {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
+		"proxy path":          {"https://example.com/ollama", "https://example.com:443/ollama"},
 	}

 	for name, tt := range cases {
 		t.Run(name, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.Host != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
+			if host := Host(); host.String() != tt.expect {
+				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
 			}
 		})
 	}
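The rewritten expectations above reflect that Host() now yields a full URL, scheme included, rather than a bare host:port. A small sketch of the new behavior (not part of the diff; expected outputs taken from the test table):

```go
package main

import (
    "fmt"
    "os"

    "github.com/ollama/ollama/envconfig"
)

func main() {
    os.Setenv("OLLAMA_HOST", "example.com")
    u := envconfig.Host()         // a URL value, per the updated tests
    fmt.Println(u.String())       // http://example.com:11434
    fmt.Println(u.Scheme, u.Host) // http example.com:11434

    os.Setenv("OLLAMA_HOST", "https://1.2.3.4")
    fmt.Println(envconfig.Host().String()) // https://1.2.3.4:443
}
```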
@@ -214,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
 	}
 }

+func TestLoadTimeout(t *testing.T) {
+	defaultTimeout := 5 * time.Minute
+	cases := map[string]time.Duration{
+		"":       defaultTimeout,
+		"1s":     time.Second,
+		"1m":     time.Minute,
+		"1h":     time.Hour,
+		"5m0s":   defaultTimeout,
+		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
+		"0":      time.Duration(math.MaxInt64),
+		"60":     60 * time.Second,
+		"120":    2 * time.Minute,
+		"3600":   time.Hour,
+		"-0":     time.Duration(math.MaxInt64),
+		"-1":     time.Duration(math.MaxInt64),
+		"-1m":    time.Duration(math.MaxInt64),
+		// invalid values
+		" ":   defaultTimeout,
+		"???": defaultTimeout,
+		"1d":  defaultTimeout,
+		"1y":  defaultTimeout,
+		"1w":  defaultTimeout,
+	}
+
+	for tt, expect := range cases {
+		t.Run(tt, func(t *testing.T) {
+			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
+			if actual := LoadTimeout(); actual != expect {
+				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
+			}
+		})
+	}
+}
+
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value": "value",
@@ -1,6 +1,6 @@
 langchain==0.0.274
 gpt4all==1.0.8
-chromadb==0.4.7
+chromadb==0.5.0
 llama-cpp-python==0.1.81
 urllib3==2.0.4
 PyMuPDF==1.23.5
examples/python-grounded-factuality-rag-check/README.md (new file, 93 lines)
@@ -0,0 +1,93 @@
+# RAG Hallucination Checker using Bespoke-Minicheck
+
+This example allows the user to ask questions related to a document, which can be specified via an article URL. Relevant chunks are retrieved from the document and given to `llama3.1` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
+
+## Running the Example
+
+1. Ensure the `all-minilm` (embedding), `llama3.1` (chat), and `bespoke-minicheck` (check) models are installed:
+
+   ```bash
+   ollama pull all-minilm
+   ollama pull llama3.1
+   ollama pull bespoke-minicheck
+   ```
+
+2. Install the dependencies.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
+
+## Expected Output
+
+```text
+Enter the URL of an article you want to chat with, or press Enter for default example:
+
+Loaded, chunked, and embedded text from https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt.
+
+Enter your question or type quit: Who is the CEO of openai?
+
+Retrieved chunks:
+OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It ’ s being released alongside o1-mini , a smaller , cheaper version . And yes , if you ’ re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence .
+
+OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It ’ s being released alongside o1-mini , a smaller , cheaper version . And yes , if you ’ re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence . More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week .
+
+More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
+
+OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI ’ s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
+
+LLM Answer:
+The text does not mention the CEO of OpenAI. It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
+
+LLM Claim: The text does not mention the CEO of OpenAI.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
+Is this claim supported by the context according to bespoke-minicheck? No
+```
+
+The second claim is unsupported since the text mentions the research lead.
+
+Another tricky example:
+
+```text
+Enter your question or type quit: what sets o1 apart from gpt-4o?
+
+Retrieved chunks:
+OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI ’ s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
+
+He says OpenAI also tested o1 against a qualifying exam for the International Mathematics Olympiad , and while GPT-4o only correctly solved only 13 percent of problems , o1 scored 83 percent . “ We can ’ t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn ’ t do as well on factual knowledge about the world .
+
+More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
+
+“ We can ’ t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn ’ t do as well on factual knowledge about the world . It also doesn ’ t have the ability to browse the web or process files and images . Still , the company believes it represents a brand-new class of capabilities . It was named o1 to indicate “ resetting the counter back to 1. ” “ I ’ m gon na be honest : I think we ’ re terrible at naming , traditionally , ” McGrew says .
+
+LLM Answer: According to the text, several things set o1 apart from GPT-4o:
+
+* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
+* The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
+* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
+* However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
+
+LLM Claim: According to the text, several things set o1 apart from GPT-4o:
+
+* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: * The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: * o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
+Is this claim supported by the context according to bespoke-minicheck? No
+
+LLM Claim: * However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+```
+
+We see that the third claim "* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance." is not supported by the context. This is because the context only mentions that o1 "is claimed to perform" which is different from "has been shown to perform".
examples/python-grounded-factuality-rag-check/main.py (new file, 137 lines)
@@ -0,0 +1,137 @@
+import ollama
+import warnings
+from mattsollamatools import chunker
+from newspaper import Article
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+import nltk
+
+warnings.filterwarnings(
+    "ignore", category=FutureWarning, module="transformers.tokenization_utils_base"
+)
+nltk.download("punkt", quiet=True)
+
+
+def getArticleText(url):
+    """Gets the text of an article from a URL.
+
+    Often there are a bunch of ads and menus on pages for a news article.
+    This uses newspaper3k to get just the text of the article.
+    """
+    article = Article(url)
+    article.download()
+    article.parse()
+    return article.text
+
+
+def knn_search(question_embedding, embeddings, k=5):
+    """Performs K-nearest neighbors (KNN) search"""
+    X = np.array(
+        [item["embedding"] for article in embeddings for item in article["embeddings"]]
+    )
+    source_texts = [
+        item["source"] for article in embeddings for item in article["embeddings"]
+    ]
+
+    # Fit a KNN model on the embeddings
+    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
+    knn.fit(X)
+
+    # Find the indices and distances of the k-nearest neighbors.
+    _, indices = knn.kneighbors(question_embedding, n_neighbors=k)
+
+    # Get the indices and source texts of the best matches
+    best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]
+
+    return best_matches
+
+
+def check(document, claim):
+    """Checks if the claim is supported by the document by calling bespoke-minicheck.
+
+    Returns Yes/yes if the claim is supported by the document, No/no otherwise.
+    Support for logits will be added in the future.
+
+    bespoke-minicheck's system prompt is defined as:
+      'Determine whether the provided claim is consistent with the corresponding
+      document. Consistency in this context implies that all information presented in the claim
+      is substantiated by the document. If not, it should be considered inconsistent. Please
+      assess the claim's consistency with the document by responding with either "Yes" or "No".'
+
+    bespoke-minicheck's user prompt is defined as:
+      "Document: {document}\nClaim: {claim}"
+    """
+    prompt = f"Document: {document}\nClaim: {claim}"
+    response = ollama.generate(
+        model="bespoke-minicheck", prompt=prompt, options={"num_predict": 2, "temperature": 0.0}
+    )
+    return response["response"].strip()
+
+
+if __name__ == "__main__":
+    allEmbeddings = []
+    default_url = "https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt"
+    user_input = input(
+        "Enter the URL of an article you want to chat with, or press Enter for default example: "
+    )
+    article_url = user_input.strip() if user_input.strip() else default_url
+    article = {}
+    article["embeddings"] = []
+    article["url"] = article_url
+    text = getArticleText(article_url)
+    chunks = chunker(text)
+
+    # Embed (batch) chunks using ollama
+    embeddings = ollama.embed(model="all-minilm", input=chunks)["embeddings"]
+
+    for chunk, embedding in zip(chunks, embeddings):
+        item = {}
+        item["source"] = chunk
+        item["embedding"] = embedding
+        item["sourcelength"] = len(chunk)
+        article["embeddings"].append(item)
+
+    allEmbeddings.append(article)
+
+    print(f"\nLoaded, chunked, and embedded text from {article_url}.\n")
+
+    while True:
+        # Input a question from the user
+        # For example, "Who is the chief research officer?"
+        question = input("Enter your question or type quit: ")
+
+        if question.lower() == "quit":
+            break
+
+        # Embed the user's question using ollama.embed
+        question_embedding = ollama.embed(model="all-minilm", input=question)[
+            "embeddings"
+        ]
+
+        # Perform KNN search to find the best matches (indices and source text)
+        best_matches = knn_search(question_embedding, allEmbeddings, k=4)
+
+        sourcetext = "\n\n".join([source_text for (_, source_text) in best_matches])
+
+        print(f"\nRetrieved chunks: \n{sourcetext}\n")
+
+        # Give the retrieved chunks and question to the chat model
+        system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"
+
+        ollama_response = ollama.generate(
+            model="llama3.1",
+            prompt=question,
+            system=system_prompt,
+            options={"stream": False},
+        )
+
+        answer = ollama_response["response"]
+        print(f"LLM Answer:\n{answer}\n")
+
+        # Check each sentence in the response for grounded factuality
+        if answer:
+            for claim in nltk.sent_tokenize(answer):
+                print(f"LLM Claim: {claim}")
+                print(
+                    f"Is this claim supported by the context according to bespoke-minicheck? {check(sourcetext, claim)}\n"
+                )
@@ -0,0 +1,8 @@
+ollama
+lxml==5.3.0
+lxml_html_clean==0.2.2
+mattsollamatools==0.0.25
+newspaper3k==0.2.8
+nltk==3.9.1
+numpy==1.26.4
+scikit-learn==1.5.2
examples/python-grounded-factuality-simple-check/main.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+"""Simple example to demonstrate how to use the bespoke-minicheck model."""
+
+import ollama
+
+# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
+
+
+def check(document, claim):
+    """Checks if the claim is supported by the document by calling bespoke-minicheck.
+
+    Returns Yes/yes if the claim is supported by the document, No/no otherwise.
+    Support for logits will be added in the future.
+
+    bespoke-minicheck's system prompt is defined as:
+      'Determine whether the provided claim is consistent with the corresponding
+      document. Consistency in this context implies that all information presented in the claim
+      is substantiated by the document. If not, it should be considered inconsistent. Please
+      assess the claim's consistency with the document by responding with either "Yes" or "No".'
+
+    bespoke-minicheck's user prompt is defined as:
+      "Document: {document}\nClaim: {claim}"
+    """
+    prompt = f"Document: {document}\nClaim: {claim}"
+    response = ollama.generate(
+        model="bespoke-minicheck", prompt=prompt, options={"num_predict": 2, "temperature": 0.0}
+    )
+    return response["response"].strip()
+
+
+def get_user_input(prompt):
+    user_input = input(prompt)
+    if not user_input:
+        exit()
+    print()
+    return user_input
+
+
+def main():
+    while True:
+        # Get a document from the user (e.g. "Ryan likes running and biking.")
+        document = get_user_input("Enter a document: ")
+        # Get a claim from the user (e.g. "Ryan likes to run.")
+        claim = get_user_input("Enter a claim: ")
+        # Check if the claim is supported by the document
+        grounded_factuality_check = check(document, claim)
+        print(
+            f"Is the claim supported by the document according to bespoke-minicheck? {grounded_factuality_check}"
+        )
+        print("\n\n")
+
+
+if __name__ == "__main__":
+    main()
examples/python-grounded-factuality-simple-check/readme.md (new file, 54 lines)
@@ -0,0 +1,54 @@
+# Simple Bespoke-Minicheck Example
+
+`bespoke-minicheck` is a model for checking if a claim is supported by a document. It is used through the **generate** endpoint, which is called in this example with a `prompt` that includes the expected formatting of the user input.
+
+## Running the Example
+
+1. Ensure you have the `bespoke-minicheck` model installed:
+
+   ```bash
+   ollama pull bespoke-minicheck
+   ```
+
+2. Install the dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the program:
+
+   ```bash
+   python main.py
+   ```
+
+4. Enter a document and a claim when prompted:
+
+   ```bash
+   Enter a document: Roses are red.
+
+   Enter a claim: Roses are blue.
+   ```
+
+The claim and document are then given to the `bespoke-minicheck` model as inputs, which then generates a response (Yes or No) on whether the claim is supported by the document.
+
+```bash
+Is the claim supported by the document according to bespoke-minicheck? No
+```
+
+## More Examples
+
+Document ([source](https://en.wikipedia.org/wiki/Apple_I)):
+> The Apple Computer 1 (Apple-1[a]), later known predominantly as the Apple I (written with a Roman numeral),[b] is an 8-bit motherboard-only personal computer designed by Steve Wozniak[5][6] and released by the Apple Computer Company (now Apple Inc.) in 1976. The company was initially formed to sell the Apple I – its first product – and would later become the world's largest technology company.[7] The idea of starting a company and selling the computer came from Wozniak's friend and Apple co-founder Steve Jobs.[8][9] One of the main innovations of the Apple I was that it included video display terminal circuitry on its circuit board, allowing it to connect to a low-cost composite video monitor or television, instead of an expensive computer terminal, compared to most existing computers at the time.
+
+Claim:
+> The Apple I is a 16-bit computer.
+
+Expected output:
+> Is the claim supported by the document according to bespoke-minicheck? **No**
+
+Claim:
+> Apple was originally called the Apple Computer Company.
+
+Expected output:
+> Is the claim supported by the document according to bespoke-minicheck? **Yes**
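For readers following along in Go rather than Python, here is a minimal sketch of the same check using the repo's `api` client package. The client calls are from the public Go API; the document/claim strings simply mirror the example above (this sketch is not part of the diff):

```go
package main

import (
    "context"
    "fmt"
    "log"

    "github.com/ollama/ollama/api"
)

func main() {
    client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST
    if err != nil {
        log.Fatal(err)
    }
    stream := false
    req := &api.GenerateRequest{
        Model:  "bespoke-minicheck",
        Prompt: "Document: Roses are red.\nClaim: Roses are blue.",
        Stream: &stream,
        // num_predict/temperature mirror the Python example's options
        Options: map[string]any{"num_predict": 2, "temperature": 0.0},
    }
    err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
        fmt.Println(resp.Response) // expected: "No"
        return nil
    })
    if err != nil {
        log.Fatal(err)
    }
}
```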
@@ -0,0 +1 @@
+ollama
@@ -4,5 +4,5 @@ SYSTEM """
 You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
 """

-PARAMETER TEMPERATURE 0.3
+PARAMETER temperature 0.3
@@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory.
 2. Install the Python Requirements.

    ```bash
+   python3 -m venv .venv
+   source .venv/bin/activate
    pip install -r requirements.txt
    ```

@@ -1 +1 @@
-Requests==2.31.0
+Requests>=2.32.3
go.mod (2 changes)
@@ -1,6 +1,6 @@
 module github.com/ollama/ollama

-go 1.22.0
+go 1.22.5

 require (
 	github.com/containerd/console v1.0.3
@@ -9,6 +9,8 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+
+	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -54,7 +56,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
@@ -34,10 +34,10 @@ type HipLib struct {
 }

 func NewHipLib() (*HipLib, error) {
-	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs/ this repo will consist with v5.7
-	h, err := windows.LoadLibrary("amdhip64.dll")
+	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
+	h, err := windows.LoadLibrary("amdhip64_6.dll")
 	if err != nil {
-		return nil, fmt.Errorf("unable to load amdhip64.dll, please make sure to upgrade to the latest amd driver: %w", err)
+		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
 	}
 	hl := &HipLib{}
 	hl.dll = h
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	if len(resp) == 0 {
 		slog.Info("no compatible amdgpu devices detected")
 	}
+	if err := verifyKFDDriverAccess(); err != nil {
+		slog.Error("amdgpu devices detected but permission problems block access", "error", err)
+		return nil
+	}
 	return resp
 }

@@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
 	}
 	return usedMemory, nil
 }
+
+func verifyKFDDriverAccess() error {
+	// Verify we have permissions - either running as root, or we have group access to the driver
+	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
+	if err != nil {
+		if errors.Is(err, fs.ErrPermission) {
+			return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add your user account to the render group. %w", err)
+		} else if errors.Is(err, fs.ErrNotExist) {
+			// Container runtime failure?
+			return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
+		}
+		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+	}
+	fd.Close()
+	return nil
+}
@@ -23,7 +23,7 @@ const (
 var (
 	// Used to validate if the given ROCm lib is usable
 	ROCmLibGlobs          = []string{"hipblas.dll", "rocblas"}                 // This is not sufficient to discern v5 vs v6
-	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
+	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
 )

 func AMDGetGPUInfo() []RocmGPUInfo {
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
gpu/assets.go (deleted, 148 lines)
@@ -1,148 +0,0 @@
-package gpu
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"syscall"
-	"time"
-
-	"github.com/ollama/ollama/envconfig"
-)
-
-var (
-	lock        sync.Mutex
-	payloadsDir = ""
-)
-
-func PayloadsDir() (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
-	if payloadsDir == "" {
-		runnersDir := envconfig.RunnersDir()
-
-		if runnersDir != "" {
-			payloadsDir = runnersDir
-			return payloadsDir, nil
-		}
-
-		// The remainder only applies on non-windows where we still carry payloads in the main executable
-		cleanupTmpDirs()
-		tmpDir := envconfig.TmpDir()
-		if tmpDir == "" {
-			tmpDir, err = os.MkdirTemp("", "ollama")
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-			}
-		} else {
-			err = os.MkdirAll(tmpDir, 0o755)
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
-			}
-		}
-
-		// Track our pid so we can clean up orphaned tmpdirs
-		n := filepath.Join(tmpDir, "ollama.pid")
-		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
-		}
-
-		// We create a distinct subdirectory for payloads within the tmpdir
-		// This will typically look like /tmp/ollama3208993108/runners on linux
-		payloadsDir = filepath.Join(tmpDir, "runners")
-	}
-	return payloadsDir, nil
-}
-
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
-	}
-}
-
-func Cleanup() {
-	lock.Lock()
-	defer lock.Unlock()
-	runnersDir := envconfig.RunnersDir()
-	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
-			time.Sleep(1000 * time.Millisecond)
-			err = os.RemoveAll(tmpDir)
-			if err != nil {
-				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-			}
-		}
-	}
-}
-
-func UpdatePath(dir string) {
-	if runtime.GOOS == "windows" {
-		tmpDir := filepath.Dir(dir)
-		pathComponents := strings.Split(os.Getenv("PATH"), ";")
-		i := 0
-		for _, comp := range pathComponents {
-			if strings.EqualFold(comp, dir) {
-				return
-			}
-			// Remove any other prior paths to our temp dir
-			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
-				pathComponents[i] = comp
-				i++
-			}
-		}
-		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-		slog.Info("updating", "PATH", newPath)
-		os.Setenv("PATH", newPath)
-	}
-	// linux and darwin rely on rpath
-}
|||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||||
|
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||||
|
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||||
|
|
||||||
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||||
ids := []string{}
|
ids := []string{}
|
||||||
for _, info := range gpuInfo {
|
for _, info := range gpuInfo {
|
||||||
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|||||||
}
|
}
|
||||||
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func cudaVariant(gpuInfo CudaGPUInfo) string {
|
||||||
|
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
|
||||||
|
if CudaTegra != "" {
|
||||||
|
ver := strings.Split(CudaTegra, ".")
|
||||||
|
if len(ver) > 0 {
|
||||||
|
return "jetpack" + ver[0]
|
||||||
|
}
|
||||||
|
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
|
||||||
|
r := regexp.MustCompile(` R(\d+) `)
|
||||||
|
m := r.FindSubmatch(data)
|
||||||
|
if len(m) != 2 {
|
||||||
|
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
|
||||||
|
} else {
|
||||||
|
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
|
||||||
|
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
|
||||||
|
// https://developer.nvidia.com/embedded/jetpack-archive
|
||||||
|
switch l4t {
|
||||||
|
case 35:
|
||||||
|
return "jetpack5"
|
||||||
|
case 36:
|
||||||
|
return "jetpack6"
|
||||||
|
default:
|
||||||
|
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
|
||||||
|
return "v11"
|
||||||
|
}
|
||||||
|
return "v12"
|
||||||
|
}
|
||||||
|
|||||||
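A quick sanity sketch of the version gate at the end of cudaVariant above; the struct here is a local stand-in for the fields the diff adds, and the sample values are illustrative, not taken from the diff:

```go
package main

import "fmt"

// Minimal stand-ins for the fields cudaVariant consults.
type cudaGPUInfo struct {
    computeMajor, DriverMajor, DriverMinor int
}

func variant(g cudaGPUInfo) string {
    if g.computeMajor < 6 || g.DriverMajor < 12 || (g.DriverMajor == 12 && g.DriverMinor == 0) {
        return "v11"
    }
    return "v12"
}

func main() {
    fmt.Println(variant(cudaGPUInfo{computeMajor: 7, DriverMajor: 12, DriverMinor: 0})) // v11: CUDA 12.0 driver
    fmt.Println(variant(cudaGPUInfo{computeMajor: 5, DriverMajor: 12, DriverMinor: 4})) // v11: pre-Pascal GPU
    fmt.Println(variant(cudaGPUInfo{computeMajor: 8, DriverMajor: 12, DriverMinor: 4})) // v12
}
```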
gpu/gpu.go (79 changes)
@@ -64,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -97,10 +93,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	tmpDir, _ := PayloadsDir()
-	if tmpDir != "" {
-		// TODO - add "payloads" for subprocess
-		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	libDir := LibraryDir()
+	if libDir != "" {
+		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

@@ -215,7 +210,7 @@ func GetGPUInfo() GpuInfoList {
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant: cpuCapability,
+				Variant: cpuCapability.String(),
 				ID:      "0",
 			},
 		},
@@ -229,11 +224,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}

-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := LibraryDir()

 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -269,11 +260,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DependencyPath = depPath
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				variant := cudaVariant(gpuInfo)
+				if depPath != "" {
+					gpuInfo.DependencyPath = depPath
+					// Check for variant specific directory
+					if variant != "" {
+						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
+							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						}
+					}
+				}
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
@@ -306,13 +309,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@@ -467,10 +463,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -479,13 +477,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,3 +640,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func LibraryDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependencies, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
+		libDep := filepath.Join("lib", "ollama")
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
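To make the new layout concrete, a small sketch of the search root that LibraryDir builds from LibRelativeToExe; the install path is illustrative, not taken from the diff:

```go
package main

import (
    "fmt"
    "path/filepath"
    "runtime"
)

// libRelativeToExe mirrors envconfig.LibRelativeToExe from the envconfig diff above.
func libRelativeToExe() string {
    if runtime.GOOS == "windows" {
        return "."
    }
    return ".."
}

func main() {
    exe := "/usr/local/bin/ollama" // illustrative install path
    // Second search root used by LibraryDir(): <exe dir>/<rel>/lib/ollama
    fmt.Println(filepath.Join(filepath.Dir(exe), libRelativeToExe(), "lib", "ollama"))
    // linux/darwin: /usr/local/lib/ollama; on windows the join stays next to ollama.exe
}
```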
@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )

 func GetCPUMem() (memInfo, error) {
@@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
     }
 }

+func TestByLibrary(t *testing.T) {
+    type testCase struct {
+        input  []GpuInfo
+        expect int
+    }
+
+    testCases := map[string]*testCase{
+        "empty":                    {input: []GpuInfo{}, expect: 0},
+        "cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
+        "cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
+        "cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
+        "cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
+        "cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
+    }
+
+    for k, v := range testCases {
+        t.Run(k, func(t *testing.T) {
+            resp := (GpuInfoList)(v.input).ByLibrary()
+            if len(resp) != v.expect {
+                t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
+            }
+        })
+    }
+}
+
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
gpu/types.go (11 lines changed)
@@ -19,7 +19,7 @@ type GpuInfo struct {
     Library string `json:"library,omitempty"`

     // Optional variant to select (e.g. versions, cpu feature flags)
-    Variant CPUCapability `json:"variant"`
+    Variant string `json:"variant"`

     // MinimumMemory represents the minimum memory required to use the GPU
     MinimumMemory uint64 `json:"-"`
@@ -55,6 +55,8 @@ type CudaGPUInfo struct {
     GpuInfo
     OSOverhead uint64 // Memory overhead between the driver library and management library
     index      int    //nolint:unused,nolintlint
+    computeMajor int  //nolint:unused,nolintlint
+    computeMinor int  //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

@@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
     for _, info := range l {
         found := false
         requested := info.Library
-        if info.Variant != CPUCapabilityNone {
-            requested += "_" + info.Variant.String()
+        if info.Variant != CPUCapabilityNone.String() {
+            requested += "_" + info.Variant
         }
         for i, lib := range libs {
             if lib == requested {
@@ -92,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
             }
         }
         if !found {
-            libs = append(libs, info.Library)
+            libs = append(libs, requested)
             resp = append(resp, []GpuInfo{info})
         }
     }
@@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() {
         slog.Info("inference compute",
             "id", g.ID,
             "library", g.Library,
+            "variant", g.Variant,
             "compute", g.Compute,
             "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
             "name", g.Name,
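With both sides of the comparison now strings, the grouping key appends the variant directly, so two CUDA GPUs with different variants land in separate buckets, which is exactly what the TestByLibrary cases above assert. A small illustration of the keys this produces, reusing the types from this diff:

    // Sketch: the grouping keys ByLibrary now builds. "cuda_v11" and
    // "cuda_v12" are distinct buckets; a variant-less entry keys on the
    // bare library name.
    infos := []GpuInfo{
        {Library: "cpu"},
        {Library: "cuda", Variant: "v11"},
        {Library: "cuda", Variant: "v12"},
    }
    for _, info := range infos {
        requested := info.Library
        if info.Variant != "" { // assuming CPUCapabilityNone.String() == ""
            requested += "_" + info.Variant
        }
        fmt.Println(requested) // cpu, cuda_v11, cuda_v12
    }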
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
         t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
     }

-    if res.PromptEvalCount != 8 {
-        t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+    if res.PromptEvalCount != 6 {
+        t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
     }
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
         t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
     }

-    if res.PromptEvalCount != 16 {
-        t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+    if res.PromptEvalCount != 12 {
+        t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
     }
 }

llm/ext_server/CMakeLists.txt (vendored, 5 lines changed)
@@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
llm/ext_server/json.hpp (vendored, 24596 lines): file diff suppressed because it is too large.
llm/ext_server/server.cpp (vendored, 43 lines changed)
@@ -262,7 +262,7 @@ struct server_slot {
     char buffer[512];
     double t_token = t_prompt_processing / n_prompt_tokens_processed;
     double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-    sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+    snprintf(buffer, sizeof(buffer), "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
             t_prompt_processing, n_prompt_tokens_processed,
             t_token, n_tokens_second);
     LOG_DEBUG(buffer, {
@@ -276,7 +276,7 @@ struct server_slot {

     t_token = t_token_generation / n_decoded;
     n_tokens_second = 1e3 / t_token_generation * n_decoded;
-    sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+    snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
             t_token_generation, n_decoded,
             t_token, n_tokens_second);
     LOG_DEBUG(buffer, {
@@ -288,7 +288,7 @@ struct server_slot {
         {"n_tokens_second", n_tokens_second},
     });

-    sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+    snprintf(buffer, sizeof(buffer), " total time = %10.2f ms", t_prompt_processing + t_token_generation);
     LOG_DEBUG(buffer, {
         {"slot_id", id},
         {"task_id", task_id},
@@ -425,7 +425,7 @@ struct llama_server_context

     n_ctx = llama_n_ctx(ctx);

-    add_bos_token = llama_should_add_bos_token(model);
+    add_bos_token = llama_add_bos_token(model);

     return true;
 }
@@ -913,7 +913,9 @@ struct llama_server_context
     slot.sampled = result.tok;

     // search stop word and delete it
+    if (!llama_token_is_eog(model, result.tok))
         slot.generated_text += token_str;

     slot.has_next_token = true;

     if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,6 +956,8 @@ struct llama_server_context
     if (!incomplete)
     {
         size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());

+        if (!llama_token_is_eog(model, result.tok)) {
         const std::string str_test = slot.generated_text.substr(pos);
         bool is_stop_full = false;
         size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
@@ -979,6 +983,10 @@ struct llama_server_context
             slot.n_sent_text += result.text_to_send.size();
             // add the token to slot queue and cache
         }
+        } else {
+            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+            slot.n_sent_text += result.text_to_send.size();
+        }

         if (slot.params.stream)
         {
@@ -1031,7 +1039,7 @@ struct llama_server_context
         continue;
     }

-    if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+    if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
         LOG_TEE("Error processing the given image");
         return false;
     }
@@ -1117,9 +1125,7 @@ struct llama_server_context
         {"multimodal", multimodal}
     };

-    if (!llama_token_is_eog(model, tkn.tok)) {
     res.result_json["content"] = tkn.text_to_send;
-    }

     if (slot.sparams.n_probs > 0)
     {
@@ -1429,7 +1435,13 @@ struct llama_server_context
     switch (task.type)
     {
         case TASK_TYPE_COMPLETION: {
-            server_slot *slot = prefix_slot(task.data["prompt"]);
+            server_slot *slot = nullptr;
+            if (task.embedding_mode) {
+                // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                slot = slots[0].available() ? &slots[0] : nullptr;
+            } else {
+                slot = prefix_slot(task.data["prompt"]);
+            }
             if (slot == nullptr)
             {
                 // if no slot is available, we defer this task for processing later
@@ -2008,7 +2020,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("options:\n");
     printf("  -h, --help                show this help message and exit\n");
     printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
     printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
     printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2281,7 +2293,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
         invalid_param = true;
         break;
     }
-    params.n_threads = std::stoi(argv[i]);
+    params.cpuparams.n_threads = std::stoi(argv[i]);
 }
 else if (arg == "--grp-attn-n" || arg == "-gan")
 {
@@ -2309,7 +2321,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
         invalid_param = true;
         break;
     }
-    params.n_threads_batch = std::stoi(argv[i]);
+    params.cpuparams_batch.n_threads = std::stoi(argv[i]);
 }
 else if (arg == "--threads-http")
 {
@@ -2620,6 +2632,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
         params.kv_overrides.back().key[0] = 0;
     }

+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (invalid_param)
     {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2769,8 +2786,8 @@ int main(int argc, char **argv) {
         {"commit", LLAMA_COMMIT}});

     LOG_INFO("system info", {
-        {"n_threads", params.n_threads},
-        {"n_threads_batch", params.n_threads_batch},
+        {"n_threads", params.cpuparams.n_threads},
+        {"n_threads_batch", params.cpuparams_batch.n_threads},
         {"total_threads", std::thread::hardware_concurrency()},
         {"system_info", llama_print_system_info()},
     });
@@ -9,11 +9,14 @@ init_vars() {
         ARCH="arm64"
         ;;
     *)
-        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
     esac

     LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
     CMAKE_TARGETS="--target ollama_llama_server"
     if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,8 @@ init_vars() {
         WHOLE_ARCHIVE="-Wl,-force_load"
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
+        PAYLOAD_BASE=../../build/darwin/${GOARCH}
         ;;
     "Linux")
         LIB_EXT="so"
@@ -35,6 +40,8 @@ init_vars() {

         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
+        PAYLOAD_BASE=../../build/linux/${GOARCH}
         ;;
     *)
         ;;
@@ -42,6 +49,8 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
+    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
+    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }

 git_module_setup() {
@@ -60,51 +69,68 @@ git_module_setup() {
 }

 apply_patches() {
-    # Wire up our CMakefile
-    if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
-        echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
-    fi
-
-    if [ -n "$(ls -A ../patches/*.diff)" ]; then
-        # apply temporary patches until fix is upstream
-        for patch in ../patches/*.diff; do
-            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
-                (cd ${LLAMACPP_DIR}; git checkout ${file})
-            done
-        done
-        for patch in ../patches/*.diff; do
-            (cd ${LLAMACPP_DIR} && git apply ${patch})
-        done
-    fi
+    # apply temporary patches until fix is upstream
+    for patch in ../patches/*.patch; do
+        git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
+    done
 }

 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    # remove unnecessary build artifacts
+    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }

-compress() {
-    echo "Compressing payloads to reduce overall binary size..."
-    pids=""
-    rm -rf ${BUILD_DIR}/bin/*.gz
+dist() {
+    [ -z "${RUNNER}" ] && exit 1
+    mkdir -p ${RUNNER_BASE}/${RUNNER}/
     for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        cp ${f} ${RUNNER_BASE}/${RUNNER}/
     done
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
         for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            cp ${f} ${RUNNER_BASE}/${RUNNER}/
         done
     fi
+}
+
+# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
+compress() {
+    [ -z "${RUNNER}" ] && exit 1
+    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
+    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
+    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
+    for f in ${BUILD_DIR}/bin/* ; do
+        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
+        compress_pids+=" $!"
+    done
+    # check for lib directory
+    if [ -d ${BUILD_DIR}/lib ]; then
+        for f in ${BUILD_DIR}/lib/* ; do
+            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
+            compress_pids+=" $!"
+        done
+    fi
     echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
         wait $pid
     done
     echo "Finished compression"
 }

+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
     (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
@@ -6,6 +6,7 @@

 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -18,7 +19,7 @@ sign() {
     fi
 }

-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

 case "${GOARCH}" in
 "amd64")
@@ -38,7 +39,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu"
+    RUNNER=cpu
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building LCD CPU"
     build
     sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -50,7 +52,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+    RUNNER=cpu_avx
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building AVX CPU"
     build
     sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -62,7 +65,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+    RUNNER=cpu_avx2
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building AVX2 CPU"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
     build
@@ -83,7 +87,8 @@ case "${GOARCH}" in
 if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
     init_vars
     CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/metal"
+    RUNNER="metal"
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
     build
     sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -98,4 +103,5 @@ case "${GOARCH}" in
 esac

 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
@@ -13,6 +13,7 @@

 set -ex
 set -o pipefail
+compress_pids=""

 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -60,7 +61,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -86,10 +87,13 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        RUNNER="cpu"
+        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
        echo "Building custom CPU"
         build
+        install
+        dist
         compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -102,16 +106,19 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
             #
             init_vars
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu"
+            RUNNER=cpu
+            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
             echo "Building LCD CPU"
             build
+            install
+            dist
             compress
         fi

@@ -126,9 +133,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             #
             init_vars
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+            RUNNER=cpu_avx
+            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
             echo "Building AVX CPU"
             build
+            install
+            dist
             compress
         fi

@@ -139,9 +149,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             #
             init_vars
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+            RUNNER=cpu_avx2
+            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
             echo "Building AVX2 CPU"
             build
+            install
+            dist
             compress
         fi
     fi
@@ -169,7 +182,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" ]; then
+    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
     if [ "${ARCH}" == "arm64" ]; then
@@ -187,29 +200,21 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
-    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
+    RUNNER=cuda${CUDA_VARIANT}
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
     build
-    # Carry the CUDA libs as payloads to help reduce dependency burden on users
-    #
-    # TODO - in the future we may shift to packaging these separately and conditionally
-    # downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
-    for lib in libcudart.so libcublas.so libcublasLt.so ; do
-        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
-        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
-        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
-        fi
-    done
+    install
+    dist
+    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
+    mkdir -p "${CUDA_DIST_DIR}"
+    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
+        cp -a "${lib}" "${CUDA_DIST_DIR}"
+    done
     compress

@@ -226,22 +231,27 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     CC=icx
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    RUNNER=oneapi
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
+    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
     build

     # copy oneAPI dependencies
+    mkdir -p "${ONEAPI_DIST_DIR}"
     for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp "${dep}" "${BUILD_DIR}/bin/"
+        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
     done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
+    install
+    dist
     compress
 fi

@@ -263,31 +273,35 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
         CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
         echo "Building custom ROCM GPU"
     fi
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    RUNNER=rocm${ROCM_VARIANT}
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    # ROCm dependencies are too large to fit into a unified bundle
+    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
+    # TODO figure out how to disable runpath (rpath)
+    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
+    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
     build

-    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
-    done
-    # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/bin/deps.txt"
-        echo "ERROR: deps file short"
-        exit 1
-    fi
+    # copy the ROCM dependencies
+    mkdir -p "${ROCM_DIST_DIR}"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
+        cp -a "${dep}"* "${ROCM_DIST_DIR}"
+        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
+            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
+        fi
+    done
+    install
+    dist
     compress
 fi

 cleanup
-echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+wait_for_compress
+echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
@@ -53,7 +53,7 @@ function init_vars {
     )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
     md "$script:DIST_BASE" -ea 0 > $null
     if ($env:CGO_CFLAGS -contains "-g") {
         $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -101,29 +101,9 @@ function git_module_setup {
 }

 function apply_patches {
-    # Wire up our CMakefile
-    if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
-        Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
-    }
-
     # Apply temporary patches until fix is upstream
-    $patches = Get-ChildItem "../patches/*.diff"
-    foreach ($patch in $patches) {
-        # Extract file paths from the patch file
-        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
-            $parts = $_ -split ' '
-            ($parts[1] -split '/', 2)[1]
-        }
-
-        # Checkout each file
-        foreach ($file in $filePaths) {
-            git -C "${script:llamacppDir}" checkout $file
-        }
-    }
-
-    # Apply each patch
-    foreach ($patch in $patches) {
-        git -C "${script:llamacppDir}" apply $patch.FullName
-    }
+    foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
+        git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
+    }
 }

@@ -135,7 +115,7 @@ function build {
     if ($cmakeDefs -contains "-G") {
         $extra=@("-j8")
     } else {
-        $extra= @("--", "/p:CL_MPcount=8")
+        $extra= @("--", "/maxCpuCount:8")
     }
     write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -279,7 +259,7 @@ function build_cuda() {
 if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
     # Then build cuda as a dynamically loaded library
     $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+    $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
     if ($null -ne $script:CUDA_VERSION) {
         $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
     }
@@ -291,9 +271,9 @@ function build_cuda() {
         "-DGGML_CUDA=ON",
         "-DGGML_AVX=on",
         "-DGGML_AVX2=off",
-        "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-        "-DCMAKE_CUDA_FLAGS=-t8",
-        "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+        "-DCMAKE_CUDA_FLAGS=-t6",
+        "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
+        "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
     )
     if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
         write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -304,12 +284,11 @@ function build_cuda() {
     sign
     install

-    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
-    write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-    cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-    cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-    cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+    write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
 } else {
     write-host "Skipping CUDA generation step"
 }
@@ -343,18 +322,17 @@ function build_oneapi() {
     sign
     install

-    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
 } else {
     Write-Host "Skipping oneAPI generation step"
 }
@@ -375,7 +353,7 @@ function build_rocm() {
         "-DCMAKE_C_COMPILER=clang.exe",
         "-DCMAKE_CXX_COMPILER=clang++.exe",
         "-DGGML_HIPBLAS=on",
-        "-DLLAMA_CUDA_NO_PEER_COPY=on",
+        "-DGGML_CUDA_NO_PEER_COPY=on",
         "-DHIP_PLATFORM=amd",
         "-DGGML_AVX=on",
         "-DGGML_AVX2=off",
@@ -404,12 +382,11 @@ function build_rocm() {
     sign
     install

-    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
-    cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-    cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
+    cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
     # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-    cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
+    cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
 } else {
     write-host "Skipping ROCm generation step"
 }
llm/ggml.go (14 lines changed)
@@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
     return "unknown"
 }

+func (kv KV) Kind() string {
+    if s, ok := kv["general.type"].(string); ok {
+        return s
+    }
+
+    return "unknown"
+}
+
 func (kv KV) ParameterCount() uint64 {
     return kv.u64("general.parameter_count")
 }
@@ -352,11 +360,13 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {

     switch llm.KV().Architecture() {
     case "llama":
-        fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+        fullOffload = max(
+            4*batch*(1+4*embedding+context*(1+heads)),
+            4*batch*(embedding+vocab),
+        )

         partialOffload = 4 * batch * embedding
         partialOffload += max(
-            // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
             4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
         )
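The fullOffload change guards against the logits buffer dominating the estimate: the result is now the larger of the attention-driven term and a 4*batch*(embedding+vocab) term. A hedged numeric sketch with made-up sizes (not from the diff) of what the max protects against:

    package main

    import "fmt"

    func main() {
        // Made-up sizes, for orientation only (Go 1.21+ for builtin max).
        var batch, embedding, context, heads, vocab uint64 = 512, 4096, 512, 32, 128256

        attn := 4 * batch * (1 + 4*embedding + context*(1+heads)) // ~68 MB with this short context
        logits := 4 * batch * (embedding + vocab)                 // ~271 MB with this large vocab
        fmt.Println(max(attn, logits))                            // the logits term wins here
    }

With a short context and a large vocabulary the second term dominates, which the old single-expression estimate would have undercounted.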
Submodule llm/llama.cpp updated: 1e6f6554aa...8962422b1c
@@ -1,11 +1,7 @@
 package llm

 import (
-    "embed"
     "syscall"
 )

-//go:embed build/darwin/arm64/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
@@ -1,11 +0,0 @@
-package llm
-
-import (
-    "embed"
-    "syscall"
-)
-
-//go:embed build/darwin/x86_64/*/bin/*
-var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
@@ -1,11 +1,7 @@
 package llm

 import (
-    "embed"
     "syscall"
 )

-//go:embed build/linux/*/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
@@ -1,13 +1,9 @@
 package llm

 import (
-    "embed"
     "syscall"
 )

-// unused on windows
-var libEmbed embed.FS
-
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000

 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
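These four hunks remove the embed.FS payload plumbing: runners are no longer compiled into the binary via //go:embed but staged on disk under dist/.../lib/ollama/runners by the generate scripts above, and located at runtime through LibraryDir. A sketch of what discovery reduces to once payloads are ordinary files; the helper is hypothetical and not part of this diff:

    // Hypothetical helper, not in this diff: with embed.FS gone, runner
    // discovery is a plain directory listing under LibraryDir().
    func listRunners() ([]string, error) {
        dir := filepath.Join(LibraryDir(), "runners")
        entries, err := os.ReadDir(dir)
        if err != nil {
            return nil, err
        }
        names := make([]string, 0, len(entries))
        for _, e := range entries {
            if e.IsDir() {
                names = append(names, e.Name())
            }
        }
        return names, nil
    }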
@@ -7,6 +7,7 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64
 
+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
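EstimateGPULayers now reserves a user-configurable per-GPU overhead before counting layers. The helper itself is not shown in this diff; as a hedged guess at its shape, it could look something like the following, assuming the value is read from an environment variable as a raw byte count. The variable name and fallback behaviour are assumptions, not the verified envconfig implementation:

package main

import (
	"fmt"
	"os"
	"strconv"
)

// gpuOverhead is a hypothetical stand-in for envconfig.GpuOverhead: read a
// byte count from the environment, defaulting to 0 when unset or invalid.
func gpuOverhead() uint64 {
	v := os.Getenv("OLLAMA_GPU_OVERHEAD") // assumed variable name
	n, err := strconv.ParseUint(v, 10, 64)
	if err != nil {
		return 0
	}
	return n
}

func main() {
	fmt.Println("per-GPU reserved overhead (bytes):", gpuOverhead())
}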
@@ -164,8 +166,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			slog.Debug("gpu has too little memory to allocate any layers",
+				"id", gpus[i].ID,
+				"library", gpus[i].Library,
+				"variant", gpus[i].Variant,
+				"compute", gpus[i].Compute,
+				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
+				"name", gpus[i].Name,
+				"total", format.HumanBytes2(gpus[i].TotalMemory),
+				"available", format.HumanBytes2(gpus[i].FreeMemory),
+				"minimum_memory", gpus[i].MinimumMemory,
+				"gpu_zer_overhead", format.HumanBytes2(gzo),
+				"layer_size", format.HumanBytes2(layerSize),
+				"partial_offload", format.HumanBytes2(graphPartialOffload),
+				"full_offload", format.HumanBytes2(graphFullOffload),
+			)
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
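One thing to note about the new comparisons: FreeMemory and overhead are unsigned, so gpus[i].FreeMemory - overhead wraps around to a huge value whenever the configured overhead exceeds the GPU's free memory, making the fit check pass instead of fail. A sketch of an algebraically equivalent test that avoids the subtraction; this is an editor's illustration, not code from this commit:

package main

import "fmt"

// fitsWithOverhead rewrites (free - overhead) > required as
// free > overhead+required, so no unsigned subtraction can wrap.
// (overhead+required could itself overflow in principle, but only at
// implausible byte counts.)
func fitsWithOverhead(free, overhead, required uint64) bool {
	return free > overhead+required
}

func main() {
	fmt.Println(fitsWithOverhead(8<<30, 512<<20, 6<<30)) // true
	fmt.Println(fitsWithOverhead(1<<30, 2<<30, 1<<20))   // false, no wraparound
}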
@@ -196,7 +212,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -219,7 +235,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -306,6 +322,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 
 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -323,6 +340,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading
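The estimate log gains a gpu_overhead field alongside the available list. For reference, slog.Group nests key/value pairs under a common prefix, which is how the memory/required hierarchy in this function is produced. A freestanding sketch with made-up values; field names echo the hunk above but the message and data are illustrative:

package main

import (
	"log/slog"
	"os"
)

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
	// Nested groups render as memory.available, memory.gpu_overhead,
	// memory.required.full, ... in the handler output.
	logger.Info(
		"offload to cuda",
		slog.Group(
			"memory",
			"available", []string{"20.0 GiB"},
			"gpu_overhead", "512.0 MiB",
			slog.Group(
				"required",
				"full", "9.2 GiB",
				"partial", "6.1 GiB",
			),
		),
	)
}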
@@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = WriteGGUF(f, KV{
 		"general.architecture":   "llama",
-		"general.name":           "name",
 		"llama.context_length":   uint32(32),
 		"llama.embedding_length": uint32(4096),
 		"llama.block_count":      uint32(inputLayerCount),
22  llm/patches/0000-cmakelist.patch  (new file)
@@ -0,0 +1,22 @@
+From 8b8d83ffca775840acc5dc700f3b3703e9f5cfe4 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Fri, 23 Aug 2024 11:27:48 -0700
+Subject: [PATCH] patch cmakelist
+
+---
+ CMakeLists.txt | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index a3132063..6a2a9912 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -199,3 +199,5 @@ if (LLAMA_BUILD_EXAMPLES)
+     add_subdirectory(examples)
+     add_subdirectory(pocs)
+ endif()
++
++add_subdirectory(../ext_server ext_server) # ollama
+--
+2.45.2
+
@@ -1,8 +1,18 @@
+From 2cfaa0a04faa9c87ba8f1ac8527eb953e69c6cde Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:10 -0700
+Subject: [PATCH] 01-load-progress.diff
+
+---
+ common/common.cpp | 2 ++
+ common/common.h   | 7 +++++++
+ 2 files changed, 9 insertions(+)
+
 diff --git a/common/common.cpp b/common/common.cpp
-index 2c05a4d4..927f0e3d 100644
+index 9fa18472..48ff41e9 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2573,6 +2573,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
  mparams.use_mmap = params.use_mmap;
  mparams.use_mlock = params.use_mlock;
  mparams.check_tensors = params.check_tensors;
@@ -12,10 +22,10 @@ index 2c05a4d4..927f0e3d 100644
 mparams.kv_overrides = NULL;
 } else {
 diff --git a/common/common.h b/common/common.h
-index 65c0ef81..ebca2c77 100644
+index cb5e7f6d..d8f043f7 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -184,6 +184,13 @@ struct gpt_params {
+@@ -204,6 +204,13 @@ struct gpt_params {
 std::string mmproj = "";        // path to multimodal projector
 std::vector<std::string> image; // path to image file(s)
 
@@ -29,3 +39,6 @@ index 65c0ef81..ebca2c77 100644
 // embedding
 bool embedding = false; // get only sentence embedding
 int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+--
+2.46.0
+
@@ -1,5 +1,14 @@
+From ba4bba80a744f76ac67b8234451c259a3c5da83b Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:11 -0700
+Subject: [PATCH] 02-clip-log.diff
+
+---
+ examples/llava/clip.cpp | 1 +
+ 1 file changed, 1 insertion(+)
+
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index e431c7f7..f077e688 100644
+index 9b890571..cb51793d 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
 @@ -3,6 +3,7 @@
@@ -10,3 +19,6 @@ index e431c7f7..f077e688 100644
 #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+--
+2.46.0
+
Some files were not shown because too many files have changed in this diff.