Mirror of https://github.com/likelovewant/ollama-for-amd.git
Synced 2025-12-22 14:53:56 +00:00

Compare commits (39 commits)
| SHA1 |
|---|
| 8b2fc1078b |
| 6cdf27d154 |
| 5c18e66384 |
| 35096a7eff |
| 81d55d3e4d |
| a14f76491d |
| 760cfa27e5 |
| c9a5aca3da |
| d5da2ab7e8 |
| 1c04117114 |
| 8b4b243f5f |
| b42a596425 |
| 219d6c92a1 |
| 4759d879f2 |
| d875e99e46 |
| 8a35bb926e |
| a0ea067b63 |
| 4efb98cb4f |
| 0679d491fe |
| c25ffde91d |
| 17b386a891 |
| 549c2bdfcf |
| 67691e410d |
| 5b3393b6a2 |
| d7eb05b936 |
| 636a743c2b |
| df011054fa |
| ac07160c8d |
| 6606e4243c |
| 65973ceb64 |
| bebef1e50d |
| d48c1c5a44 |
| 8a29cf27ac |
| 36a8372b28 |
| 4e94227b5d |
| 479d551766 |
| 76b2b723b2 |
| b8d77cdeab |
| 14a68a0ca9 |
.github/workflows/test.yaml (2 changes, vendored)

@@ -281,7 +281,7 @@ jobs:
       shell: bash
     - uses: golangci/golangci-lint-action@v6
       with:
-        args: --timeout 8m0s -v
+        args: --timeout 10m0s -v
   test:
     strategy:
       matrix:
Dockerfile (66 changes)

@@ -5,6 +5,8 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1

 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
@@ -13,7 +15,7 @@ ARG ROCM_VERSION=6.1.2
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
@@ -76,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
     if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) ; \
     else \
-        make -C llama -j 5 ; \
+        make -j 5 ; \
     fi

 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
@@ -90,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
+    make -j 5

+# Jetsons need to be built in discrete stages
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v11 \
+        CUDA_ARCHITECTURES="72;87" \
+        GPU_RUNNER_VARIANT=_jetpack5 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v12 \
+        CUDA_ARCHITECTURES="87" \
+        GPU_RUNNER_VARIANT=_jetpack6 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
+
+
 # Intermediate stages used for ./scripts/build_linux.sh
@@ -134,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
     tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-jetpack5 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+RUN cd dist/linux-$GOARCH-jetpack6 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz

 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -180,16 +229,19 @@ RUN rm -rf \
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
+

 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@@ -198,7 +250,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
README.md (15 changes)

@@ -66,9 +66,11 @@ Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library)
 Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                         |
-| ------------------ | ---------- | ----- | ------------------------------ |
+| ------------------ | ---------- | ----- | -------------------------------- |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
+| Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
+| Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
 | Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
@@ -349,8 +351,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
-- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)
+- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
+- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
+- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux)

 ### Terminal

@@ -377,6 +382,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
+- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.

 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -433,6 +439,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
+- [GoLamify](https://github.com/prasad89/golamify)
+- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
+- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)

 ### Mobile

@@ -470,11 +479,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
+- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)

 ### Supported backends
@@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {

 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 // <scheme>://<host>:<port>
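For context, the comment fixed above describes exactly what [ClientFromEnvironment] parses: an OLLAMA_HOST value in <scheme>://<host>:<port> form. A minimal sketch of that round trip (the host value here is an assumption for illustration):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// scheme://host:port, as described in the doc comment; this exact
	// value is assumed, not taken from the diff.
	os.Setenv("OLLAMA_HOST", "http://127.0.0.1:11434")

	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Heartbeat pings the configured host to confirm the client resolved it.
	if err := client.Heartbeat(context.Background()); err != nil {
		log.Fatal(err)
	}
	fmt.Println("ollama is reachable")
}
```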
api/types.go (12 changes)

@@ -12,7 +12,7 @@ import (
 	"time"
 )

-// StatusError is an error with and HTTP status code.
+// StatusError is an error with an HTTP status code and message.
 type StatusError struct {
 	StatusCode int
 	Status     string
@@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`

 	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
+	// [Client.Generate]. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`

 	// Stream specifies whether the response is streaming; it is true by default.
@@ -90,14 +90,14 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`

-	// Stream enable streaming of returned response; true by default.
+	// Stream enables streaming of returned responses; true by default.
 	Stream *bool `json:"stream,omitempty"`

 	// Format is the format to return the response in (e.g. "json").
 	Format string `json:"format"`

 	// KeepAlive controls how long the model will stay loaded into memory
-	// followin the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	// Tools is an optional list of tools the model has access to.
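The corrected [Client.Generate] reference above implies the usual usage pattern: feed the Context slice from one response into the next request to keep a short conversational memory. A minimal sketch, assuming a locally running server and the llama3.2 tag from the README table:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Hold the Context returned by the previous call so the next request
	// continues the same short conversational memory.
	var memory []int
	for _, prompt := range []string{"My name is Ada.", "What is my name?"} {
		req := &api.GenerateRequest{
			Model:   "llama3.2", // assumed model tag
			Prompt:  prompt,
			Context: memory,
		}
		err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
			fmt.Print(resp.Response)
			if resp.Done {
				memory = resp.Context // final chunk carries the context
			}
			return nil
		})
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println()
	}
}
```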
@@ -361,7 +361,7 @@ func (t *winTray) showMenu() error {

 	boolRet, _, err = pTrackPopupMenu.Call(
 		uintptr(t.menus[0]),
-		TPM_BOTTOMALIGN|TPM_LEFTALIGN,
+		TPM_BOTTOMALIGN|TPM_LEFTALIGN|TPM_RIGHTBUTTON,
 		uintptr(p.X),
 		uintptr(p.Y),
 		0,
@@ -67,6 +67,7 @@ const (
 	SW_HIDE         = 0
 	TPM_BOTTOMALIGN = 0x0020
 	TPM_LEFTALIGN   = 0x0000
+	TPM_RIGHTBUTTON = 0x0002
 	WM_CLOSE        = 0x0010
 	WM_USER         = 0x0400
 	WS_CAPTION      = 0x00C00000
@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}

 	return nil
@@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			return nil, err
 		}
 	}
-	gpuInfo.DependencyPath = libDir
+	gpuInfo.DependencyPath = []string{libDir}

 	if gfxOverride == "" {
 		// Only load supported list once
@@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			UnreliableFreeMemory: true,

 			ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: libDir,
+			DependencyPath: []string{libDir},
 			MinimumMemory:  rocmMinimumMemory,
 			Name:           name,
 			Compute:        gfx,
@@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
 				Library: "cpu",
 				Variant: cpuCapability.String(),
 				ID:      "0",
-				DependencyPath: depPath,
+				DependencyPath: []string{depPath},
 			},
 			CPUs: details,
 		},
@@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.DriverMinor = driverMinor
 			variant := cudaVariant(gpuInfo)
 			if depPath != "" {
-				gpuInfo.DependencyPath = depPath
+				gpuInfo.DependencyPath = []string{depPath}
 				// Check for variant specific directory
 				if variant != "" {
 					if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-						gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
 					}
 				}
 			}
@@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DependencyPath = depPath
+			gpuInfo.DependencyPath = []string{depPath}
 			oneapiGPUs = append(oneapiGPUs, gpuInfo)
 		}
 	}
@@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`

 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
+	DependencyPath []string `json:"lib_path,omitempty"`

 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
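The switch from a single string to a slice lets a variant-specific library directory (such as the `cuda_`+variant directory probed above) be searched ahead of the base directory. A minimal sketch of how such a list might be flattened into a search path; the helper below is hypothetical, not the repo's actual loader, and the directory layout is assumed:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// libraryPath is a hypothetical helper: it prepends each dependency
// directory, in order, to the existing LD_LIBRARY_PATH so an earlier
// entry (e.g. a "cuda_jetpack6" variant dir) wins over the generic one.
func libraryPath(deps []string) string {
	parts := append([]string{}, deps...)
	if cur := os.Getenv("LD_LIBRARY_PATH"); cur != "" {
		parts = append(parts, cur)
	}
	return strings.Join(parts, string(filepath.ListSeparator))
}

func main() {
	deps := []string{"/usr/lib/ollama/cuda_jetpack6", "/usr/lib/ollama"} // assumed layout
	fmt.Println("LD_LIBRARY_PATH=" + libraryPath(deps))
}
```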
@@ -50,6 +50,9 @@ sudo systemctl restart docker
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```

+> [!NOTE]
+> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6.
+
 ### AMD GPU

 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
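The note added above says JetPack discovery cannot pick the version on its own. A hypothetical sketch of the selection it describes; the function is illustrative only, though the variant names mirror the `cuda_jetpack5`/`cuda_jetpack6` runner variants introduced in the Dockerfile changes:

```go
package main

import (
	"fmt"
	"os"
)

// jetpackVariant is a hypothetical helper: an explicit JETSON_JETPACK
// environment variable overrides autodetection of the CUDA runner variant.
func jetpackVariant() string {
	switch os.Getenv("JETSON_JETPACK") {
	case "5":
		return "cuda_jetpack5"
	case "6":
		return "cuda_jetpack6"
	default:
		return "" // fall back to whatever autodetection decides
	}
}

func main() {
	fmt.Println("selected variant:", jetpackVariant())
}
```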
@@ -32,7 +32,7 @@ ollama run my-model

 Ollama supports importing adapters based on several different model architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
   * Gemma (including Gemma 1 and Gemma 2)

@@ -67,14 +67,12 @@ ollama run my-model

 Ollama supports importing models for several different architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral);
   * Gemma (including Gemma 1 and Gemma 2); and
   * Phi3

-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.


 ## Importing a GGUF based model or adapter

 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@@ -112,6 +112,21 @@ sudo systemctl status ollama
 > https://www.amd.com/en/support/linux-drivers for best support of your Radeon
 > GPU.

+## Customizing
+
+To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
+
+```
+sudo systemctl edit ollama
+```
+
+Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`:
+
+```ini
+[Service]
+Environment="OLLAMA_DEBUG=1"
+```
+
 ## Updating

 Update Ollama by running the install script again:
@@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.

 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
   * Mistral (including Mistral 1, Mistral 2, and Mixtral)
   * Gemma (including Gemma 1 and Gemma 2)
   * Phi3
@@ -95,7 +95,9 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
+
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
go.mod (6 changes)

@@ -12,7 +12,7 @@ require (
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.3.0
+	golang.org/x/sync v0.9.0
 )

 require (
@@ -22,7 +22,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
-	golang.org/x/image v0.14.0
+	golang.org/x/image v0.22.0
 )

 require (
@@ -73,7 +73,7 @@ require (
 	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.20.0
 	golang.org/x/term v0.20.0
-	golang.org/x/text v0.15.0
+	golang.org/x/text v0.20.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
go.sum (6 changes)

@@ -232,6 +232,8 @@ golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+o
 golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
 golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
+golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g=
+golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
@@ -267,6 +269,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
 golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
+golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -293,6 +297,8 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
 golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -21,6 +21,8 @@ package llama
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
+#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
 #cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
 #cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
@@ -36,8 +38,8 @@ package llama
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
-#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
+#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
+#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
@@ -598,6 +600,10 @@ func (c *Context) SetCrossAttention(state bool) {
 	C.llama_set_cross_attention(c.c, C.bool(state))
 }

+func (c *Context) Synchronize() {
+	C.llama_synchronize(c.c)
+}
+
 // sampling
 // TODO: this is a temporary wrapper to allow calling C++ code from CGo
 type SamplingContext struct {
@@ -5,7 +5,7 @@

 include make/common-defs.make

-HIP_ARCHS_COMMON := gfx803 gfx900 gfx902 gfx90c:xnack- gfx940 gfx941 gfx942 gfx1010 gfx1010:xnack- gfx1012 gfx1012:xnack- gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103
+HIP_ARCHS_COMMON := gfx803 gfx900 gfx902 gfx90c:xnack- gfx906:xnack- gfx90a:xnack- gfx1010:xnack- gfx1012:xnack- gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103
 HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx90c:xnack- gfx1010:xnack- gfx1012:xnack-

 ifeq ($(OS),windows)
@@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
+GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))

 ifeq ($(OS),linux)
 	CUDA_PATH?=/usr/local/cuda
@@ -2,6 +2,7 @@ package main

 import (
 	"errors"
+	"fmt"
 	"log/slog"
 	"reflect"
 	"time"
@@ -22,7 +23,11 @@ type InputCache struct {
 	lc *llama.Context
 }

-func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
+	if kvSize/numSlots < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
 	slots := make([]InputCacheSlot, numSlots)

 	for i := range slots {
@@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		slots:          slots,
 		multiUserCache: multiUserCache,
 		lc:             lc,
-	}
+	}, nil
 }

 // Locking: Operations on InputCacheSlot (including finding one
@@ -58,7 +63,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }

-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
 	var slot *InputCacheSlot
 	var numPast int
 	var err error
@@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 		slot, numPast, err = c.findBestCacheSlot(prompt)
 	}
 	if err != nil {
-		return nil, nil, 0, err
+		return nil, nil, err
 	}

 	if !cachePrompt {
@@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	prompt = prompt[numPast:]
 	slot.Inputs = slot.Inputs[:numPast]

-	return slot, prompt, numPast, nil
+	return slot, prompt, nil
 }

 func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@@ -194,14 +199,30 @@ func countCommonPrefix(a []input, b []input) int {
 	return count
 }

-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
-	// TODO (jessegross): KV cache removal can fail for certain types of models
-	// server.cpp doesn't handle this, though we can be more graceful
-	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
-	c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
+// Frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
+	targetFree := (c.numCtx - numKeep) / 2
+	targetFree = max(targetFree, 1)

-	for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
-		slot.Inputs[i-numDiscard] = slot.Inputs[i]
+	currentFree := c.numCtx - len(slot.Inputs)
+	discard := targetFree - currentFree
+
+	if discard <= 0 {
+		return
 	}
-	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
+
+	slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
+	// TODO (jessegross): KV cache removal can fail for certain types of models
+	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
+	c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
+
+	for i := numKeep + discard; i < len(slot.Inputs); i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
+	}
+	slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
 }
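The rewritten ShiftCacheSlot aims to leave roughly half of the non-kept context free before decoding continues, rather than discarding a fixed half of what was evaluated. A small standalone sketch of just that arithmetic, with assumed numbers (not values from the diff):

```go
package main

import "fmt"

// shiftDiscard mirrors the arithmetic in the rewritten ShiftCacheSlot:
// target about half of the non-kept context being free, then discard
// only as much as needed to get there.
func shiftDiscard(numCtx, numKeep, used int) int {
	targetFree := (numCtx - numKeep) / 2
	if targetFree < 1 {
		targetFree = 1
	}
	currentFree := numCtx - used
	discard := targetFree - currentFree
	if discard < 0 {
		discard = 0
	}
	return discard
}

func main() {
	// Assumed numbers: a full 2048-entry context with 5 kept inputs
	// discards (2048-5)/2 - 0 = 1021 of the oldest entries.
	fmt.Println(shiftDiscard(2048, 5, 2048)) // 1021
	// A slot that already has plenty of free room discards nothing.
	fmt.Println(shiftDiscard(2048, 5, 900)) // 0
}
```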
@@ -20,6 +20,8 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/sync/semaphore"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/llama"
|
"github.com/ollama/ollama/llama"
|
||||||
)
|
)
|
||||||
@@ -34,9 +36,6 @@ type input struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Sequence struct {
|
type Sequence struct {
|
||||||
// number of inputs evaluated
|
|
||||||
numPast int
|
|
||||||
|
|
||||||
// batch index
|
// batch index
|
||||||
iBatch int
|
iBatch int
|
||||||
|
|
||||||
@@ -112,21 +111,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
|
|||||||
params.numKeep = len(inputs)
|
params.numKeep = len(inputs)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !params.embedding {
|
if s.model.AddBOSToken() {
|
||||||
// Subtracting 4 ensures that at least 1 input can be discarded during shift
|
params.numKeep += 1
|
||||||
params.numKeep = min(params.numKeep, s.cache.numCtx-4)
|
|
||||||
params.numKeep += s.bosToken
|
|
||||||
} else {
|
|
||||||
// Embeddings are 1 shot - just truncate to the context window, without ever shifting
|
|
||||||
params.numKeep = min(params.numKeep, s.cache.numCtx)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncate to fit in context window
|
// Ensure that at least 1 input can be discarded during shift
|
||||||
|
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
|
||||||
|
|
||||||
if len(inputs) > s.cache.numCtx {
|
if len(inputs) > s.cache.numCtx {
|
||||||
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
|
slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
|
||||||
newInputs := inputs[:params.numKeep]
|
|
||||||
newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
|
|
||||||
inputs = newInputs
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var sc *llama.SamplingContext
|
var sc *llama.SamplingContext
|
||||||
@@ -170,7 +163,6 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
|||||||
|
|
||||||
for i, part := range parts {
|
for i, part := range parts {
|
||||||
// text - tokenize
|
// text - tokenize
|
||||||
if strings.TrimSpace(part) != "" {
|
|
||||||
tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
|
tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -179,7 +171,6 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
|||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
inputs = append(inputs, input{token: t})
|
inputs = append(inputs, input{token: t})
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// image - generate image embedding
|
// image - generate image embedding
|
||||||
if i < len(matches) {
|
if i < len(matches) {
|
||||||
@@ -212,41 +203,51 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Server struct {
|
type Server struct {
|
||||||
model *llama.Model
|
// is the server ready to process requests?
|
||||||
lc *llama.Context
|
// protects access to model and image
|
||||||
|
ready sync.WaitGroup
|
||||||
|
|
||||||
// required for image embeddings
|
// loaded model
|
||||||
|
model *llama.Model
|
||||||
|
|
||||||
|
// image model context for multi-modal models
|
||||||
image *ImageContext
|
image *ImageContext
|
||||||
|
|
||||||
|
// status for external health reporting - loading, ready to serve, etc.
|
||||||
|
status ServerStatus
|
||||||
|
|
||||||
|
// current progress on loading the model
|
||||||
|
progress float32
|
||||||
|
|
||||||
|
// number of simultaneous requests to handle
|
||||||
|
parallel int
|
||||||
|
|
||||||
|
// maximum number of elements in a batch (per sequence)
|
||||||
// TODO (jmorganca): make this n_batch
|
// TODO (jmorganca): make this n_batch
|
||||||
batchSize int
|
batchSize int
|
||||||
|
|
||||||
// parallel is the number of parallel requests to handle
|
// protects access to everything below this line
|
||||||
parallel int
|
// this is context state needed for decoding
|
||||||
|
mu sync.Mutex
|
||||||
|
|
||||||
// seqs is the list of parallel sequences being evaluated
|
// indicates that data is ready for processing
|
||||||
// TODO (jmorganca): this can probably be moved into run()
|
cond *sync.Cond
|
||||||
|
|
||||||
|
// decoding state
|
||||||
|
lc *llama.Context
|
||||||
|
|
||||||
|
// the list of simultaneous sequences being evaluated
|
||||||
seqs []*Sequence
|
seqs []*Sequence
|
||||||
|
|
||||||
|
// seqs can have a maximum of parallel entries, which
|
||||||
|
// is enfoced by seqSem
|
||||||
|
seqsSem *semaphore.Weighted
|
||||||
|
|
||||||
// KV cache
|
// KV cache
|
||||||
cache *InputCache
|
cache *InputCache
|
||||||
|
|
||||||
// does this model require a beginning of sequence token?
|
|
||||||
bosToken int
|
|
||||||
|
|
||||||
// next sequence for prompt processing to avoid starvation
|
// next sequence for prompt processing to avoid starvation
|
||||||
nextSeq int
|
nextSeq int
|
||||||
|
|
||||||
// is the server ready to process requests?
|
|
||||||
ready sync.WaitGroup
|
|
||||||
|
|
||||||
mu sync.Mutex
|
|
||||||
|
|
||||||
cond *sync.Cond
|
|
||||||
|
|
||||||
progress float32
|
|
||||||
|
|
||||||
status ServerStatus
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) allNil() bool {
|
func (s *Server) allNil() bool {
|
||||||
@@ -258,18 +259,6 @@ func (s *Server) allNil() bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) shiftContext(seq *Sequence) {
|
|
||||||
numLeft := seq.numPast - seq.numKeep
|
|
||||||
numDiscard := numLeft / 2
|
|
||||||
|
|
||||||
slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
|
|
||||||
"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
|
|
||||||
|
|
||||||
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
|
|
||||||
|
|
||||||
seq.numPast -= numDiscard
|
|
||||||
}
|
|
||||||
|
|
||||||
func flushPending(seq *Sequence) bool {
|
func flushPending(seq *Sequence) bool {
|
||||||
joined := strings.Join(seq.pendingResponses, "")
|
joined := strings.Join(seq.pendingResponses, "")
|
||||||
seq.pendingResponses = []string{}
|
seq.pendingResponses = []string{}
|
||||||
@@ -368,18 +357,33 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}
 
+		// If an error occurred during the processing of a previous batch then we may have emptied the inputs
+		// without adding a new one. In this case, end the sequence rather than infinite looping.
+		if len(seq.inputs) == 0 {
+			slog.Error("removing sequence due to no input tokens", "index", seqIdx, "cache id", seq.cache.Id)
+			s.removeSequence(seqIdx, "error")
+			continue
+		}
+
 		// if past the num predict limit
-		if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
+		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
 			s.removeSequence(seqIdx, "limit")
 			continue
 		}
 
-		if seq.numPast+len(seq.inputs) > s.cache.numCtx {
-			s.shiftContext(seq)
-		}
-
 		var numInputsProcessed int
+		shifted := false
+
 		for i, input := range seq.inputs {
+			if len(seq.cache.Inputs)+1 > s.cache.numCtx {
+				if !shifted {
+					s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
+					shifted = true
+				} else {
+					break
+				}
+			}
+
 			embedding := input.embed != nil
 
 			// If we don't currently have a batch, use one of the correct type and
@@ -403,13 +407,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}
 
 			crossAttention = seq.crossAttention
-			batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
-			seq.numPast++
+			batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id)
+			seq.cache.Inputs = append(seq.cache.Inputs, input)
 			numInputsProcessed++
 		}
 
 		if numInputsProcessed > 0 {
-			seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
 			seq.inputs = seq.inputs[numInputsProcessed:]
 			seq.iBatch = batch.NumTokens() - 1
 		}
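With this change the batch position of each token is derived from len(seq.cache.Inputs) rather than a separately maintained numPast counter, and each input is appended to the cache record the moment it is batched, so the two can no longer drift apart. A reduced sketch of that bookkeeping (the input and slot types are simplified stand-ins, not the real llama.Batch API):

package main

import "fmt"

type input struct{ token int }

type slot struct{ Inputs []input }

func addToBatch(s *slot, pending []input) {
	for i, in := range pending {
		pos := len(s.Inputs)        // position == tokens already recorded in the cache
		last := i+1 == len(pending) // request logits only for the final token
		fmt.Printf("add token=%d pos=%d logits=%v\n", in.token, pos, last)
		s.Inputs = append(s.Inputs, in) // record immediately so pos stays in sync
	}
}

func main() {
	s := &slot{}
	addToBatch(s, []input{{1}, {2}, {3}})
}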
@@ -427,6 +430,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return
 	}
 
+	if crossAttention {
+		// synchronize state to ensure the cross attention batch is complete.
+		// needed specifically for multi-GPU systems otherwise an inflight
+		// task may be incorrectly invalidated causing a crash
+		s.lc.Synchronize()
+	}
+
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -627,12 +637,17 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// TODO (jmorganca): add to sequence queue instead of
-	// failing if a slot isn't available
+	// Ensure that a place to put the sequence is available
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		slog.Error("Failed to acquire semaphore", "error", err)
+		return
+	}
+	defer s.seqsSem.Release(1)
+
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
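The Acquire/Release pair added here is the standard golang.org/x/sync/semaphore pattern: a request now waits for one of the parallel slots instead of failing outright, and it gives up cleanly if the client disconnects, since cancelling r.Context() aborts the Acquire. A minimal standalone sketch:

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/semaphore"
)

func main() {
	sem := semaphore.NewWeighted(2) // e.g. --parallel 2

	handle := func(ctx context.Context, id int) error {
		if err := sem.Acquire(ctx, 1); err != nil {
			return fmt.Errorf("request %d: %w", id, err) // ctx cancelled while waiting
		}
		defer sem.Release(1)
		fmt.Println("processing request", id)
		return nil
	}

	for i := 0; i < 3; i++ {
		_ = handle(context.Background(), i)
	}
}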
@@ -711,11 +726,17 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
+	// Ensure that a place to put the sequence is available
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		slog.Error("Failed to acquire semaphore", "error", err)
+		return
+	}
+	defer s.seqsSem.Release(1)
+
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@@ -802,10 +823,6 @@ func (s *Server) loadModel(
 		}
 	}
 
-	if s.model.AddBOSToken() {
-		s.bosToken = 1
-	}
-
 	if ppath != "" {
 		var err error
 		s.image, err = NewImageContext(s.lc, ppath)
@@ -814,7 +831,10 @@ func (s *Server) loadModel(
 		}
 	}
 
-	s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	if err != nil {
+		panic(err)
+	}
 
 	s.status = ServerStatusReady
 	s.ready.Done()
@@ -867,6 +887,7 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
+		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
@@ -306,9 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 
 	// Note: we always put the dependency path first
 	// since this was the exact version we compiled/linked against
-	if gpus[0].DependencyPath != "" {
+	if gpus[0].DependencyPath != nil {
 		// assume gpus from the same library have the same dependency path
-		libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
+		libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 	}
 
 	server := filepath.Join(dir, "ollama_llama_server")
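Judging from the new append form, DependencyPath has changed from a single string to a list of directories, so it can be prepended wholesale rather than wrapped in a one-element slice. A small illustration of the prepend idiom, with made-up paths (the LD_LIBRARY_PATH join is only for the demo):

package main

import (
	"fmt"
	"strings"
)

func main() {
	depPaths := []string{"/opt/ollama/cuda_v12"}           // from the GPU library
	libraryPaths := []string{"/usr/lib", "/usr/local/lib"} // existing search path

	// prepend: dependency dirs come first, since they match the compiled version
	libraryPaths = append(depPaths, libraryPaths...)

	fmt.Println("LD_LIBRARY_PATH=" + strings.Join(libraryPaths, ":"))
}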
@@ -1092,9 +1092,11 @@ func (s *llmServer) EstimatedTotal() uint64 {
 
 func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
+			if i < len(s.estimate.GPUSizes) {
 				return s.estimate.GPUSizes[i]
 			}
 		}
+	}
 	return 0
 }
@@ -65,9 +65,22 @@ var (
 	errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
 )
 
+type ParserError struct {
+	LineNumber int
+	Msg        string
+}
+
+func (e *ParserError) Error() string {
+	if e.LineNumber > 0 {
+		return fmt.Sprintf("(line %d): %s", e.LineNumber, e.Msg)
+	}
+	return e.Msg
+}
+
 func ParseFile(r io.Reader) (*File, error) {
 	var cmd Command
 	var curr state
+	var currLine int = 1
 	var b bytes.Buffer
 	var role string
 
@@ -84,11 +97,18 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}
 
+		if isNewline(r) {
+			currLine++
+		}
+
 		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
 		} else if err != nil {
-			return nil, err
+			return nil, &ParserError{
+				LineNumber: currLine,
+				Msg:        err.Error(),
+			}
 		}
 
 		// process the state transition, some transitions need to be intercepted and redirected
@@ -96,7 +116,10 @@ func ParseFile(r io.Reader) (*File, error) {
 		switch curr {
 		case stateName:
 			if !isValidCommand(b.String()) {
-				return nil, errInvalidCommand
+				return nil, &ParserError{
+					LineNumber: currLine,
+					Msg:        errInvalidCommand.Error(),
+				}
 			}
 
 			// next state sometimes depends on the current buffer value
@@ -117,7 +140,10 @@ func ParseFile(r io.Reader) (*File, error) {
 			cmd.Name = b.String()
 		case stateMessage:
 			if !isValidMessageRole(b.String()) {
-				return nil, errInvalidMessageRole
+				return nil, &ParserError{
+					LineNumber: currLine,
+					Msg:        errInvalidMessageRole.Error(),
+				}
 			}
 
 			role = b.String()
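Callers can now recover the failing line number by unwrapping the returned error to *ParserError with errors.As, while plain sentinel comparisons keep working via errors.Is. A minimal usage sketch, assuming the upstream module path github.com/ollama/ollama/parser and an illustrative Modelfile:

package main

import (
	"errors"
	"fmt"
	"strings"

	"github.com/ollama/ollama/parser"
)

func main() {
	_, err := parser.ParseFile(strings.NewReader("FROM foo\nBADCOMMAND x y\n"))

	var pErr *parser.ParserError
	if errors.As(err, &pErr) {
		fmt.Printf("parse failed at line %d: %s\n", pErr.LineNumber, pErr.Msg)
		return
	}
	if err != nil {
		fmt.Println("parse failed:", err)
	}
}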
@@ -3,6 +3,7 @@ package parser
 import (
 	"bytes"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@ -180,8 +181,15 @@ func TestParseFileBadCommand(t *testing.T) {
 FROM foo
 BADCOMMAND param1 value1
 `
+	parserError := &ParserError{
+		LineNumber: 3,
+		Msg:        errInvalidCommand.Error(),
+	}
+
 	_, err := ParseFile(strings.NewReader(input))
-	require.ErrorIs(t, err, errInvalidCommand)
+	if !errors.As(err, &parserError) {
+		t.Errorf("unexpected error: expected: %s, actual: %s", parserError.Error(), err.Error())
+	}
 }
 
 func TestParseFileMessages(t *testing.T) {
@@ -245,7 +253,10 @@ FROM foo
 MESSAGE badguy I'm a bad guy!
 `,
 			nil,
-			errInvalidMessageRole,
+			&ParserError{
+				LineNumber: 3,
+				Msg:        errInvalidMessageRole.Error(),
+			},
 		},
 		{
 			`
@@ -264,13 +275,35 @@ MESSAGE system`,
 		},
 	}
 
-	for _, c := range cases {
+	for _, tt := range cases {
 		t.Run("", func(t *testing.T) {
-			modelfile, err := ParseFile(strings.NewReader(c.input))
-			require.ErrorIs(t, err, c.err)
+			modelfile, err := ParseFile(strings.NewReader(tt.input))
+
 			if modelfile != nil {
-				assert.Equal(t, c.expected, modelfile.Commands)
+				assert.Equal(t, tt.expected, modelfile.Commands)
 			}
+
+			if tt.err == nil {
+				if err != nil {
+					t.Fatalf("expected no error, but got %v", err)
+				}
+				return
+			}
+
+			switch tt.err.(type) {
+			case *ParserError:
+				var pErr *ParserError
+				if errors.As(err, &pErr) {
+					// got the correct type of error
+					return
+				}
+			}
+
+			if errors.Is(err, tt.err) {
+				return
+			}
+
+			t.Fatalf("unexpected error: expected: %v, actual: %v", tt.err, err)
 		})
 	}
 }
@@ -4,9 +4,12 @@
 
 set -eu
 
+red="$( (/usr/bin/tput bold; /usr/bin/tput setaf 1; :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0; :) 2>&-)"
+
 status() { echo ">>> $*" >&2; }
-error() { echo "ERROR $*"; exit 1; }
-warning() { echo "WARNING: $*"; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*"; }
 
 TEMP_DIR=$(mktemp -d)
 cleanup() { rm -rf $TEMP_DIR; }
@@ -93,6 +96,22 @@ else
     fi
 fi
 
+# Check for NVIDIA JetPack systems with additional downloads
+if [ -f /etc/nv_tegra_release ] ; then
+    if grep R36 /etc/nv_tegra_release > /dev/null ; then
+        status "Downloading JetPack 6 components"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack6.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    elif grep R35 /etc/nv_tegra_release > /dev/null ; then
+        status "Downloading JetPack 5 components"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack5.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    else
+        warning "Unsupported JetPack version detected. GPU may not be supported"
+    fi
+fi
+
 install_success() {
     status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -146,6 +165,12 @@ EOF
         start_service() { $SUDO systemctl restart ollama; }
         trap start_service EXIT
         ;;
+    *)
+        warning "systemd is not running"
+        if [ "$IS_WSL2" = true ]; then
+            warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
+        fi
+        ;;
 esac
 }
 
@@ -163,6 +188,13 @@ if [ "$IS_WSL2" = true ]; then
     exit 0
 fi
 
+# Don't attempt to install drivers on Jetson systems
+if [ -f /etc/nv_tegra_release ] ; then
+    status "NVIDIA JetPack ready."
+    install_success
+    exit 0
+fi
+
 # Install GPU dependencies on Linux
 if ! available lspci && ! available lshw; then
     warning "Unable to detect NVIDIA/AMD GPU. Install lspci or lshw to automatically detect and install GPU dependencies."
@@ -32,7 +32,7 @@ func TestChatPrompt(t *testing.T) {
 	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
 
 	createImg := func(width, height int) ([]byte, error) {
-		img := image.NewRGBA(image.Rect(0, 0, 5, 5))
+		img := image.NewRGBA(image.Rect(0, 0, width, height))
 		var buf bytes.Buffer
 
 		if err := png.Encode(&buf, img); err != nil {