Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 15:08:27 +00:00)

Compare commits: v0.1.34-al ... v0.1.35-al (47 commits)
Commits in this range:
33d0209023, 879e2caf8c, c4014e73a2, be9efdb981, 074dc3b9d8, 86f9b582d5, 4142c3ef7c, 6602e793c0, ea0fdaed28, 1eb382da5a, bb6fd02298, 7e2bceceee, 30a7d7096c, 200a18820e, e03637176d, c02db93243, ffa4d5134a, 302d7fdbf3, cf442cd57e, 0e1ba65855, 6aad333c63, 4fcc84e67a, 3ae2f441e0, 2abb3f6424, ce3b212d12, 83d6d46e29, 354ad9254e, 58876091f7, dc18eee39d, 8727a9c140, d0425f26cf, cfa84b8470, 1580ed4c06, a7ee84fc31, 84ac7ce139, 788b092c49, 5cde17a096, c3837eb08c, 8cc0ee2efe, d5eec16d23, a3906a6173, daa1a032f7, 6042e8bc57, 920a4b0794, c496967e56, c942e4a07b, bd54b08261
@@ -331,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
+- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
 
 ### Libraries
 
@@ -357,7 +358,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
+- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
+- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript)
 
 ### Mobile
 
 - [Enchanted](https://github.com/AugustDev/enchanted)
api/types.go (33 changed lines)

@@ -4,6 +4,7 @@ import (
     "encoding/json"
     "errors"
     "fmt"
+    "log/slog"
     "math"
     "os"
     "reflect"
@@ -116,6 +117,7 @@ type ChatResponse struct {
     Model string `json:"model"`
     CreatedAt time.Time `json:"created_at"`
     Message Message `json:"message"`
+    DoneReason string `json:"done_reason,omitempty"`
 
     Done bool `json:"done"`
 
@@ -161,7 +163,6 @@ type Runner struct {
     UseNUMA bool `json:"numa,omitempty"`
     NumCtx int `json:"num_ctx,omitempty"`
     NumBatch int `json:"num_batch,omitempty"`
-    NumGQA int `json:"num_gqa,omitempty"`
     NumGPU int `json:"num_gpu,omitempty"`
     MainGPU int `json:"main_gpu,omitempty"`
     LowVRAM bool `json:"low_vram,omitempty"`
@@ -171,11 +172,6 @@ type Runner struct {
     UseMMap bool `json:"use_mmap,omitempty"`
     UseMLock bool `json:"use_mlock,omitempty"`
     NumThread int `json:"num_thread,omitempty"`
-
-    // Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
-    RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
-    // Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
-    RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 }
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -205,10 +201,13 @@ type CreateRequest struct {
     Path string `json:"path"`
     Modelfile string `json:"modelfile"`
     Stream *bool `json:"stream,omitempty"`
-    Quantization string `json:"quantization,omitempty"`
+    Quantize string `json:"quantize,omitempty"`
 
     // Name is deprecated, see Model
     Name string `json:"name"`
+
+    // Quantization is deprecated, see Quantize
+    Quantization string `json:"quantization,omitempty"`
 }
 
 // DeleteRequest is the request passed to [Client.Delete].
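With this rename, clients ask for quantization at create time through `Quantize`; the old `Quantization` field is kept only for backward compatibility. A minimal client-side sketch (it assumes `api.ClientFromEnvironment` and the progress-callback signature accepted by `client.Create`; adjust if the actual API differs):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // assumed helper; honors OLLAMA_HOST
	if err != nil {
		log.Fatal(err)
	}

	req := &api.CreateRequest{
		Name:      "my-quantized-model",
		Modelfile: "FROM llama3",
		Quantize:  "q4_0", // was Quantization before this change
	}

	// Progress callback; signature assumed to match client.Create.
	err = client.Create(context.Background(), req, func(p api.ProgressResponse) error {
		fmt.Println(p.Status)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```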
@@ -314,6 +313,9 @@ type GenerateResponse struct {
     // Done specifies if the response is complete.
     Done bool `json:"done"`
 
+    // DoneReason is the reason the model stopped generating text.
+    DoneReason string `json:"done_reason,omitempty"`
+
     // Context is an encoding of the conversation used in this response; this
     // can be sent in the next request to keep a conversational memory.
     Context []int `json:"context,omitempty"`
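`DoneReason` reports why generation stopped alongside the existing `Done` flag. Continuing the client sketch above, a streaming handler might inspect it like this (the concrete reason strings, e.g. "stop" or "length", are assumptions, not documented here):

```go
// Streaming generate; the callback runs once per response chunk.
err = client.Generate(context.Background(), &api.GenerateRequest{
	Model:  "llama3",
	Prompt: "Why is the sky blue?",
}, func(r api.GenerateResponse) error {
	fmt.Print(r.Response)
	if r.Done {
		fmt.Println("\nstopped because:", r.DoneReason)
	}
	return nil
})
```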
@@ -359,8 +361,6 @@ func (m *Metrics) Summary() {
     }
 }
 
-// ErrInvalidOpts is returned when invalid options are passed to the client.
-var ErrInvalidOpts = errors.New("invalid options")
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
 func (opts *Options) FromMap(m map[string]interface{}) error {
@@ -376,9 +376,13 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
         }
     }
 
-    invalidOpts := []string{}
     for key, val := range m {
-        if opt, ok := jsonOpts[key]; ok {
+        opt, ok := jsonOpts[key]
+        if !ok {
+            slog.Warn("invalid option provided", "option", opt.Name)
+            continue
+        }
+
         field := valueOpts.FieldByName(opt.Name)
         if field.IsValid() && field.CanSet() {
             if val == nil {
@@ -435,14 +439,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
                 return fmt.Errorf("unknown type loading config params: %v", field.Kind())
             }
         }
-        } else {
-            invalidOpts = append(invalidOpts, key)
-        }
     }
 
-    if len(invalidOpts) > 0 {
-        return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-    }
     return nil
 }
 
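The net effect of the `FromMap` changes: an unrecognized option no longer aborts the whole request with `ErrInvalidOpts`; it is warned about and skipped while the valid keys are still applied. A rough sketch of the new behavior:

```go
opts := api.DefaultOptions()
err := opts.FromMap(map[string]interface{}{
	"temperature":   0.2,  // known option, applied
	"not_an_option": true, // previously returned ErrInvalidOpts; now only logged as a warning
})
fmt.Println(err, opts.Temperature) // <nil> 0.2
```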
@@ -475,7 +473,6 @@ func DefaultOptions() Options {
         NumCtx: 2048,
         NumBatch: 512,
         NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
-        NumGQA: 1,
         NumThread: 0, // let the runtime decide
         LowVRAM: false,
         F16KV: true,
@@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
         return nil
     }
 
-    quantization, _ := cmd.Flags().GetString("quantization")
+    quantize, _ := cmd.Flags().GetString("quantize")
 
-    request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
+    request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize}
     if err := client.Create(cmd.Context(), &request, fn); err != nil {
         return err
     }
@@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command {
     }
 
     createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
-    createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
+    createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
 
     showCmd := &cobra.Command{
         Use: "show MODEL",
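On the CLI this is the `-q`/`--quantize` flag on `ollama create`, e.g. `ollama create my-model -f Modelfile -q q4_0`; only the long flag name moves from `--quantization` to `--quantize`, the `-q` shorthand is unchanged.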
@@ -6,7 +6,7 @@
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
-* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
+* [Docker Documentation](./docker.md)
 
 ### Reference
 
@@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{
     "numa": false,
     "num_ctx": 1024,
     "num_batch": 2,
-    "num_gqa": 1,
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
@@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
-    "rope_frequency_base": 1.1,
-    "rope_frequency_scale": 0.8,
     "num_thread": 8
   }
 }'
docs/docker.md (new file, 71 lines)

@@ -0,0 +1,71 @@
+# Ollama Docker image
+
+### CPU only
+
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### Nvidia GPU
+Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
+
+#### Install with Apt
+1. Configure the repository
+```bash
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+```
+2. Install the NVIDIA Container Toolkit packages
+```bash
+sudo apt-get install -y nvidia-container-toolkit
+```
+
+#### Install with Yum or Dnf
+1. Configure the repository
+
+```bash
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+```
+
+2. Install the NVIDIA Container Toolkit packages
+
+```bash
+sudo yum install -y nvidia-container-toolkit
+```
+
+#### Configure Docker to use Nvidia driver
+```
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+#### Start the container
+
+```bash
+docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### AMD GPU
+
+To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
+
+```
+docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
+```
+
+### Run model locally
+
+Now you can run a model:
+
+```
+docker exec -it ollama ollama run llama3
+```
+
+### Try different models
+
+More models can be found on the [Ollama library](https://ollama.com/library).
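Once the container is up, the same API used elsewhere in these docs is reachable on the published port. A small standard-library Go sketch against `/api/generate` (it assumes the `llama3` model has already been pulled inside the container):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Non-streaming request to the container started above.
	body := []byte(`{"model": "llama3", "prompt": "Why is the sky blue?", "stream": false}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	out, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out)) // JSON including "response", "done" and "done_reason"
}
```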
@@ -83,3 +83,22 @@ If your system is configured with the "noexec" flag where Ollama stores its
 temporary executable files, you can specify an alternate location by setting
 OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
 OLLAMA_TMPDIR=/usr/share/ollama/
+
+## Container fails to run on NVIDIA GPU
+
+Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+
+Sometimes the container runtime can have difficulties initializing the GPU.
+When you check the server logs, this can show up as various error codes, such
+as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
+(unknown), or others. The following troubleshooting techniques may help resolve
+the problem.
+
+- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
@@ -5,13 +5,13 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
 To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
 
 ```bash
-npm install langchain
+npm install @langchain/community
 ```
 
 Now we can start building out our JavaScript:
 
 ```javascript
-import { Ollama } from "langchain/llms/ollama";
+import { Ollama } from "@langchain/community/llms/ollama";
 
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
(deleted file)

@@ -1,10 +0,0 @@
-# Bash Shell examples
-
-When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
-
-`ollama run llama3 < sourcequestions.txt`
-
-This concept is used in the following example.
-
-## Compare Models
-`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.
(deleted file)

@@ -1,64 +0,0 @@
-#! /usr/bin/env bash
-# Compare multiple models by running them with the same questions
-
-NUMBEROFCHOICES=4
-SELECTIONS=()
-declare -a SUMS=()
-
-# Get the list of models
-CHOICES=$(ollama list | awk '{print $1}')
-
-# Select which models to run as a comparison
-echo "Select $NUMBEROFCHOICES models to compare:"
-select ITEM in $CHOICES; do
-    if [[ -n $ITEM ]]; then
-        echo "You have selected $ITEM"
-        SELECTIONS+=("$ITEM")
-        ((COUNT++))
-        if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
-            break
-        fi
-    else
-        echo "Invalid selection"
-    fi
-done
-
-# Loop through each of the selected models
-for ITEM in "${SELECTIONS[@]}"; do
-    echo "--------------------------------------------------------------"
-    echo "Loading the model $ITEM into memory"
-    ollama run "$ITEM" ""
-    echo "--------------------------------------------------------------"
-    echo "Running the questions through the model $ITEM"
-    COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
-
-    # eval duration is sometimes listed in seconds and sometimes in milliseconds.
-    # Add up the values for each model
-    SUM=$(echo "$COMMAND_OUTPUT" | awk '
-    /eval duration:/ {
-        value = $3
-        if (index(value, "ms") > 0) {
-            gsub("ms", "", value)
-            value /= 1000
-        } else {
-            gsub("s", "", value)
-        }
-        sum += value
-    }
-    END { print sum }')
-
-
-    SUMS+=("All questions for $ITEM completed in $SUM seconds")
-done
-
-echo ""
-echo "--------------------------------------------------------------"
-echo -e "Sums of eval durations for each run:"
-for val in "${SUMS[@]}"; do
-    echo "$val"
-done
-
-echo "--------------------------------------------------------------"
-echo "Comparison complete. Now you can decide"
-echo "which model is best."
-echo "--------------------------------------------------------------"
(deleted file)

@@ -1,7 +0,0 @@
-Why is the sky blue
-What is a black hole
-Explain the big bang theory like I am 5?
-What is the quickest way to win a game of Monopoly with 3 others?
-Why does a vacuum bottle keep my coffee hot and my milkshake cold?
-What is the difference between a meteor, a meteorite, and a meteoroid?
-Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.
@@ -3,7 +3,6 @@ package gpu
 import (
     "fmt"
     "log/slog"
-    "strconv"
     "syscall"
     "unsafe"
 
@@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
     hl.dll = 0
 }
 
-func (hl *HipLib) AMDDriverVersion() (string, error) {
+func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
     if hl.dll == 0 {
-        return "", fmt.Errorf("dll has been unloaded")
+        return 0, 0, fmt.Errorf("dll has been unloaded")
     }
     var version int
     status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
     if status != hipSuccess {
-        return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
+        return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
     }
-    return strconv.Itoa(version), nil
+
+    slog.Debug("hipDriverGetVersion", "version", version)
+    // TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
+    driverMajor = version / 1000
+    driverMinor = (version - (driverMajor * 1000)) / 10
+
+    return driverMajor, driverMinor, nil
 }
 
 func (hl *HipLib) HipGetDeviceCount() int {
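The packed integer reported by `hipDriverGetVersion` is split with plain integer arithmetic (thousands for the major version, tens for the minor). An illustrative sketch with a made-up value:

```go
version := 60140 // hypothetical packed value from hipDriverGetVersion
driverMajor := version / 1000                    // 60
driverMinor := (version - driverMajor*1000) / 10 // 14
fmt.Printf("%d.%d\n", driverMajor, driverMinor)  // "60.14"
```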
@@ -8,6 +8,7 @@ import (
     "log/slog"
     "os"
     "path/filepath"
+    "regexp"
     "slices"
     "strconv"
     "strings"
@@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
     }
 
     // Opportunistic logging of driver version to aid in troubleshooting
-    ver, err := AMDDriverVersion()
-    if err == nil {
-        slog.Info("AMD Driver: " + ver)
-    } else {
+    driverMajor, driverMinor, err := AMDDriverVersion()
+    if err != nil {
         // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
         slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
     }
@@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
     scanner := bufio.NewScanner(fp)
     isCPU := false
     var major, minor, patch uint64
+    var vendor, device uint64
     for scanner.Scan() {
         line := strings.TrimSpace(scanner.Text())
         // Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
                 slog.Debug("malformed int " + line)
                 continue
             }
+        } else if strings.HasPrefix(line, "vendor_id") {
+            ver := strings.Fields(line)
+            if len(ver) != 2 {
+                slog.Debug("malformed vendor_id", "vendor_id", line)
+                continue
+            }
+            vendor, err = strconv.ParseUint(ver[1], 10, 32)
+            if err != nil {
+                slog.Debug("malformed vendor_id" + line)
+            }
+        } else if strings.HasPrefix(line, "device_id") {
+            ver := strings.Fields(line)
+            if len(ver) != 2 {
+                slog.Debug("malformed device_id", "device_id", line)
+                continue
+            }
+            device, err = strconv.ParseUint(ver[1], 10, 32)
+            if err != nil {
+                slog.Debug("malformed device_id" + line)
+            }
         }
 
         // TODO - any other properties we want to extract and record?
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
     }
 
     if int(major) < RocmComputeMin {
-        slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
+        slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
         continue
     }
 
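The format-string fix matters because the minor and patch components of an AMD gfx target are written as hexadecimal digits; `gfx%d%x%x` is also the format used for the new `Compute` field below, so the warning and the recorded value now agree. A small illustration:

```go
major, minor, patch := uint64(9), uint64(0), uint64(0xa)
fmt.Println(fmt.Sprintf("gfx%d%x%x", major, minor, patch)) // "gfx90a"
```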
@@ -210,12 +230,17 @@ func AMDGetGPUInfo() []GpuInfo {
 
     // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
     if totalMemory < IGPUMemLimit {
-        slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+        slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
         continue
     }
+    var name string
+    // TODO - PCI ID lookup
+    if vendor > 0 && device > 0 {
+        name = fmt.Sprintf("%04x:%04x", vendor, device)
+    }
 
-    slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-    slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+    slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+    slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
     gpuInfo := GpuInfo{
         Library: "rocm",
         memInfo: memInfo{
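When the sysfs properties expose numeric vendor and device IDs, the GPU name is rendered as a PCI-style hex pair. Illustrative values (0x1002 is the PCI vendor ID assigned to AMD/ATI):

```go
vendor, device := uint64(4098), uint64(29772) // hypothetical values parsed from sysfs
name := fmt.Sprintf("%04x:%04x", vendor, device)
fmt.Println(name) // "1002:744c"
```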
@@ -223,11 +248,11 @@ func AMDGetGPUInfo() []GpuInfo {
             FreeMemory: (totalMemory - usedMemory),
         },
         ID: fmt.Sprintf("%d", gpuID),
-        // Name: not exposed in sysfs directly, would require pci device id lookup
-        Major: int(major),
-        Minor: int(minor),
-        Patch: int(patch),
+        Name: name,
+        Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
         MinimumMemory: rocmMinimumMemory,
+        DriverMajor: driverMajor,
+        DriverMinor: driverMinor,
     }
 
     // If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
         }
         slog.Debug("rocm supported GPUs", "types", supported)
     }
-    gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
+    gfx := gpuInfo.Compute
     if !slices.Contains[[]string, string](supported, gfx) {
         slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
         // TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
         slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
     }
 } else {
-    slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+    slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 }
 
 // The GPU has passed all the verification steps and is supported
@@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
     return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
 
-func AMDDriverVersion() (string, error) {
-    _, err := os.Stat(DriverVersionFile)
+func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
+    _, err = os.Stat(DriverVersionFile)
     if err != nil {
-        return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
+        return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
     }
     fp, err := os.Open(DriverVersionFile)
     if err != nil {
-        return "", err
+        return 0, 0, err
    }
    defer fp.Close()
    verString, err := io.ReadAll(fp)
    if err != nil {
-        return "", err
+        return 0, 0, err
    }
-    return strings.TrimSpace(string(verString)), nil
+
+    pattern := `\A(\d+)\.(\d+).*`
+    regex := regexp.MustCompile(pattern)
+    match := regex.FindStringSubmatch(string(verString))
+    if len(match) < 2 {
+        return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
+    }
+    driverMajor, err = strconv.Atoi(match[1])
+    if err != nil {
+        return 0, 0, err
+    }
+    driverMinor, err = strconv.Atoi(match[2])
+    if err != nil {
+        return 0, 0, err
+    }
+    return driverMajor, driverMinor, nil
 }
@@ -7,7 +7,6 @@ import (
     "os"
     "path/filepath"
     "slices"
-    "strconv"
     "strings"
 
     "github.com/ollama/ollama/format"
@@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
     }
     defer hl.Release()
 
-    ver, err := hl.AMDDriverVersion()
-    if err == nil {
-        slog.Info("AMD Driver: " + ver)
-    } else {
-        // For now this is benign, but we may eventually need to fail compatibility checks
-        slog.Debug("error looking up amd driver version", "error", err)
-    }
+    // TODO - this reports incorrect version information, so omitting for now
+    // driverMajor, driverMinor, err := hl.AMDDriverVersion()
+    // if err != nil {
+    //     // For now this is benign, but we may eventually need to fail compatibility checks
+    //     slog.Debug("error looking up amd driver version", "error", err)
+    // }
 
     // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
     count := hl.HipGetDeviceCount()
@@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
             return nil
         }
     } else {
-        slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+        slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
     }
 
-    slog.Info("detected hip devices", "count", count)
+    slog.Debug("detected hip devices", "count", count)
     // TODO how to determine the underlying device ID when visible devices is causing this to subset?
     for i := 0; i < count; i++ {
         err = hl.HipSetDevice(i)
@@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
         // Can luid be used on windows for setting visible devices (and is it actually set?)
         n = bytes.IndexByte(props.GcnArchName[:], 0)
         gfx := string(props.GcnArchName[:n])
-        slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
-        var major, minor, patch string
-        switch len(gfx) {
-        case 6:
-            major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
-        case 7:
-            major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
-        }
+        slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
         //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
         // TODO Why isn't props.iGPU accurate!?
         if strings.EqualFold(name, iGPUName) {
-            slog.Info("iGPU detected skipping", "id", i)
+            slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
             continue
         }
         if gfxOverride == "" {
@@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
                 slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
                 continue
             } else {
-                slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+                slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
             }
         }
 
@@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {
 
         // TODO revisit this once ROCm v6 is available on windows.
         // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
-        slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-        slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
+        slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+        slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
         gpuInfo := GpuInfo{
             Library: "rocm",
             memInfo: memInfo{
@@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
             ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
             DependencyPath: libDir,
             MinimumMemory: rocmMinimumMemory,
-        }
-        if major != "" {
-            gpuInfo.Major, err = strconv.Atoi(major)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            }
-        }
-        if minor != "" {
-            gpuInfo.Minor, err = strconv.Atoi(minor)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            }
-        }
-        if patch != "" {
-            // Patch rev is hex; e.g. gfx90a
-            p, err := strconv.ParseInt(patch, 16, 0)
-            if err != nil {
-                slog.Info("failed to parse version", "version", gfx, "error", err)
-            } else {
-                gpuInfo.Patch = int(p)
-            }
-        }
-        if gpuInfo.Major < RocmComputeMin {
-            slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
-            continue
+            Name: name,
+            Compute: gfx,
+
+            // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+            // DriverMajor: driverMajor,
+            // DriverMinor: driverMinor,
         }
 
         resp = append(resp, gpuInfo)
@@ -8,14 +8,14 @@ import (
 
 func GetCPUVariant() string {
     if cpu.X86.HasAVX2 {
-        slog.Info("CPU has AVX2")
+        slog.Debug("CPU has AVX2")
         return "avx2"
     }
     if cpu.X86.HasAVX {
-        slog.Info("CPU has AVX")
+        slog.Debug("CPU has AVX")
         return "avx"
     }
-    slog.Info("CPU does not have vector extensions")
+    slog.Debug("CPU does not have vector extensions")
     // else LCD
     return ""
 }
gpu/gpu.go (20 changed lines)

@@ -31,8 +31,8 @@ type handles struct {
 }
 
 const (
-    cudaMinimumMemory = 256 * format.MebiByte
-    rocmMinimumMemory = 256 * format.MebiByte
+    cudaMinimumMemory = 457 * format.MebiByte
+    rocmMinimumMemory = 457 * format.MebiByte
 )
 
 var gpuMutex sync.Mutex
@@ -119,12 +119,12 @@ func initGPUHandles() *handles {
         return gpuHandles
     }
 
-    slog.Info("Detecting GPUs")
+    slog.Debug("Detecting GPUs")
     nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
     if len(nvcudaLibPaths) > 0 {
         deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
         if nvcuda != nil {
-            slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
+            slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
             gpuHandles.nvcuda = nvcuda
             gpuHandles.deviceCount = deviceCount
             return gpuHandles
@@ -135,7 +135,7 @@ func initGPUHandles() *handles {
     if len(cudartLibPaths) > 0 {
         deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
         if cudart != nil {
-            slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
+            slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
             gpuHandles.cudart = cudart
             gpuHandles.deviceCount = deviceCount
             return gpuHandles
@@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList {
         gpuInfo := GpuInfo{
             Library: "cuda",
         }
+        var driverMajor int
+        var driverMinor int
         if gpuHandles.cudart != nil {
             C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
         } else {
             C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+            driverMajor = int(gpuHandles.nvcuda.driver_major)
+            driverMinor = int(gpuHandles.nvcuda.driver_minor)
         }
         if memInfo.err != nil {
             slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList {
         gpuInfo.TotalMemory = uint64(memInfo.total)
         gpuInfo.FreeMemory = uint64(memInfo.free)
         gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-        gpuInfo.Major = int(memInfo.major)
-        gpuInfo.Minor = int(memInfo.minor)
+        gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
         gpuInfo.MinimumMemory = cudaMinimumMemory
         gpuInfo.DependencyPath = depPath
+        gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+        gpuInfo.DriverMajor = int(driverMajor)
+        gpuInfo.DriverMinor = int(driverMinor)
 
         // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
         resp = append(resp, gpuInfo)
@@ -15,7 +15,7 @@ import (
 )
 
 const (
-    metalMinimumMemory = 384 * format.MebiByte
+    metalMinimumMemory = 512 * format.MebiByte
 )
 
 func GetGPUInfo() GpuInfoList {
@@ -39,16 +39,19 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GPU_ID_LEN 64
|
#define GPU_ID_LEN 64
|
||||||
|
#define GPU_NAME_LEN 96
|
||||||
|
|
||||||
typedef struct mem_info {
|
typedef struct mem_info {
|
||||||
char *err; // If non-nill, caller responsible for freeing
|
char *err; // If non-nill, caller responsible for freeing
|
||||||
char gpu_id[GPU_ID_LEN];
|
char gpu_id[GPU_ID_LEN];
|
||||||
|
char gpu_name[GPU_NAME_LEN];
|
||||||
uint64_t total;
|
uint64_t total;
|
||||||
uint64_t free;
|
uint64_t free;
|
||||||
|
|
||||||
// Compute Capability
|
// Compute Capability
|
||||||
int major;
|
int major;
|
||||||
int minor;
|
int minor;
|
||||||
|
int patch;
|
||||||
} mem_info_t;
|
} mem_info_t;
|
||||||
|
|
||||||
void cpu_check_ram(mem_info_t *resp);
|
void cpu_check_ram(mem_info_t *resp);
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) {
     if (GlobalMemoryStatusEx(&info) != 0) {
         resp->total = info.ullTotalPhys;
         resp->free = info.ullAvailPhys;
-        resp->major = 0;
-        resp->minor = 0;
         snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
     } else {
         resp->err = LOAD_ERR();
@@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) {
     } else {
         resp->total = info.totalram * info.mem_unit;
         resp->free = info.freeram * info.mem_unit;
-        resp->major = 0;
-        resp->minor = 0;
         snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
     }
     return;
@@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
     {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
     {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+    {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
     {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
     {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
     {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
@@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   }
 
   int version = 0;
-  nvcudaDriverVersion_t driverVersion;
-  driverVersion.major = 0;
-  driverVersion.minor = 0;
+  resp->ch.driver_major = 0;
+  resp->ch.driver_minor = 0;
 
   // Report driver version if we're in verbose mode, ignore errors
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
-    driverVersion.major = version / 1000;
-    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+    resp->ch.driver_major = version / 1000;
+    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }
 
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
@@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
     return;
   }
 
-  resp->major = 0;
-  resp->minor = 0;
   int major = 0;
   int minor = 0;
   ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
@@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
     );
   }
 
+  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
+    resp->gpu_name[0] = '\0';
+  }
+
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
@@ -44,12 +44,15 @@ typedef void* CUcontext;
 typedef struct nvcuda_handle {
   void *handle;
   uint16_t verbose;
+  int driver_major;
+  int driver_minor;
   CUresult (*cuInit)(unsigned int Flags);
   CUresult (*cuDriverGetVersion)(int *driverVersion);
   CUresult (*cuDeviceGetCount)(int *);
   CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
   CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
   CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
 
   // Context specific aspects
   CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
gpu/types.go (30 changed lines)

@@ -1,5 +1,12 @@
 package gpu
 
+import (
+    "fmt"
+    "log/slog"
+
+    "github.com/ollama/ollama/format"
+)
+
 type memInfo struct {
     TotalMemory uint64 `json:"total_memory,omitempty"`
     FreeMemory uint64 `json:"free_memory,omitempty"`
@@ -22,9 +29,11 @@ type GpuInfo struct {
     // GPU information
     ID string `json:"gpu_id"` // string to use for selection of this specific GPU
     Name string `json:"name"` // user friendly name if available
-    Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx)
-    Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
-    Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD
+    Compute string `json:"compute"` // Compute Capability or gfx
+
+    // Driver Information - TODO no need to put this on each GPU
+    DriverMajor int `json:"driver_major,omitempty"`
+    DriverMinor int `json:"driver_minor,omitempty"`
 
     // TODO other performance capability info to help in scheduling decisions
 }
@@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
     return resp
 }
 
+// Report the GPU information into the log an Info level
+func (l GpuInfoList) LogDetails() {
+    for _, g := range l {
+        slog.Info("inference compute",
+            "id", g.ID,
+            "library", g.Library,
+            "compute", g.Compute,
+            "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
+            "name", g.Name,
+            "total", format.HumanBytes2(g.TotalMemory),
+            "available", format.HumanBytes2(g.FreeMemory),
+        )
+    }
+}
+
 // Sort by Free Space
 type ByFreeMemory []GpuInfo
 
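`LogDetails` emits one structured "inference compute" line per detected GPU using the new `Compute`, `Name` and driver fields. A sketch of a call from inside the `gpu` package (the values are made up, and the exact slog output depends on the configured handler):

```go
gpus := GpuInfoList{
	{
		memInfo:     memInfo{TotalMemory: 24 << 30, FreeMemory: 23 << 30}, // hypothetical 24/23 GiB
		Library:     "rocm",
		ID:          "0",
		Name:        "1002:744c",
		Compute:     "gfx1100",
		DriverMajor: 6,
		DriverMinor: 3,
	},
}
gpus.LogDetails()
// roughly: level=INFO msg="inference compute" id=0 library=rocm compute=gfx1100 driver=6.3 name=1002:744c total="24.0 GiB" available="23.0 GiB"
```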
@@ -217,7 +217,7 @@ func TestMultiModelStress(t *testing.T) {
             defer wg.Done()
             for j := 0; j < 3; j++ {
                 slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
-                DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+                DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
             }
         }(i)
     }
@@ -85,7 +85,7 @@ func GetTestEndpoint() (*api.Client, string) {
 var serverMutex sync.Mutex
 var serverReady bool
 
-func startServer(ctx context.Context, ollamaHost string) error {
+func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
     // Make sure the server has been built
     CLIName, err := filepath.Abs("../ollama")
     if err != nil {
@@ -200,7 +200,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
     }
     lifecycle.ServerLogFile = fp.Name()
     fp.Close()
-    require.NoError(t, startServer(ctx, testEndpoint))
+    require.NoError(t, startServer(t, ctx, testEndpoint))
 }
 
 return client, testEndpoint, func() {
llm/ext_server/server.cpp (vendored, 86 changed lines)

@@ -66,7 +66,7 @@ struct server_params {
 };
 
 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;
 
 enum stop_type {
     STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
         sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
         sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
                 t_token_generation, n_decoded,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
         });
 
         sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
     {
         if (clp_ctx)
         {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
            clip_free(clp_ctx);
            clp_ctx = nullptr;
         }
@@ -392,7 +392,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context
 
         const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
             server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                 {"slot_id", slot.id},
                 {"n_ctx_slot", slot.n_ctx}
             });
@@ -468,7 +468,7 @@ struct llama_server_context
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
 
-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                     {"slot_id", slot.id},
                     {"ga_n", ga_n},
                     {"ga_w", ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context
 
         all_slots_are_idle = false;
 
-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
             {"slot_id", slot->id},
             {"task_id", slot->task_id},
         });
@@ -1504,7 +1504,7 @@ struct llama_server_context
                     }
                     slots_data.push_back(slot_data);
                 }
-                LOG_INFO("slot data", {
+                LOG_DEBUG("slot data", {
                     {"task_id", task.id},
                     {"n_idle_slots", n_idle_slots},
                     {"n_processing_slots", n_processing_slots}
@@ -1566,7 +1566,7 @@ struct llama_server_context
     bool update_slots() {
         if (system_need_update)
         {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
             system_prompt_update();
         }
 
@@ -1576,7 +1576,7 @@ struct llama_server_context
         {
             if (system_prompt.empty() && clean_kv_cache)
             {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                 kv_cache_clear();
             }
             return true;
@@ -1599,7 +1599,7 @@ struct llama_server_context
             const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
             const int n_discard = n_left / 2;
 
-            LOG_INFO("slot context shift", {
+            LOG_DEBUG("slot context shift", {
                 {"slot_id", slot.id},
                 {"task_id", slot.task_id},
                 {"n_keep", n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
             slot.command = NONE;
|
slot.command = NONE;
|
||||||
slot.t_last_used = ggml_time_us();
|
slot.t_last_used = ggml_time_us();
|
||||||
|
|
||||||
LOG_INFO("slot released", {
|
LOG_DEBUG("slot released", {
|
||||||
{"slot_id", slot.id},
|
{"slot_id", slot.id},
|
||||||
{"task_id", slot.task_id},
|
{"task_id", slot.task_id},
|
||||||
{"n_ctx", n_ctx},
|
{"n_ctx", n_ctx},
|
||||||
@@ -1807,7 +1807,7 @@ struct llama_server_context
|
|||||||
slot.ga_i = ga_i;
|
slot.ga_i = ga_i;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("slot progression", {
|
LOG_DEBUG("slot progression", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "n_past", slot.n_past },
|
{ "n_past", slot.n_past },
|
||||||
@@ -1822,7 +1822,7 @@ struct llama_server_context
|
|||||||
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
|
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id }
|
{ "task_id", slot.task_id }
|
||||||
});
|
});
|
||||||
@@ -1834,7 +1834,7 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||||
LOG_INFO("kv cache rm [p0, end)", {
|
LOG_DEBUG("kv cache rm [p0, end)", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "p0", p0 }
|
{ "p0", p0 }
|
||||||
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
}
|
}
|
||||||
else if (arg == "-v" || arg == "--verbose")
|
else if (arg == "-v" || arg == "--verbose")
|
||||||
{
|
{
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
LOG_WARNING("server.cpp is not built with verbose logging.", {});
|
|
||||||
#else
|
|
||||||
server_verbose = true;
|
server_verbose = true;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
else if (arg == "--mlock")
|
else if (arg == "--mlock")
|
||||||
{
|
{
|
||||||
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
else if (arg == "--log-disable")
|
else if (arg == "--log-disable")
|
||||||
{
|
{
|
||||||
log_set_target(stdout);
|
log_set_target(stdout);
|
||||||
LOG_INFO("logging to file is disabled.", {});
|
LOG_DEBUG("logging to file is disabled.", {});
|
||||||
}
|
}
|
||||||
else if (arg == "--slots-endpoint-disable")
|
else if (arg == "--slots-endpoint-disable")
|
||||||
{
|
{
|
||||||
@@ -2727,12 +2723,12 @@ static json format_detokenized_response(std::string content)
|
|||||||
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
||||||
{
|
{
|
||||||
// skip GH copilot requests when using default port
|
// skip GH copilot requests when using default port
|
||||||
if (req.path == "/v1/health" || req.path == "/v1/completions")
|
if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("request", {
|
LOG_DEBUG("request", {
|
||||||
{"remote_addr", req.remote_addr},
|
{"remote_addr", req.remote_addr},
|
||||||
{"remote_port", req.remote_port},
|
{"remote_port", req.remote_port},
|
||||||
{"status", res.status},
|
{"status", res.status},
|
||||||
@@ -3054,6 +3050,26 @@ int main(int argc, char **argv) {
|
|||||||
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sparams.n_threads_http < 1) {
|
||||||
|
// +2 threads for monitoring endpoints
|
||||||
|
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||||
|
}
|
||||||
|
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
|
||||||
|
svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
|
||||||
|
|
||||||
|
LOG_INFO("HTTP server listening", log_data);
|
||||||
|
// run the HTTP server in a thread - see comment below
|
||||||
|
std::thread t([&]()
|
||||||
|
{
|
||||||
|
if (!svr.listen_after_bind())
|
||||||
|
{
|
||||||
|
state.store(SERVER_STATE_ERROR);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
});
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
if (!llama.load_model(params))
|
if (!llama.load_model(params))
|
||||||
{
|
{
|
||||||
@@ -3258,26 +3274,6 @@ int main(int argc, char **argv) {
|
|||||||
}*/
|
}*/
|
||||||
//);
|
//);
|
||||||
|
|
||||||
if (sparams.n_threads_http < 1) {
|
|
||||||
// +2 threads for monitoring endpoints
|
|
||||||
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
|
||||||
}
|
|
||||||
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
|
|
||||||
svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
|
|
||||||
|
|
||||||
LOG_INFO("HTTP server listening", log_data);
|
|
||||||
// run the HTTP server in a thread - see comment below
|
|
||||||
std::thread t([&]()
|
|
||||||
{
|
|
||||||
if (!svr.listen_after_bind())
|
|
||||||
{
|
|
||||||
state.store(SERVER_STATE_ERROR);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
});
|
|
||||||
|
|
||||||
llama.queue_tasks.on_new_task(std::bind(
|
llama.queue_tasks.on_new_task(std::bind(
|
||||||
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
||||||
llama.queue_tasks.on_finish_multitask(std::bind(
|
llama.queue_tasks.on_finish_multitask(std::bind(
|
||||||
|
|||||||
13 llm/ext_server/utils.hpp vendored

@@ -55,9 +55,10 @@ extern bool server_log_json;
 } while (0)
 #endif

-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG( MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)

 enum server_state {
 SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet

@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
 {"timestamp", time(nullptr)},
 };

+if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
+return;
+}
+
 if (server_log_json) {
 log.merge_patch(
 {

@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,

 std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
 } else {
-char buf[1024];
-snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

 if (!extra.empty()) {
 log.merge_patch(extra);
 }

 std::stringstream ss;
-ss << buf << " |";
+ss << level << " [" << function << "] " << message << " |";
 for (const auto& el : log.items())
 {
 const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
@@ -329,7 +329,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 4*batch*(1+4*embedding+context+context*heads),
 )

-partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
+partialOffload = max(
+4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
+4*batch*(2+3*embedding+context+context*heads),
+)
 case "stablelm":
 fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 partialOffload = max(
@@ -12,17 +12,8 @@ import (

 // This algorithm looks for a complete fit to determine if we need to unload other models
 func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
-var estimatedVRAM uint64
-if opts.NumCtx > int(ggml.KV().ContextLength()) {
-slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-opts.NumCtx = int(ggml.KV().ContextLength())
-}
-
-if opts.NumCtx < 4 {
-opts.NumCtx = 4
-}
-
 // Split up the GPUs by type and try them
+var estimatedVRAM uint64
 for _, gpus := range allGpus.ByLibrary() {
 var layerCount int
 layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
107 llm/server.go

@@ -53,6 +53,7 @@ type llmServer struct {
 estimatedTotal uint64 // Total size of model
 totalLayers uint64
 gpuCount int
+loadDuration time.Duration // Record how long it took the model to load

 sem *semaphore.Weighted
 }

@@ -76,15 +77,7 @@ func LoadModel(model string) (*GGML, error) {
 // The gpu list must be a single family.
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 var err error
-if opts.NumCtx > int(ggml.KV().ContextLength()) {
-slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
-}
-
-if opts.NumCtx < 4 {
-opts.NumCtx = 4
-}
-
-cpuRunner := ""
+var cpuRunner string
 var estimatedVRAM uint64
 var estimatedTotal uint64
 var systemMemory uint64

@@ -112,6 +105,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 // disable partial offloading when model is greater than total system memory as this
 // can lead to locking up the system
 opts.NumGPU = 0
+} else if gpus[0].Library != "metal" && layers == 0 {
+// Don't bother loading into the GPU if no layers can fit
+cpuRunner = serverForCpu()
+gpuCount = 0
 } else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
 opts.NumGPU = layers
 }

@@ -156,11 +153,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 "--embedding",
 }
-if envconfig.Debug {
-params = append(params, "--log-format", "json")
-} else {
 params = append(params, "--log-disable")
-}

 if opts.NumGPU >= 0 {
 params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))

@@ -220,7 +214,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 if dir == "" {
 // Shouldn't happen
 finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-slog.Error("sever list inconsistent", "error", finalErr)
+slog.Error("server list inconsistent", "error", finalErr)
 continue
 }

@@ -291,33 +285,28 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 sem: semaphore.NewWeighted(int64(numParallel)),
 totalLayers: ggml.KV().BlockCount() + 1,
 gpuCount: gpuCount,
+done: make(chan error, 1),
 }

-s.cmd.Env = os.Environ()
 s.cmd.Stdout = os.Stdout
 s.cmd.Stderr = s.status

-visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
-pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-// Update or add the path and visible devices variable with our adjusted version
-pathNeeded := true
-devicesNeeded := visibleDevicesEnv != ""
-for i := range s.cmd.Env {
-cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-if strings.EqualFold(cmp[0], pathEnv) {
-s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-pathNeeded = false
-} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-devicesNeeded = false
+if v := strings.Join(libraryPaths, string(filepath.ListSeparator)); v != "" {
+s.cmd.Env = append(s.cmd.Env, pathEnv+"="+v)
+}
+
+if k, v := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv(); k != "" {
+s.cmd.Env = append(s.cmd.Env, k+"="+v)
 }
+
+for _, ev := range os.Environ() {
+if strings.HasPrefix(ev, "CUDA_") ||
+strings.HasPrefix(ev, "ROCM_") ||
+strings.HasPrefix(ev, "HIP_") ||
+strings.HasPrefix(ev, "HSA_") ||
+strings.HasPrefix(ev, "GGML_") {
+s.cmd.Env = append(s.cmd.Env, ev)
 }
-if pathNeeded {
-s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-}
-if devicesNeeded {
-s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
 }

 slog.Info("starting llama server", "cmd", s.cmd.String())

@@ -339,6 +328,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 continue
 }

+// reap subprocess when it exits
+go func() {
+s.done <- s.cmd.Wait()
+}()
+
 return s, nil
 }

@@ -483,13 +477,11 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 start := time.Now()
-// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-ticker := time.NewTicker(50 * time.Millisecond)
-defer ticker.Stop()

 slog.Info("waiting for llama runner to start responding")
 var lastStatus ServerStatus = -1

 for {
 select {
 case <-ctx.Done():

@@ -501,7 +493,8 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 msg = s.status.LastErrMsg
 }
 return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-case <-ticker.C:
+default:
+}
 if time.Now().After(expiresAt) {
 // timeout
 msg := ""

@@ -517,25 +510,22 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 }
 return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 }
-c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
 defer cancel()
-status, err := s.getServerStatus(c)
-if err != nil && lastStatus != status {
-slog.Debug("server not yet available", "error", err)
-lastStatus = status
-continue
+status, _ := s.getServerStatus(ctx)
+if lastStatus != status && status != ServerStatusReady {
+// Only log on status changes
+slog.Info("waiting for server to become available", "status", status.ToString())
 }

 switch status {
-case ServerStatusLoadingModel:
-// TODO - this state never seems to happen with the current server.cpp code (bug?)
-// it doesn't respond to the health endpoint until after the model is loaded
-slog.Debug("loading model")
 case ServerStatusReady:
-slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
+s.loadDuration = time.Since(start)
+slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
 return nil
-}
+default:
+lastStatus = status
+time.Sleep(time.Millisecond * 250)
+continue
 }
 }
 }

@@ -580,6 +570,7 @@ type completion struct {
 Model string `json:"model"`
 Prompt string `json:"prompt"`
 Stop bool `json:"stop"`
+StoppedLimit bool `json:"stopped_limit"`

 Timings struct {
 PredictedN int `json:"predicted_n"`

@@ -598,6 +589,7 @@ type CompletionRequest struct {

 type CompletionResponse struct {
 Content string
+DoneReason string
 Done bool
 PromptEvalCount int
 PromptEvalDuration time.Duration

@@ -739,8 +731,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 }

 if c.Stop {
+doneReason := "stop"
+if c.StoppedLimit {
+doneReason = "length"
+}
+
 fn(CompletionResponse{
 Done: true,
+DoneReason: doneReason,
 PromptEvalCount: c.Timings.PromptN,
 PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 EvalCount: c.Timings.PredictedN,

@@ -935,8 +933,11 @@ func (s *llmServer) Close() error {
 if err := s.cmd.Process.Kill(); err != nil {
 return err
 }
-_ = s.cmd.Wait()
+// if ProcessState is already populated, Wait already completed, no need to wait again
+if s.cmd.ProcessState == nil {
+slog.Debug("waiting for llama server to exit")
+<-s.done
+}

 slog.Debug("llama server stopped")
 }
@@ -109,13 +109,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 Choices: []Choice{{
 Index: 0,
 Message: Message{Role: r.Message.Role, Content: r.Message.Content},
-FinishReason: func(done bool) *string {
-if done {
-reason := "stop"
-return &reason
-}
-return nil
-}(r.Done),
+FinishReason: &r.DoneReason,
 }},
 Usage: Usage{
 // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count

@@ -137,13 +131,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 {
 Index: 0,
 Delta: Message{Role: "assistant", Content: r.Message.Content},
-FinishReason: func(done bool) *string {
-if done {
-reason := "stop"
-return &reason
-}
-return nil
-}(r.Done),
+FinishReason: &r.DoneReason,
 },
 },
 }
@@ -565,7 +565,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 }

 if !envconfig.NoPrune {
-if err := deleteUnusedLayers(nil, unref, false); err != nil {
+if err := deleteUnusedLayers(nil, unref); err != nil {
 return err
 }
 }

@@ -613,7 +613,7 @@ func CopyModel(src, dst model.Name) error {
 return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
 fp, err := GetManifestPath()
 if err != nil {
 return err

@@ -660,14 +660,10 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
 slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err))
 continue
 }
-if !dryRun {
 if err := os.Remove(fp); err != nil {
 slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
 continue
 }
-} else {
-slog.Info(fmt.Sprintf("wanted to remove: %s", fp))
-}
 }

 return nil

@@ -689,14 +685,25 @@ func PruneLayers() error {
 for _, blob := range blobs {
 name := blob.Name()
 name = strings.ReplaceAll(name, "-", ":")
-if strings.HasPrefix(name, "sha256:") {
-deleteMap[name] = struct{}{}
+_, err := GetBlobsPath(name)
+if err != nil {
+if errors.Is(err, ErrInvalidDigestFormat) {
+// remove invalid blobs (e.g. partial downloads)
+if err := os.Remove(filepath.Join(p, blob.Name())); err != nil {
+slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err)
 }
 }
+
+continue
+}
+
+deleteMap[name] = struct{}{}
+}

 slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-err = deleteUnusedLayers(nil, deleteMap, false)
+err = deleteUnusedLayers(nil, deleteMap)
 if err != nil {
 return err
 }

@@ -752,7 +759,7 @@ func DeleteModel(name string) error {
 }
 deleteMap[manifest.Config.Digest] = struct{}{}

-err = deleteUnusedLayers(&mp, deleteMap, false)
+err = deleteUnusedLayers(&mp, deleteMap)
 if err != nil {
 return err
 }

@@ -912,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu

 if noprune == "" {
 fn(api.ProgressResponse{Status: "removing any unused layers"})
-err = deleteUnusedLayers(nil, deleteMap, false)
+err = deleteUnusedLayers(nil, deleteMap)
 if err != nil {
 return err
 }
@@ -154,9 +154,6 @@ func GetBlobsPath(digest string) (string, error) {
 // only accept actual sha256 digests
 pattern := "^sha256[:-][0-9a-fA-F]{64}$"
 re := regexp.MustCompile(pattern)
-if err != nil {
-return "", err
-}

 if digest != "" && !re.MatchString(digest) {
 return "", ErrInvalidDigestFormat
@@ -127,10 +127,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -159,6 +155,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Model: req.Model,
 Done: true,
+DoneReason: "load",
 })
 return
 }

@@ -230,6 +227,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Done: r.Done,
 Response: r.Content,
+DoneReason: r.DoneReason,
 Metrics: api.Metrics{
 PromptEvalCount: r.PromptEvalCount,
 PromptEvalDuration: r.PromptEvalDuration,

@@ -370,10 +368,6 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -560,7 +554,12 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 ctx, cancel := context.WithCancel(c.Request.Context())
 defer cancel()

-if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
+quantization := req.Quantization
+if req.Quantize != "" {
+quantization = req.Quantize
+}
+
+if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil {
 ch <- gin.H{"error": err.Error()}
 }
 }()

@@ -740,20 +739,28 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 }

 n := model.ParseNameFromFilepath(rel)
+if !n.IsValid() {
+slog.Warn("bad manifest filepath", "path", rel)
+return nil
+}
+
 m, err := ParseNamedManifest(n)
 if err != nil {
-return err
+slog.Warn("bad manifest", "name", n, "error", err)
+return nil
 }

 f, err := m.Config.Open()
 if err != nil {
-return err
+slog.Warn("bad manifest config filepath", "name", n, "error", err)
+return nil
 }
 defer f.Close()

 var c ConfigV2
 if err := json.NewDecoder(f).Decode(&c); err != nil {
-return err
+slog.Warn("bad manifest config", "name", n, "error", err)
+return nil
 }

 // tag should never be masked

@@ -1037,7 +1044,8 @@ func Serve(ln net.Listener) error {
 }

 ctx, done := context.WithCancel(context.Background())
-sched := InitScheduler(ctx)
+schedCtx, schedDone := context.WithCancel(ctx)
+sched := InitScheduler(schedCtx)
 s := &Server{addr: ln.Addr(), sched: sched}
 r := s.GenerateRoutes()

@@ -1052,23 +1060,31 @@ func Serve(ln net.Listener) error {
 go func() {
 <-signals
 srvr.Close()
-done()
+schedDone()
 sched.unloadAllRunners()
 gpu.Cleanup()
-os.Exit(0)
+done()
 }()

 if err := llm.Init(); err != nil {
 return fmt.Errorf("unable to initialize llm library %w", err)
 }

-s.sched.Run(ctx)
+s.sched.Run(schedCtx)

 // At startup we retrieve GPU information so we can get log messages before loading a model
 // This will log warnings to the log in case we have problems with detected GPUs
-_ = gpu.GetGPUInfo()
+gpus := gpu.GetGPUInfo()
+gpus.LogDetails()

-return srvr.Serve(ln)
+err = srvr.Serve(ln)
+// If server is closed from the signal handler, wait for the ctx to be done
+// otherwise error out quickly
+if !errors.Is(err, http.ErrServerClosed) {
+return err
+}
+<-ctx.Done()
+return err
 }

 func waitForStream(c *gin.Context, ch chan interface{}) {

@@ -1177,10 +1193,6 @@ func (s *Server) ChatHandler(c *gin.Context) {

 opts, err := modelOptions(model, req.Options)
 if err != nil {
-if errors.Is(err, api.ErrInvalidOpts) {
-c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-return
-}
 c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 return
 }

@@ -1225,6 +1237,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Model: req.Model,
 Done: true,
+DoneReason: "load",
 Message: api.Message{Role: "assistant"},
 }
 c.JSON(http.StatusOK, resp)

@@ -1262,6 +1275,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 CreatedAt: time.Now().UTC(),
 Message: api.Message{Role: "assistant", Content: r.Content},
 Done: r.Done,
+DoneReason: r.DoneReason,
 Metrics: api.Metrics{
 PromptEvalCount: r.PromptEvalCount,
 PromptEvalDuration: r.PromptEvalDuration,
@@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 // allocate a large enough kv cache for all parallel requests
+if opts.NumCtx < 4 {
+opts.NumCtx = 4
+}
+
 opts.NumCtx = opts.NumCtx * envconfig.NumParallel

 req := &LlmRequest{

@@ -265,11 +269,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {

 s.loadedMu.Lock()
 slog.Debug("got lock to unload", "model", runner.model)
+finished := runner.waitForVRAMRecovery()
 runner.unload()
 delete(s.loaded, runner.model)
 s.loadedMu.Unlock()
 slog.Debug("runner released", "model", runner.model)
 runner.refMu.Unlock()
+
+<-finished
 slog.Debug("sending an unloaded event", "model", runner.model)
 s.unloadedCh <- struct{}{}
 }

@@ -465,6 +472,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 return false
 }

+// Free memory reporting on GPUs can lag for a while even after the runner
+// exits, so we have to keep checking until we see the available memory recover,
+// otherwise subsequent model loads will get far less layers loaded or worse
+// case, may completely fall back to CPU mode.
+// This routine must be called before the runner unloads so it can establish
+// a before and after GPU memory allocation. The returned channel
+// will be notified when we're done waiting, or have timed out and should
+// proceed anyway
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+finished := make(chan interface{}, 1)
+
+// CPU or Metal don't need checking, so no waiting required
+if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+finished <- struct{}{}
+return finished
+}
+start := time.Now()
+
+// Establish a baseline before we unload
+gpusBefore := gpu.GetGPUInfo()
+var totalMemoryBefore, freeMemoryBefore uint64
+for _, gpu := range gpusBefore {
+totalMemoryBefore += gpu.TotalMemory
+freeMemoryBefore += gpu.FreeMemory
+}
+go func() {
+expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+ticker := time.NewTicker(250 * time.Millisecond)
+defer ticker.Stop()
+for {
+<-ticker.C
+if time.Now().After(expiresAt) {
+slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+finished <- struct{}{}
+}
+
+// Query GPUs, look for free to go back up
+gpusNow := gpu.GetGPUInfo()
+var totalMemoryNow, freeMemoryNow uint64
+for _, gpu := range gpusNow {
+totalMemoryNow += gpu.TotalMemory
+freeMemoryNow += gpu.FreeMemory
+}
+// If we're within ~80% of the estimated memory usage recovered, bail out
+if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+finished <- struct{}{}
+return
+}
+}
+}()
+return finished
+
+}
+
 type ByDuration []*runnerRef

 func (a ByDuration) Len() int { return len(a) }

@@ -505,9 +567,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 // - try subsets of GPUs instead of just falling back to 1 or all in a family

 // Now try all the GPUs
-if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-return gl
+if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
+return sgl
 }
 }
 return nil
@@ -1,6 +1,6 @@
 ###download

-Make preparation as guide on (Development) [https://github.com/ollama/ollama/blob/main/docs/development.md]
+Make preparation as guide on [Development] (https://github.com/ollama/ollama/blob/main/docs/development.md)
 run
 $env:CGO_ENABLED="1"
 go generate ./...
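For orientation, a minimal end-to-end Windows build implied by this guide would look roughly like the sketch below; the final go build step is not shown in this diff and is assumed from the upstream development guide, so treat it as a hint rather than part of the committed docs:

# PowerShell, run from the repository root once the toolchain from the development guide is installed
$env:CGO_ENABLED="1"
go generate ./...   # generates and builds the bundled llama.cpp runners
go build .          # produces the ollama binary in the repository root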