diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 40f9c41f..61ca3c43 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -437,6 +437,7 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
+ GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -460,15 +461,20 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- - uses: ncipollo/release-action@v1
- with:
- name: ${{ env.RELEASE_VERSION }}
- allowUpdates: true
- artifacts: 'dist/*'
- draft: true
- prerelease: true
- omitBodyDuringUpdate: true
- generateReleaseNotes: true
- omitDraftDuringUpdate: true
- omitPrereleaseDuringUpdate: true
- replacesArtifacts: true
+ - name: Create or update Release
+ run: |
+ echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
+ OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
+ if [ -n "$OLD_TAG" ]; then
+ echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
+ gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
+ else
+ echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
+ gh release create ${GITHUB_REF_NAME} \
+ --title ${{ env.RELEASE_VERSION }} \
+ --draft \
+ --generate-notes \
+ --prerelease
+ fi
+ echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+ gh release upload ${GITHUB_REF_NAME} dist/* --clobber
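For orientation, the lookup and re-point above boil down to the following (release names and tags here are made-up examples):

```shell
# list existing releases; the step matches on the release *name*, not the tag
gh release ls --json name,tagName
# => [{"name":"v0.1.45","tagName":"v0.1.45-rc2"}]

# pick the tag of the release whose name equals RELEASE_VERSION
gh release ls --json name,tagName | jq -r '.[] | select(.name == "v0.1.45") | .tagName'
# => v0.1.45-rc2

# the existing draft is re-pointed at the freshly pushed tag, then artifacts are
# uploaded with --clobber so re-runs overwrite earlier files of the same name
gh release edit v0.1.45-rc2 --tag v0.1.45-rc3
gh release upload v0.1.45-rc3 dist/* --clobber
```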
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index dbb6c2fd..29adf56f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -124,7 +124,7 @@ jobs:
strategy:
matrix:
rocm-version:
- - '6.0.2'
+ - '6.1.1'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
diff --git a/Dockerfile b/Dockerfile
index 72edef2a..98a3ddfd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
-ARG ROCM_VERSION=6.0.2
+ARG ROCM_VERSION=6.1.1
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
diff --git a/README.md b/README.md
index c4119560..ea302277 100644
--- a/README.md
+++ b/README.md
@@ -200,6 +200,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
+### Show model information
+
+```
+ollama show llama3
+```
+
### List models on your computer
```
diff --git a/api/types.go b/api/types.go
index d99cf3bc..95ed5d37 100644
--- a/api/types.go
+++ b/api/types.go
@@ -159,18 +159,49 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
- UseNUMA bool `json:"numa,omitempty"`
- NumCtx int `json:"num_ctx,omitempty"`
- NumBatch int `json:"num_batch,omitempty"`
- NumGPU int `json:"num_gpu,omitempty"`
- MainGPU int `json:"main_gpu,omitempty"`
- LowVRAM bool `json:"low_vram,omitempty"`
- F16KV bool `json:"f16_kv,omitempty"`
- LogitsAll bool `json:"logits_all,omitempty"`
- VocabOnly bool `json:"vocab_only,omitempty"`
- UseMMap bool `json:"use_mmap,omitempty"`
- UseMLock bool `json:"use_mlock,omitempty"`
- NumThread int `json:"num_thread,omitempty"`
+ UseNUMA bool `json:"numa,omitempty"`
+ NumCtx int `json:"num_ctx,omitempty"`
+ NumBatch int `json:"num_batch,omitempty"`
+ NumGPU int `json:"num_gpu,omitempty"`
+ MainGPU int `json:"main_gpu,omitempty"`
+ LowVRAM bool `json:"low_vram,omitempty"`
+ F16KV bool `json:"f16_kv,omitempty"`
+ LogitsAll bool `json:"logits_all,omitempty"`
+ VocabOnly bool `json:"vocab_only,omitempty"`
+ UseMMap TriState `json:"use_mmap,omitempty"`
+ UseMLock bool `json:"use_mlock,omitempty"`
+ NumThread int `json:"num_thread,omitempty"`
+}
+
+type TriState int
+
+const (
+ TriStateUndefined TriState = -1
+ TriStateFalse TriState = 0
+ TriStateTrue TriState = 1
+)
+
+func (b *TriState) UnmarshalJSON(data []byte) error {
+ var v bool
+ if err := json.Unmarshal(data, &v); err != nil {
+ return err
+ }
+ if v {
+ *b = TriStateTrue
+ } else {
+ *b = TriStateFalse
+ }
+ return nil
+}
+
+func (b *TriState) MarshalJSON() ([]byte, error) {
+ if *b == TriStateUndefined {
+ return nil, nil
+ }
+ var v bool
+ if *b == TriStateTrue {
+ v = true
+ }
+ return json.Marshal(v)
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
+ Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -231,13 +263,16 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
- License string `json:"license,omitempty"`
- Modelfile string `json:"modelfile,omitempty"`
- Parameters string `json:"parameters,omitempty"`
- Template string `json:"template,omitempty"`
- System string `json:"system,omitempty"`
- Details ModelDetails `json:"details,omitempty"`
- Messages []Message `json:"messages,omitempty"`
+ License string `json:"license,omitempty"`
+ Modelfile string `json:"modelfile,omitempty"`
+ Parameters string `json:"parameters,omitempty"`
+ Template string `json:"template,omitempty"`
+ System string `json:"system,omitempty"`
+ Details ModelDetails `json:"details,omitempty"`
+ Messages []Message `json:"messages,omitempty"`
+ ModelInfo map[string]any `json:"model_info,omitempty"`
+ ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+ ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -402,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue
}
+ if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+ val, ok := val.(bool)
+ if !ok {
+ return fmt.Errorf("option %q must be of type boolean", key)
+ }
+ if val {
+ field.SetInt(int64(TriStateTrue))
+ } else {
+ field.SetInt(int64(TriStateFalse))
+ }
+ continue
+ }
+
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
@@ -490,7 +538,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
- UseMMap: true,
+ UseMMap: TriStateUndefined,
UseNUMA: false,
},
}
@@ -560,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
+ if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+ boolVal, err := strconv.ParseBool(vals[0])
+ if err != nil {
+ return nil, fmt.Errorf("invalid bool value %s", vals)
+ }
+ if boolVal {
+ out[key] = TriStateTrue
+ } else {
+ out[key] = TriStateFalse
+ }
+ continue
+ }
+
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)
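A minimal sketch of how the new tri-state behaves from a caller's point of view, assuming the patched `github.com/ollama/ollama/api` package:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// DefaultOptions leaves use_mmap undefined so the runner can pick its own default.
	opts := api.DefaultOptions()
	fmt.Println(opts.UseMMap == api.TriStateUndefined) // true

	// An explicit false from a request is preserved instead of collapsing into "not set".
	if err := opts.FromMap(map[string]interface{}{"use_mmap": false}); err != nil {
		panic(err)
	}
	fmt.Println(opts.UseMMap == api.TriStateFalse) // true

	// Defined values marshal back to plain JSON booleans.
	v := api.TriStateTrue
	b, _ := json.Marshal(&v)
	fmt.Println(string(b)) // true
}
```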
diff --git a/api/types_test.go b/api/types_test.go
index 211385c7..8b6c60c6 100644
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
+ "fmt"
"math"
"testing"
"time"
@@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
+
+func TestUseMmapParsingFromJSON(t *testing.T) {
+ tests := []struct {
+ name string
+ req string
+ exp TriState
+ }{
+ {
+ name: "Undefined",
+ req: `{ }`,
+ exp: TriStateUndefined,
+ },
+ {
+ name: "True",
+ req: `{ "use_mmap": true }`,
+ exp: TriStateTrue,
+ },
+ {
+ name: "False",
+ req: `{ "use_mmap": false }`,
+ exp: TriStateFalse,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ var oMap map[string]interface{}
+ err := json.Unmarshal([]byte(test.req), &oMap)
+ require.NoError(t, err)
+ opts := DefaultOptions()
+ err = opts.FromMap(oMap)
+ require.NoError(t, err)
+ assert.Equal(t, test.exp, opts.UseMMap)
+ })
+ }
+}
+
+func TestUseMmapFormatParams(t *testing.T) {
+ tests := []struct {
+ name string
+ req map[string][]string
+ exp TriState
+ err error
+ }{
+ {
+ name: "True",
+ req: map[string][]string{
+ "use_mmap": []string{"true"},
+ },
+ exp: TriStateTrue,
+ err: nil,
+ },
+ {
+ name: "False",
+ req: map[string][]string{
+ "use_mmap": []string{"false"},
+ },
+ exp: TriStateFalse,
+ err: nil,
+ },
+ {
+ name: "Numeric True",
+ req: map[string][]string{
+ "use_mmap": []string{"1"},
+ },
+ exp: TriStateTrue,
+ err: nil,
+ },
+ {
+ name: "Numeric False",
+ req: map[string][]string{
+ "use_mmap": []string{"0"},
+ },
+ exp: TriStateFalse,
+ err: nil,
+ },
+ {
+ name: "invalid string",
+ req: map[string][]string{
+ "use_mmap": []string{"foo"},
+ },
+ exp: TriStateUndefined,
+ err: fmt.Errorf("invalid bool value [foo]"),
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ resp, err := FormatParams(test.req)
+ require.Equal(t, err, test.err)
+ respVal, ok := resp["use_mmap"]
+ if test.exp != TriStateUndefined {
+ assert.True(t, ok, "resp: %v", resp)
+ assert.Equal(t, test.exp, respVal)
+ }
+ })
+ }
+}
diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go
index df2597a8..a8f1f7cd 100644
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -5,6 +5,8 @@ import (
"log/slog"
"os"
"path/filepath"
+ "strconv"
+ "strings"
"github.com/ollama/ollama/envconfig"
)
@@ -24,6 +26,7 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
+ rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
slog.Info("ollama app started")
}
+
+func rotateLogs(logFile string) {
+ if _, err := os.Stat(logFile); os.IsNotExist(err) {
+ return
+ }
+ index := strings.LastIndex(logFile, ".")
+ pre := logFile[:index]
+ post := "." + logFile[index+1:]
+ for i := LogRotationCount; i > 0; i-- {
+ older := pre + "-" + strconv.Itoa(i) + post
+ newer := pre + "-" + strconv.Itoa(i-1) + post
+ if i == 1 {
+ newer = pre + post
+ }
+ if _, err := os.Stat(newer); err == nil {
+ if _, err := os.Stat(older); err == nil {
+ err := os.Remove(older)
+ if err != nil {
+ slog.Warn("Failed to remove older log", "older", older, "error", err)
+ continue
+ }
+ }
+ err := os.Rename(newer, older)
+ if err != nil {
+ slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
+ }
+ }
+ }
+}
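With `LogRotationCount = 5`, a rotation of an app log named `app.log` (illustrative name) shifts the files like this, oldest first:

```
app-5.log            removed if present (oldest copy)
app-4.log  ->  app-5.log
app-3.log  ->  app-4.log
app-2.log  ->  app-3.log
app-1.log  ->  app-2.log
app.log    ->  app-1.log
```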
diff --git a/app/lifecycle/logging_test.go b/app/lifecycle/logging_test.go
new file mode 100644
index 00000000..a2157ca2
--- /dev/null
+++ b/app/lifecycle/logging_test.go
@@ -0,0 +1,44 @@
+package lifecycle
+
+import (
+ "os"
+ "path/filepath"
+ "strconv"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestRotateLogs(t *testing.T) {
+ logDir := t.TempDir()
+ logFile := filepath.Join(logDir, "testlog.log")
+
+ // No log exists
+ rotateLogs(logFile)
+
+ require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
+ assert.FileExists(t, logFile)
+ // First rotation
+ rotateLogs(logFile)
+ assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+ assert.NoFileExists(t, logFile)
+
+ // Should be a no-op without a new log
+ rotateLogs(logFile)
+ assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+ assert.NoFileExists(t, logFile)
+
+ for i := 2; i <= LogRotationCount+1; i++ {
+ require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
+ assert.FileExists(t, logFile)
+ rotateLogs(logFile)
+ assert.NoFileExists(t, logFile)
+ for j := 1; j < i; j++ {
+ assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
+ }
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
+ }
+}
diff --git a/app/lifecycle/paths.go b/app/lifecycle/paths.go
index fe07bce1..4d9f4c5a 100644
--- a/app/lifecycle/paths.go
+++ b/app/lifecycle/paths.go
@@ -16,11 +16,12 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
- UpdateStageDir = "/tmp"
- AppLogFile = "/tmp/ollama_app.log"
- ServerLogFile = "/tmp/ollama.log"
- UpgradeLogFile = "/tmp/ollama_update.log"
- Installer = "OllamaSetup.exe"
+ UpdateStageDir = "/tmp"
+ AppLogFile = "/tmp/ollama_app.log"
+ ServerLogFile = "/tmp/ollama.log"
+ UpgradeLogFile = "/tmp/ollama_update.log"
+ Installer = "OllamaSetup.exe"
+ LogRotationCount = 5
)
func init() {
diff --git a/app/lifecycle/server.go b/app/lifecycle/server.go
index 0152ccd1..c178a1ab 100644
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
- // TODO - rotation
+ rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
diff --git a/app/ollama.iss b/app/ollama.iss
index 9dc61abb..e6502abd 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -88,10 +88,15 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+#if DirExists("..\dist\windows-amd64\cuda")
+ Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+ Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif
diff --git a/cmd/cmd.go b/cmd/cmd.go
index ae7c8da8..68197f72 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -579,10 +579,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
- if len(args) != 1 {
- return errors.New("missing model name")
- }
-
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +621,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
- } else if flagsSet == 0 {
- return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+ }
+
+ if flagsSet == 1 {
+ req := api.ShowRequest{Name: args[0]}
+ resp, err := client.Show(cmd.Context(), &req)
+ if err != nil {
+ return err
+ }
+
+ switch showType {
+ case "license":
+ fmt.Println(resp.License)
+ case "modelfile":
+ fmt.Println(resp.Modelfile)
+ case "parameters":
+ fmt.Println(resp.Parameters)
+ case "system":
+ fmt.Println(resp.System)
+ case "template":
+ fmt.Println(resp.Template)
+ }
+
+ return nil
}
req := api.ShowRequest{Name: args[0]}
@@ -635,22 +652,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
- switch showType {
- case "license":
- fmt.Println(resp.License)
- case "modelfile":
- fmt.Println(resp.Modelfile)
- case "parameters":
- fmt.Println(resp.Parameters)
- case "system":
- fmt.Println(resp.System)
- case "template":
- fmt.Println(resp.Template)
+ arch := resp.ModelInfo["general.architecture"].(string)
+
+ modelData := [][]string{
+ {"arch", arch},
+ {"parameters", resp.Details.ParameterSize},
+ {"quantization", resp.Details.QuantizationLevel},
+ {"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+ {"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
+ mainTableData := [][]string{
+ {"Model"},
+ {renderSubTable(modelData, false)},
+ }
+
+ if resp.ProjectorInfo != nil {
+ projectorData := [][]string{
+ {"arch", "clip"},
+ {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+ {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
+ {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+ {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+ }
+
+ mainTableData = append(mainTableData,
+ []string{"Projector"},
+ []string{renderSubTable(projectorData, false)},
+ )
+ }
+
+ if resp.Parameters != "" {
+ mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+ }
+
+ if resp.System != "" {
+ mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+ }
+
+ if resp.License != "" {
+ mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
+ }
+
+ table := tablewriter.NewWriter(os.Stdout)
+ table.SetAutoWrapText(false)
+ table.SetBorder(false)
+ table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+ for _, v := range mainTableData {
+ table.Append(v)
+ }
+
+ table.Render()
+
return nil
}
+func renderSubTable(data [][]string, file bool) string {
+ var buf bytes.Buffer
+ table := tablewriter.NewWriter(&buf)
+ table.SetAutoWrapText(!file)
+ table.SetBorder(false)
+ table.SetNoWhiteSpace(true)
+ table.SetTablePadding("\t")
+ table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+ for _, v := range data {
+ table.Append(v)
+ }
+
+ table.Render()
+
+ renderedTable := buf.String()
+ lines := strings.Split(renderedTable, "\n")
+ for i, line := range lines {
+ lines[i] = "\t" + line
+ }
+
+ return strings.Join(lines, "\n")
+}
+
+func twoLines(s string) [][]string {
+ lines := strings.Split(s, "\n")
+ res := [][]string{}
+
+ count := 0
+ for _, line := range lines {
+ line = strings.TrimSpace(line)
+ if line != "" {
+ count++
+ res = append(res, []string{line})
+ if count == 2 {
+ return res
+ }
+ }
+ }
+ return res
+}
+
+func formatParams(s string) string {
+ lines := strings.Split(s, "\n")
+ table := [][]string{}
+
+ for _, line := range lines {
+ table = append(table, strings.Fields(line))
+ }
+ return renderSubTable(table, false)
+}
+
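For reference, the default `ollama show <model>` output from this handler is roughly the nested layout below (values borrowed from the llama3 example added to docs/api.md in this same change; exact column spacing is up to tablewriter):

```
  Model
  	arch            	llama
  	parameters      	8.0B
  	quantization    	Q4_0
  	context length  	8192
  	embedding length	4096

  Parameters
  	num_keep	24
  	stop    	"<|start_header_id|>"
  	stop    	"<|end_header_id|>"
  	stop    	"<|eot_id|>"
```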
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
diff --git a/docs/api.md b/docs/api.md
index 35f1def3..107b5211 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -777,11 +777,12 @@ A single JSON object will be returned.
POST /api/show
```
-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, and system prompt.
### Parameters
- `name`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
- "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
- "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
+ "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
+ "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
+ "parent_model": "",
"format": "gguf",
"family": "llama",
- "families": ["llama", "clip"],
- "parameter_size": "7B",
+ "families": [
+ "llama"
+ ],
+ "parameter_size": "8.0B",
"quantization_level": "Q4_0"
+ },
+ "model_info": {
+ "general.architecture": "llama",
+ "general.file_type": 2,
+ "general.parameter_count": 8030261248,
+ "general.quantization_version": 2,
+ "llama.attention.head_count": 32,
+ "llama.attention.head_count_kv": 8,
+ "llama.attention.layer_norm_rms_epsilon": 0.00001,
+ "llama.block_count": 32,
+ "llama.context_length": 8192,
+ "llama.embedding_length": 4096,
+ "llama.feed_forward_length": 14336,
+ "llama.rope.dimension_count": 128,
+ "llama.rope.freq_base": 500000,
+ "llama.vocab_size": 128256,
+ "tokenizer.ggml.bos_token_id": 128000,
+ "tokenizer.ggml.eos_token_id": 128009,
+ "tokenizer.ggml.merges": [], // populates if `verbose=true`
+ "tokenizer.ggml.model": "gpt2",
+ "tokenizer.ggml.pre": "llama-bpe",
+ "tokenizer.ggml.token_type": [], // populates if `verbose=true`
+ "tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```
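For example, a request can opt into the full tokenizer arrays (shown trimmed to `[]` above) by setting `verbose`; note the response is large:

```shell
curl http://localhost:11434/api/show -d '{
  "name": "llama3",
  "verbose": true
}'
```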
diff --git a/docs/development.md b/docs/development.md
index 8c035a51..2a6886a4 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build
### Windows
-Note: The windows build for Ollama is still under development.
+Note: The Windows build for Ollama is still under development.
-Install required tools:
+First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
+- The `ThreadJob` PowerShell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
+
+Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"
diff --git a/docs/gpu.md b/docs/gpu.md
index a6b559f0..55c41c9d 100644
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -8,7 +8,7 @@ Check your compute compatibility to see if your card is supported:
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 9.0 | NVIDIA | `H100` |
-| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
+| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
diff --git a/docs/import.md b/docs/import.md
index 7abe39b2..f34f09ac 100644
--- a/docs/import.md
+++ b/docs/import.md
@@ -47,19 +47,13 @@ success
### Supported Quantizations
-
-Legacy Quantization
-
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
-
-
-
-K-means Quantization
+#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
@@ -70,11 +64,6 @@ success
- `Q5_K_M`
- `Q6_K`
-
-
-> [!NOTE]
-> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
-
## Template Detection
> [!NOTE]
diff --git a/docs/openai.md b/docs/openai.md
index 557b5846..59e7d640 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
#### Notes
-- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 60d63c7d..de29b344 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -22,7 +22,7 @@ docker logs
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and typing in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
diff --git a/docs/windows.md b/docs/windows.md
index 832b3d43..abc0eb30 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and typing in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- - *app.log* contains logs from the GUI application
- - *server.log* contains the server logs
+ - *app.log* contains the most recent logs from the GUI application
+ - *server.log* contains the most recent server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
diff --git a/envconfig/config.go b/envconfig/config.go
index 2c3b6f77..e86f72e6 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -31,6 +31,8 @@ var (
Debug bool
// Experimental flash attention
FlashAttention bool
+ // Set via OLLAMA_HOST in the environment
+ Host *OllamaHost
// Set via OLLAMA_KEEP_ALIVE in the environment
KeepAlive string
// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -39,6 +41,8 @@ var (
MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int
+ // Set via OLLAMA_MODELS in the environment
+ ModelsDir string
// Set via OLLAMA_MAX_VRAM in the environment
MaxVRAM uint64
// Set via OLLAMA_NOHISTORY in the environment
@@ -47,12 +51,25 @@ var (
NoPrune bool
// Set via OLLAMA_NUM_PARALLEL in the environment
NumParallel int
- // Set via OLLAMA_HOST in the environment
- Host *OllamaHost
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
+ // Set via OLLAMA_SCHED_SPREAD in the environment
+ SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
+ // Set via OLLAMA_INTEL_GPU in the environment
+ IntelGpu bool
+
+ // Set via CUDA_VISIBLE_DEVICES in the environment
+ CudaVisibleDevices string
+ // Set via HIP_VISIBLE_DEVICES in the environment
+ HipVisibleDevices string
+ // Set via ROCR_VISIBLE_DEVICES in the environment
+ RocrVisibleDevices string
+ // Set via GPU_DEVICE_ORDINAL in the environment
+ GpuDeviceOrdinal string
+ // Set via HSA_OVERRIDE_GFX_VERSION in the environment
+ HsaOverrideGfxVersion string
)
type EnvVar struct {
@@ -62,7 +79,7 @@ type EnvVar struct {
}
func AsMap() map[string]EnvVar {
- return map[string]EnvVar{
+ ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
@@ -71,14 +88,24 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models (default 1)"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
- "OLLAMA_MODELS": {"OLLAMA_MODELS", "", "The path to the models directory"},
+ "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+ "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
}
+ if runtime.GOOS != "darwin" {
+ ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
+ ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
+ ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
+ ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
+ ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
+ ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+ }
+ return ret
}
func Values() map[string]string {
@@ -189,6 +216,15 @@ func LoadConfig() {
NoHistory = true
}
+ if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+ s, err := strconv.ParseBool(spread)
+ if err == nil {
+ SchedSpread = s
+ } else {
+ SchedSpread = true
+ }
+ }
+
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true
}
@@ -233,10 +269,36 @@ func LoadConfig() {
KeepAlive = clean("OLLAMA_KEEP_ALIVE")
var err error
+ ModelsDir, err = getModelsDir()
+ if err != nil {
+ slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err)
+ }
+
Host, err = getOllamaHost()
if err != nil {
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
}
+
+ if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
+ IntelGpu = set
+ }
+
+ CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
+ HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
+ RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
+ GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
+ HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+}
+
+func getModelsDir() (string, error) {
+ if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
+ return models, nil
+ }
+ home, err := os.UserHomeDir()
+ if err != nil {
+ return "", err
+ }
+ return filepath.Join(home, ".ollama", "models"), nil
}
func getOllamaHost() (*OllamaHost, error) {
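A quick illustration of the new knobs (variable names come from `AsMap` above; the values are only examples):

```shell
# spread a model across all detected GPUs and enable the experimental Intel GPU path
OLLAMA_SCHED_SPREAD=1 OLLAMA_INTEL_GPU=1 ollama serve

# keep models somewhere other than ~/.ollama/models
OLLAMA_MODELS=/data/ollama/models ollama serve
```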
diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 4258e449..1fa4c625 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -13,6 +13,7 @@ import (
"strconv"
"strings"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -25,7 +26,16 @@ const (
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
- GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
+
+ // Direct Rendering Manager sysfs location
+ DRMDeviceDirGlob = "/sys/class/drm/card*/device"
+ DRMTotalMemoryFile = "mem_info_vram_total"
+ DRMUsedMemoryFile = "mem_info_vram_used"
+
+ // In hex; properties file is in decimal
+ DRMUniqueIDFile = "unique_id"
+ DRMVendorFile = "vendor"
+ DRMDeviceFile = "device"
)
var (
@@ -35,8 +45,8 @@ var (
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
- resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+ resp := []RocmGPUInfo{}
if !AMDDetected() {
return resp
}
@@ -50,9 +60,9 @@ func AMDGetGPUInfo() []GpuInfo {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string
- hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
- rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
- gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
+ hipVD := envconfig.HipVisibleDevices // zero based index only
+ rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
+ gpuDO := envconfig.GpuDeviceOrdinal // zero based index
switch {
// TODO is this priorty order right?
case hipVD != "":
@@ -65,7 +75,7 @@ func AMDGetGPUInfo() []GpuInfo {
visibleDevices = strings.Split(gpuDO, ",")
}
- gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+ gfxOverride := envconfig.HsaOverrideGfxVersion
var supported []string
libDir := ""
@@ -90,7 +100,7 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp)
isCPU := false
var major, minor, patch uint64
- var vendor, device uint64
+ var vendor, device, uniqueID uint64
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -121,30 +131,43 @@ func AMDGetGPUInfo() []GpuInfo {
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
- slog.Debug("malformed vendor_id", "vendor_id", line)
+ slog.Debug("malformed", "vendor_id", line)
continue
}
- vendor, err = strconv.ParseUint(ver[1], 10, 32)
+ vendor, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
- slog.Debug("malformed vendor_id" + line)
+ slog.Debug("malformed", "vendor_id", line, "error", err)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
- slog.Debug("malformed device_id", "device_id", line)
+ slog.Debug("malformed", "device_id", line)
continue
}
- device, err = strconv.ParseUint(ver[1], 10, 32)
+ device, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
- slog.Debug("malformed device_id" + line)
+ slog.Debug("malformed", "device_id", line, "error", err)
+ }
+ } else if strings.HasPrefix(line, "unique_id") {
+ ver := strings.Fields(line)
+ if len(ver) != 2 {
+ slog.Debug("malformed", "unique_id", line)
+ continue
+ }
+ uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
+ if err != nil {
+ slog.Debug("malformed", "unique_id", line, "error", err)
}
}
-
// TODO - any other properties we want to extract and record?
// vendor_id + device_id -> pci lookup for "Name"
// Other metrics that may help us understand relative performance between multiple GPUs
}
+ // Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
+ // into consideration, so we instead map the device over to the DRM driver sysfs nodes which
+ // do reliably report VRAM usage.
+
if isCPU {
cpuCount++
continue
@@ -156,7 +179,7 @@ func AMDGetGPUInfo() []GpuInfo {
// Shouldn't happen, but just in case...
if gpuID < 0 {
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
- return []GpuInfo{}
+ return nil
}
//if int(major) < RocmComputeMin {
@@ -167,65 +190,68 @@ func AMDGetGPUInfo() []GpuInfo {
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
- propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
- propFiles, err := filepath.Glob(propGlob)
- if err != nil {
- slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
+ var usedFile string
+ mapping := []struct {
+ id uint64
+ filename string
+ }{
+ {vendor, DRMVendorFile},
+ {device, DRMDeviceFile},
+ {uniqueID, DRMUniqueIDFile}, // Not all devices will report this
}
- // 1 or more memory banks - sum the values of all of them
- for _, propFile := range propFiles {
- fp, err := os.Open(propFile)
- if err != nil {
- slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
- continue
- }
- defer fp.Close()
- scanner := bufio.NewScanner(fp)
- for scanner.Scan() {
- line := strings.TrimSpace(scanner.Text())
- if strings.HasPrefix(line, "size_in_bytes") {
- ver := strings.Fields(line)
- if len(ver) != 2 {
- slog.Warn("malformed " + line)
- continue
- }
- bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
- if err != nil {
- slog.Warn("malformed int " + line)
- continue
- }
- totalMemory += bankSizeInBytes
+ slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
+ // Map over to DRM location to find the total/free memory
+ drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
+ for _, devDir := range drmMatches {
+ matched := true
+ for _, m := range mapping {
+ if m.id == 0 {
+ // Null ID means it didn't populate, so we can't use it to match
+ continue
+ }
+ filename := filepath.Join(devDir, m.filename)
+ buf, err := os.ReadFile(filename)
+ if err != nil {
+ slog.Debug("failed to read sysfs node", "file", filename, "error", err)
+ matched = false
+ break
+ }
+ // values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
+ cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
+ if err != nil {
+ slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
+ matched = false
+ break
+ }
+ if cmp != m.id {
+ matched = false
+ break
}
}
- }
- if totalMemory == 0 {
- slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
- continue
- }
- usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
- usedFiles, err := filepath.Glob(usedGlob)
- if err != nil {
- slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
- continue
- }
- for _, usedFile := range usedFiles {
- fp, err := os.Open(usedFile)
- if err != nil {
- slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
+ if !matched {
continue
}
- defer fp.Close()
- data, err := io.ReadAll(fp)
+
+ // Found the matching DRM directory
+ slog.Debug("matched", "amdgpu", match, "drm", devDir)
+ totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
+ buf, err := os.ReadFile(totalFile)
if err != nil {
- slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
- continue
+ slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
+ break
}
- used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+ totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
if err != nil {
- slog.Warn("malformed used memory", "data", string(data), "error", err)
- continue
+ slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
+ break
}
- usedMemory += used
+
+ usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
+ usedMemory, err = getFreeMemory(usedFile)
+ if err != nil {
+ slog.Debug("failed to update used memory", "error", err)
+ }
+ break
}
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@@ -241,18 +267,21 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
- gpuInfo := GpuInfo{
- Library: "rocm",
- memInfo: memInfo{
- TotalMemory: totalMemory,
- FreeMemory: (totalMemory - usedMemory),
+ gpuInfo := RocmGPUInfo{
+ GpuInfo: GpuInfo{
+ Library: "rocm",
+ memInfo: memInfo{
+ TotalMemory: totalMemory,
+ FreeMemory: (totalMemory - usedMemory),
+ },
+ ID: strconv.Itoa(gpuID),
+ Name: name,
+ Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
+ MinimumMemory: rocmMinimumMemory,
+ DriverMajor: driverMajor,
+ DriverMinor: driverMinor,
},
- ID: fmt.Sprintf("%d", gpuID),
- Name: name,
- Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
- MinimumMemory: rocmMinimumMemory,
- DriverMajor: driverMajor,
- DriverMinor: driverMinor,
+ usedFilepath: usedFile,
}
// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -276,7 +305,7 @@ func AMDGetGPUInfo() []GpuInfo {
libDir, err = AMDValidateLibDir()
if err != nil {
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
- return []GpuInfo{}
+ return nil
}
}
gpuInfo.DependencyPath = libDir
@@ -287,7 +316,7 @@ func AMDGetGPUInfo() []GpuInfo {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
- return []GpuInfo{}
+ return nil
}
slog.Debug("rocm supported GPUs", "types", supported)
}
@@ -304,6 +333,11 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}
+ // Check for env var workarounds
+ if name == "1002:687f" { // Vega RX 56
+ gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
+ }
+
// The GPU has passed all the verification steps and is supported
resp = append(resp, gpuInfo)
}
@@ -378,3 +412,31 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
}
return driverMajor, driverMinor, nil
}
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+ if len(gpus) == 0 {
+ return nil
+ }
+ for i := range gpus {
+ usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
+ if err != nil {
+ return err
+ }
+ slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
+ gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
+ }
+ return nil
+}
+
+func getFreeMemory(usedFile string) (uint64, error) {
+ buf, err := os.ReadFile(usedFile)
+ if err != nil {
+ return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
+ }
+ usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+ if err != nil {
+ slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
+ return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
+ }
+ return usedMemory, nil
+}
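The DRM sysfs nodes involved can be inspected by hand; these are the same files the new constants point at (the card index here is illustrative):

```shell
cat /sys/class/drm/card0/device/vendor               # hex, e.g. 0x1002 for AMD
cat /sys/class/drm/card0/device/device               # hex device id
cat /sys/class/drm/card0/device/unique_id            # not present on every GPU
cat /sys/class/drm/card0/device/mem_info_vram_total  # bytes, decimal
cat /sys/class/drm/card0/device/mem_info_vram_used   # bytes, decimal
```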
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index 290f8677..ebf17ef1 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -7,8 +7,9 @@ import (
"os"
"path/filepath"
"slices"
- // "strings"
+ "strconv"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -24,8 +25,8 @@ var (
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
)
-func AMDGetGPUInfo() []GpuInfo {
- resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+ resp := []RocmGPUInfo{}
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
@@ -52,7 +53,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
var supported []string
- gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+ gfxOverride := envconfig.HsaOverrideGfxVersion
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
@@ -117,21 +118,24 @@ func AMDGetGPUInfo() []GpuInfo {
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
- gpuInfo := GpuInfo{
- Library: "rocm",
- memInfo: memInfo{
- TotalMemory: totalMemory,
- FreeMemory: freeMemory,
- },
- ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
- DependencyPath: libDir,
- MinimumMemory: rocmMinimumMemory,
- Name: name,
- Compute: gfx,
+ gpuInfo := RocmGPUInfo{
+ GpuInfo: GpuInfo{
+ Library: "rocm",
+ memInfo: memInfo{
+ TotalMemory: totalMemory,
+ FreeMemory: freeMemory,
+ },
+ ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
+ DependencyPath: libDir,
+ MinimumMemory: rocmMinimumMemory,
+ Name: name,
+ Compute: gfx,
- // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
- // DriverMajor: driverMajor,
- // DriverMinor: driverMinor,
+ // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+ // DriverMajor: driverMajor,
+ // DriverMinor: driverMinor,
+ },
+ index: i,
}
resp = append(resp, gpuInfo)
@@ -159,3 +163,30 @@ func AMDValidateLibDir() (string, error) {
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+ if len(gpus) == 0 {
+ return nil
+ }
+ hl, err := NewHipLib()
+ if err != nil {
+ slog.Debug(err.Error())
+ return nil
+ }
+ defer hl.Release()
+
+ for i := range gpus {
+ err := hl.HipSetDevice(gpus[i].index)
+ if err != nil {
+ return err
+ }
+ freeMemory, _, err := hl.HipMemGetInfo()
+ if err != nil {
+ slog.Warn("get mem info", "id", i, "error", err)
+ continue
+ }
+ slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
+ gpus[i].FreeMemory = freeMemory
+ }
+ return nil
+}
diff --git a/gpu/assets.go b/gpu/assets.go
index f2adcf3e..073d2e81 100644
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
- if err == nil {
- pid, err := strconv.Atoi(string(raw))
- if err == nil {
- if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
- // Another running ollama, ignore this tmpdir
- continue
- }
- }
- } else {
- slog.Debug("failed to open ollama.pid", "path", d, "error", err)
- }
- err = os.RemoveAll(d)
if err != nil {
- slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
+ slog.Warn("failed to read ollama.pid", "path", d, "error", err)
+ // No pid, ignore this tmpdir
+ continue
+ }
+
+ pid, err := strconv.Atoi(string(raw))
+ if err != nil {
+ slog.Warn("failed to parse pid", "path", d, "error", err)
+ continue
+ }
+
+ proc, err := os.FindProcess(pid)
+ if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+ slog.Warn("found running ollama", "pid", pid, "path", d)
+ // Another running ollama, ignore this tmpdir
+ continue
+ }
+
+ if err := os.RemoveAll(d); err != nil {
+ slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}
diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go
index 920d0f5b..63e88f25 100644
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -1,21 +1,16 @@
package gpu
import (
- "log/slog"
-
"golang.org/x/sys/cpu"
)
-func GetCPUVariant() string {
+func GetCPUCapability() CPUCapability {
if cpu.X86.HasAVX2 {
- slog.Debug("CPU has AVX2")
- return "avx2"
+ return CPUCapabilityAVX2
}
if cpu.X86.HasAVX {
- slog.Debug("CPU has AVX")
- return "avx"
+ return CPUCapabilityAVX
}
- slog.Debug("CPU does not have vector extensions")
// else LCD
- return ""
+ return CPUCapabilityNone
}
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 73ef1358..583bb79c 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -24,19 +24,37 @@ import (
"github.com/ollama/ollama/format"
)
-type handles struct {
+type cudaHandles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
+ nvml *C.nvml_handle_t
+}
+
+type oneapiHandles struct {
oneapi *C.oneapi_handle_t
+ deviceCount int
}
const (
cudaMinimumMemory = 457 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte
+ // TODO OneAPI minimum memory
)
-var gpuMutex sync.Mutex
+var (
+ gpuMutex sync.Mutex
+ bootstrapped bool
+ cpuCapability CPUCapability
+ cpus []CPUInfo
+ cudaGPUs []CudaGPUInfo
+ nvcudaLibPath string
+ cudartLibPath string
+ oneapiLibPath string
+ nvmlLibPath string
+ rocmGPUs []RocmGPUInfo
+ oneapiGPUs []OneapiGPUInfo
+)
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
@@ -46,113 +64,113 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
-var CudartLinuxGlobs = []string{
- "/usr/local/cuda/lib64/libcudart.so*",
- "/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
- "/usr/lib/x86_64-linux-gnu/libcudart.so*",
- "/usr/lib/wsl/lib/libcudart.so*",
- "/usr/lib/wsl/drivers/*/libcudart.so*",
- "/opt/cuda/lib64/libcudart.so*",
- "/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
- "/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
- "/usr/lib/aarch64-linux-gnu/libcudart.so*",
- "/usr/local/cuda/lib*/libcudart.so*",
- "/usr/lib*/libcudart.so*",
- "/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
- "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
- "/usr/local/cuda*/targets/*/lib/libcuda.so*",
- "/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
- "/usr/lib/*-linux-gnu/libcuda.so*",
- "/usr/lib/wsl/lib/libcuda.so*",
- "/usr/lib/wsl/drivers/*/libcuda.so*",
- "/opt/cuda/lib*/libcuda.so*",
- "/usr/local/cuda/lib*/libcuda.so*",
- "/usr/lib*/libcuda.so*",
- "/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
- "c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
- "c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
- "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
- "/usr/lib*/libze_intel_gpu.so*",
-}
-
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *cudaHandles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
- gpuHandles := &handles{}
- var cudartMgmtName string
- var cudartMgmtPatterns []string
- var nvcudaMgmtName string
- var nvcudaMgmtPatterns []string
-
- tmpDir, _ := PayloadsDir()
- switch runtime.GOOS {
- case "windows":
- cudartMgmtName = "cudart64_*.dll"
- localAppData := os.Getenv("LOCALAPPDATA")
- cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
- cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
- // Aligned with driver, we can't carry as payloads
- nvcudaMgmtName = "nvcuda.dll"
- nvcudaMgmtPatterns = NvcudaWindowsGlobs
- case "linux":
- cudartMgmtName = "libcudart.so*"
- if tmpDir != "" {
- // TODO - add "payloads" for subprocess
- cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
- }
- cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
- // Aligned with driver, we can't carry as payloads
- nvcudaMgmtName = "libcuda.so*"
- nvcudaMgmtPatterns = NvcudaLinuxGlobs
- default:
- return gpuHandles
+ cHandles := &cudaHandles{}
+ // Short Circuit if we already know which library to use
+ if nvmlLibPath != "" {
+ cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+ return cHandles
+ }
+ if nvcudaLibPath != "" {
+ cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+ return cHandles
+ }
+ if cudartLibPath != "" {
+ cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+ return cHandles
}
- slog.Debug("Detecting GPUs")
- nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+ slog.Debug("searching for GPU discovery libraries for NVIDIA")
+ var cudartMgmtPatterns []string
+
+ // Aligned with driver, we can't carry as payloads
+ nvcudaMgmtPatterns := NvcudaGlobs
+
+ if runtime.GOOS == "windows" {
+ localAppData := os.Getenv("LOCALAPPDATA")
+ cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
+ }
+ tmpDir, _ := PayloadsDir()
+ if tmpDir != "" {
+ // TODO - add "payloads" for subprocess
+ cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+ }
+ cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
+
+ if len(NvmlGlobs) > 0 {
+ nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
+ if len(nvmlLibPaths) > 0 {
+ nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+ if nvml != nil {
+ slog.Debug("nvidia-ml loaded", "library", libPath)
+ cHandles.nvml = nvml
+ nvmlLibPath = libPath
+ }
+ }
+ }
+
+ nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
- gpuHandles.nvcuda = nvcuda
- gpuHandles.deviceCount = deviceCount
- return gpuHandles
+ cHandles.nvcuda = nvcuda
+ cHandles.deviceCount = deviceCount
+ nvcudaLibPath = libPath
+ return cHandles
}
}
- cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+ cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
- gpuHandles.cudart = cudart
- gpuHandles.deviceCount = deviceCount
- return gpuHandles
+ cHandles.cudart = cudart
+ cHandles.deviceCount = deviceCount
+ cudartLibPath = libPath
+ return cHandles
}
}
- return gpuHandles
+ return cHandles
+}
+
+// Note: gpuMutex must already be held
+func initOneAPIHandles() *oneapiHandles {
+ oHandles := &oneapiHandles{}
+
+ // Short Circuit if we already know which library to use
+ if oneapiLibPath != "" {
+ oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
+ return oHandles
+ }
+
+ oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
+ if len(oneapiLibPaths) > 0 {
+ oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
+ }
+
+ return oHandles
+}
+
+func GetCPUInfo() GpuInfoList {
+ gpuMutex.Lock()
+ if !bootstrapped {
+ gpuMutex.Unlock()
+ GetGPUInfo()
+ } else {
+ gpuMutex.Unlock()
+ }
+ return GpuInfoList{cpus[0].GpuInfo}
}
func GetGPUInfo() GpuInfoList {
@@ -160,112 +178,255 @@ func GetGPUInfo() GpuInfoList {
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
-
- gpuHandles := initGPUHandles()
+ needRefresh := true
+ var cHandles *cudaHandles
+ var oHandles *oneapiHandles
defer func() {
- if gpuHandles.cudart != nil {
- C.cudart_release(*gpuHandles.cudart)
+ if cHandles != nil {
+ if cHandles.cudart != nil {
+ C.cudart_release(*cHandles.cudart)
+ }
+ if cHandles.nvcuda != nil {
+ C.nvcuda_release(*cHandles.nvcuda)
+ }
+ if cHandles.nvml != nil {
+ C.nvml_release(*cHandles.nvml)
+ }
}
- if gpuHandles.nvcuda != nil {
- C.nvcuda_release(*gpuHandles.nvcuda)
+ if oHandles != nil {
+ if oHandles.oneapi != nil {
+ // TODO - is this needed?
+ C.oneapi_release(*oHandles.oneapi)
+ }
}
}()
- // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
- cpuVariant := GetCPUVariant()
- if cpuVariant == "" && runtime.GOARCH == "amd64" {
- slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
- }
+ if !bootstrapped {
+ slog.Debug("Detecting GPUs")
+ needRefresh = false
+ cpuCapability = GetCPUCapability()
+ var memInfo C.mem_info_t
- // On windows we bundle the nvidia library one level above the runner dir
- depPath := ""
- if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
- depPath = filepath.Dir(envconfig.RunnersDir)
- }
-
- var memInfo C.mem_info_t
- resp := []GpuInfo{}
-
- // NVIDIA first
- for i := range gpuHandles.deviceCount {
- // TODO once we support CPU compilation variants of GPU libraries refine this...
- if cpuVariant == "" && runtime.GOARCH == "amd64" {
- continue
+ mem, err := GetCPUMem()
+ if err != nil {
+ slog.Warn("error looking up system memory", "error", err)
}
- if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
- gpuInfo := GpuInfo{
- Library: "cuda",
+ cpus = []CPUInfo{CPUInfo{
+ GpuInfo: GpuInfo{
+ memInfo: mem,
+ Library: "cpu",
+ Variant: cpuCapability,
+ ID: "0",
+ },
+ }}
+
+ // Fallback to CPU mode if we're lacking required vector extensions on x86
+ if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+ slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
+ bootstrapped = true
+ // No need to do any GPU discovery, since we can't run on them
+ return GpuInfoList{cpus[0].GpuInfo}
+ }
+
+ // On windows we bundle the nvidia library one level above the runner dir
+ depPath := ""
+ if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+ depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
+ }
+
+ // Load ALL libraries
+ cHandles = initCudaHandles()
+
+ // NVIDIA
+ for i := range cHandles.deviceCount {
+ if cHandles.cudart != nil || cHandles.nvcuda != nil {
+ gpuInfo := CudaGPUInfo{
+ GpuInfo: GpuInfo{
+ Library: "cuda",
+ },
+ index: i,
+ }
+ var driverMajor int
+ var driverMinor int
+ if cHandles.cudart != nil {
+ C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
+ } else {
+ C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
+ driverMajor = int(cHandles.nvcuda.driver_major)
+ driverMinor = int(cHandles.nvcuda.driver_minor)
+ }
+ if memInfo.err != nil {
+ slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+ C.free(unsafe.Pointer(memInfo.err))
+ continue
+ }
+ if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+ slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+ continue
+ }
+ gpuInfo.TotalMemory = uint64(memInfo.total)
+ gpuInfo.FreeMemory = uint64(memInfo.free)
+ gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+ gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+ gpuInfo.MinimumMemory = cudaMinimumMemory
+ gpuInfo.DependencyPath = depPath
+ gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+ gpuInfo.DriverMajor = driverMajor
+ gpuInfo.DriverMinor = driverMinor
+
+ // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+ cudaGPUs = append(cudaGPUs, gpuInfo)
}
- var driverMajor int
- var driverMinor int
- if gpuHandles.cudart != nil {
- C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+ }
+
+ // Intel
+ if envconfig.IntelGpu {
+ oHandles = initOneAPIHandles()
+ // On windows we bundle the oneapi library one level above the runner dir
+ depPath = ""
+ if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+ depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
+ }
+
+ for d := range oHandles.oneapi.num_drivers {
+ if oHandles.oneapi == nil {
+ // shouldn't happen
+ slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
+ continue
+ }
+ devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
+ for i := range devCount {
+ gpuInfo := OneapiGPUInfo{
+ GpuInfo: GpuInfo{
+ Library: "oneapi",
+ },
+ driverIndex: int(d),
+ gpuIndex: int(i),
+ }
+ // TODO - split bootstrapping from updating free memory
+ C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
+ // TODO - convert this to MinimumMemory based on testing...
+ var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+ memInfo.free = C.uint64_t(totalFreeMem)
+ gpuInfo.TotalMemory = uint64(memInfo.total)
+ gpuInfo.FreeMemory = uint64(memInfo.free)
+ gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+ gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+ gpuInfo.DependencyPath = depPath
+ oneapiGPUs = append(oneapiGPUs, gpuInfo)
+ }
+ }
+ }
+
+ rocmGPUs = AMDGetGPUInfo()
+ bootstrapped = true
+ }
+
+ // For detected GPUs, load library if not loaded
+
+ // Refresh free memory usage
+ if needRefresh {
+ mem, err := GetCPUMem()
+ if err != nil {
+ slog.Warn("error looking up system memory", "error", err)
+ } else {
+ slog.Debug("updating system memory data",
+ slog.Group(
+ "before",
+ "total", format.HumanBytes2(cpus[0].TotalMemory),
+ "free", format.HumanBytes2(cpus[0].FreeMemory),
+ ),
+ slog.Group(
+ "now",
+ "total", format.HumanBytes2(mem.TotalMemory),
+ "free", format.HumanBytes2(mem.FreeMemory),
+ ),
+ )
+ cpus[0].FreeMemory = mem.FreeMemory
+ }
+
+ var memInfo C.mem_info_t
+ if cHandles == nil && len(cudaGPUs) > 0 {
+ cHandles = initCudaHandles()
+ }
+ for i, gpu := range cudaGPUs {
+ if cHandles.nvml != nil {
+ C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+ } else if cHandles.cudart != nil {
+ C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+ } else if cHandles.nvcuda != nil {
+ C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+ memInfo.used = memInfo.total - memInfo.free
} else {
- C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
- driverMajor = int(gpuHandles.nvcuda.driver_major)
- driverMinor = int(gpuHandles.nvcuda.driver_minor)
+ // shouldn't happen
+ slog.Warn("no valid cuda library loaded to refresh vram usage")
+ break
}
if memInfo.err != nil {
- slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+ slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
- if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
- slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+ if memInfo.free == 0 {
+ slog.Warn("error looking up nvidia GPU memory")
continue
}
- gpuInfo.TotalMemory = uint64(memInfo.total)
- gpuInfo.FreeMemory = uint64(memInfo.free)
- gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
- gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
- gpuInfo.MinimumMemory = cudaMinimumMemory
- gpuInfo.DependencyPath = depPath
- gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
- gpuInfo.DriverMajor = driverMajor
- gpuInfo.DriverMinor = driverMinor
+ slog.Debug("updating cuda memory data",
+ "gpu", gpu.ID,
+ "name", gpu.Name,
+ slog.Group(
+ "before",
+ "total", format.HumanBytes2(gpu.TotalMemory),
+ "free", format.HumanBytes2(gpu.FreeMemory),
+ ),
+ slog.Group(
+ "now",
+ "total", format.HumanBytes2(uint64(memInfo.total)),
+ "free", format.HumanBytes2(uint64(memInfo.free)),
+ "used", format.HumanBytes2(uint64(memInfo.used)),
+ ),
+ )
+ cudaGPUs[i].FreeMemory = uint64(memInfo.free)
+ }
- // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
- resp = append(resp, gpuInfo)
+ if oHandles == nil && len(oneapiGPUs) > 0 {
+ oHandles = initOneAPIHandles()
+ }
+ for i, gpu := range oneapiGPUs {
+ if oHandles.oneapi == nil {
+ // shouldn't happen
+ slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
+ continue
+ }
+ C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
+ // TODO - convert this to MinimumMemory based on testing...
+ var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+ memInfo.free = C.uint64_t(totalFreeMem)
+ oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
+ }
+
+ err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+ if err != nil {
+ slog.Debug("problem refreshing ROCm free memory", "error", err)
}
}
- // Then AMD
- resp = append(resp, AMDGetGPUInfo()...)
-
+ resp := []GpuInfo{}
+ for _, gpu := range cudaGPUs {
+ resp = append(resp, gpu.GpuInfo)
+ }
+ for _, gpu := range rocmGPUs {
+ resp = append(resp, gpu.GpuInfo)
+ }
+ for _, gpu := range oneapiGPUs {
+ resp = append(resp, gpu.GpuInfo)
+ }
if len(resp) == 0 {
- C.cpu_check_ram(&memInfo)
- if memInfo.err != nil {
- slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
- C.free(unsafe.Pointer(memInfo.err))
- return resp
- }
- gpuInfo := GpuInfo{
- Library: "cpu",
- Variant: cpuVariant,
- }
- gpuInfo.TotalMemory = uint64(memInfo.total)
- gpuInfo.FreeMemory = uint64(memInfo.free)
- gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-
- resp = append(resp, gpuInfo)
+ resp = append(resp, cpus[0].GpuInfo)
}
-
return resp
}
-func GetCPUMem() (memInfo, error) {
- var ret memInfo
- var info C.mem_info_t
- C.cpu_check_ram(&info)
- if info.err != nil {
- defer C.free(unsafe.Pointer(info.err))
- return ret, fmt.Errorf(C.GoString(info.err))
- }
- ret.FreeMemory = uint64(info.free)
- ret.TotalMemory = uint64(info.total)
- return ret, nil
-}
-
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
@@ -296,6 +457,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Nvidia PhysX known to return bogus results
if strings.Contains(pattern, "PhysX") {
slog.Debug("skipping PhysX cuda library path", "path", pattern)
+ continue
}
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
@@ -361,8 +523,26 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
return 0, nil, ""
}
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+ var resp C.nvml_init_resp_t
+ resp.ch.verbose = getVerboseState()
+ for _, libPath := range nvmlLibPaths {
+ lib := C.CString(libPath)
+ defer C.free(unsafe.Pointer(lib))
+ C.nvml_init(lib, &resp)
+ if resp.err != nil {
+ slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+ C.free(unsafe.Pointer(resp.err))
+ } else {
+ return &resp.ch, libPath
+ }
+ }
+ return nil, ""
+}
+
func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
var resp C.oneapi_init_resp_t
+ num_devices := 0
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
@@ -372,7 +552,10 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
- return int(resp.num_devices), &resp.oh, libPath
+ for i := range resp.oh.num_drivers {
+ num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
+ }
+ return num_devices, &resp.oh, libPath
}
}
return 0, nil, ""
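To show how the reworked discovery flow above is consumed, here is a minimal, hypothetical sketch (not part of the patch). The first GetGPUInfo call bootstraps the CPU and GPU lists; later calls only refresh free memory, and GetCPUInfo returns just the CPU entry (bootstrapping first if necessary). The import path and logging choices are assumptions.

```
// Hypothetical caller; assumes the gpu package API introduced in this patch.
package main

import (
    "log/slog"

    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu" // assumed import path for the package above
)

func main() {
    // First call bootstraps discovery (CPU, CUDA, ROCm, oneAPI); subsequent
    // calls only refresh free memory for the devices found here.
    for _, g := range gpu.GetGPUInfo() {
        slog.Info("detected device",
            "library", g.Library,
            "id", g.ID,
            "total", format.HumanBytes2(g.TotalMemory),
            "free", format.HumanBytes2(g.FreeMemory))
    }

    // CPU-only view, e.g. when GPU inference is not wanted.
    cpus := gpu.GetCPUInfo()
    slog.Info("system memory", "free", format.HumanBytes2(cpus[0].FreeMemory))
}
```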
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index f8cc1adb..f26d23c1 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
- Variant: GetCPUVariant(),
+ Variant: GetCPUCapability(),
memInfo: mem,
},
}
@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{info}
}
+func GetCPUInfo() GpuInfoList {
+ mem, _ := GetCPUMem()
+ return []GpuInfo{
+ {
+ Library: "cpu",
+ Variant: GetCPUCapability(),
+ memInfo: mem,
+ },
+ }
+}
+
func GetCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()),
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 482b81a6..ab0952d9 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -47,6 +47,7 @@ typedef struct mem_info {
char gpu_name[GPU_NAME_LEN];
uint64_t total;
uint64_t free;
+ uint64_t used;
// Compute Capability
int major;
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
+#include "gpu_info_nvml.h"
#include "gpu_info_oneapi.h"
#endif // __GPU_INFO_H__
diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c
deleted file mode 100644
index 6cbe28b0..00000000
--- a/gpu/gpu_info_cpu.c
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "gpu_info.h"
-// Fallbacks for CPU mode
-
-#ifdef _WIN32
-#include <sysinfoapi.h>
-void cpu_check_ram(mem_info_t *resp) {
- resp->err = NULL;
- MEMORYSTATUSEX info;
- info.dwLength = sizeof(info);
- if (GlobalMemoryStatusEx(&info) != 0) {
- resp->total = info.ullTotalPhys;
- resp->free = info.ullAvailPhys;
- snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
- } else {
- resp->err = LOAD_ERR();
- }
- return;
-}
-
-#elif __linux__
-#include <errno.h>
-#include <string.h>
-#include <sys/sysinfo.h>
-void cpu_check_ram(mem_info_t *resp) {
- struct sysinfo info;
- resp->err = NULL;
- if (sysinfo(&info) != 0) {
- resp->err = strdup(strerror(errno));
- } else {
- resp->total = info.totalram * info.mem_unit;
- resp->free = info.freeram * info.mem_unit;
- snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
- }
- return;
-}
-
-#elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-// mem_info_t resp = {0, 0, NULL};
-// return resp;
-// }
-#else
-#error "Unsupported platform"
-#endif
diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c
index 8e9204ea..03f15a2c 100644
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!l[i].p) {
+ if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
}
-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
@@ -166,9 +166,11 @@ void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
resp->total = memInfo.total;
resp->free = memInfo.free;
+ resp->used = memInfo.used;
LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+ LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}
diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h
index e8a89856..ff0c0af1 100644
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
} cudart_init_resp_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
+// TODO - if we keep this library longer term, add cudart_get_free
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__
diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c
index 26d855df..abe14084 100644
--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!*l[i].p) {
+ if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
}
const int buflen = 256;
-void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
- snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+ snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
resp->err = strdup(buf);
return;
}
@@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
- LOG(1, "nvcuda failed to release primary device context %d", ret);
+ LOG(1, "nvcuda failed to release device context %d", ret);
+ }
+}
+
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
+ CUresult ret;
+ CUcontext ctx = NULL;
+ CUdevice device = -1;
+ *free = 0;
+ *total = 0;
+
+ ret = (*h.cuDeviceGet)(&device, i);
+ if (ret != CUDA_SUCCESS) {
+ LOG(1, "nvcuda device failed to initialize");
+ return;
+ }
+
+
+ // To get memory we have to set (and release) a context
+ ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+ if (ret != CUDA_SUCCESS) {
+ LOG(1, "nvcuda failed to get device context %d", ret);
+ return;
+ }
+
+ ret = (*h.cuMemGetInfo_v2)(free, total);
+ if (ret != CUDA_SUCCESS) {
+ LOG(1, "nvcuda device memory info lookup failure %d", ret);
+ // Best effort on failure...
+ (*h.cuCtxDestroy)(ctx);
+ return;
+ }
+
+ ret = (*h.cuCtxDestroy)(ctx);
+ if (ret != CUDA_SUCCESS) {
+ LOG(1, "nvcuda failed to release device context %d", ret);
}
}
diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h
index 2b232839..f9654f64 100644
--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c
new file mode 100644
index 00000000..11293e44
--- /dev/null
+++ b/gpu/gpu_info_nvml.c
@@ -0,0 +1,104 @@
+#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+ nvmlReturn_t ret;
+ resp->err = NULL;
+ const int buflen = 256;
+ char buf[buflen + 1];
+ int i;
+
+ struct lookup {
+ char *s;
+ void **p;
+ } l[] = {
+ {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+ {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+ {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+ {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+ {NULL, NULL},
+ };
+
+ resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+ if (!resp->ch.handle) {
+ char *msg = LOAD_ERR();
+ LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+ snprintf(buf, buflen,
+ "Unable to load %s library to query for Nvidia GPUs: %s",
+ nvml_lib_path, msg);
+ free(msg);
+ resp->err = strdup(buf);
+ return;
+ }
+
+ // TODO once we've squashed the remaining corner cases remove this log
+ // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+
+ for (i = 0; l[i].s != NULL; i++) {
+ // TODO once we've squashed the remaining corner cases remove this log
+ // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+ *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+ if (!*(l[i].p)) {
+ resp->ch.handle = NULL;
+ char *msg = LOAD_ERR();
+ LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+ UNLOAD_LIBRARY(resp->ch.handle);
+ snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+ msg);
+ free(msg);
+ resp->err = strdup(buf);
+ return;
+ }
+ }
+
+ ret = (*resp->ch.nvmlInit_v2)();
+ if (ret != NVML_SUCCESS) {
+ LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+ UNLOAD_LIBRARY(resp->ch.handle);
+ resp->ch.handle = NULL;
+ snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+ resp->err = strdup(buf);
+ return;
+ }
+}
+
+
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+ nvmlDevice_t device;
+ nvmlMemory_t memInfo = {0};
+ nvmlReturn_t ret;
+ ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+ if (ret != NVML_SUCCESS) {
+ LOG(1, "unable to get device handle %d: %d", device_id, ret);
+ *free = 0;
+ return;
+ }
+
+ ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+ if (ret != NVML_SUCCESS) {
+ LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+ *free = 0;
+ return;
+ }
+ *free = memInfo.free;
+ *total = memInfo.total;
+ *used = memInfo.used;
+}
+
+
+void nvml_release(nvml_handle_t h) {
+ LOG(h.verbose, "releasing nvml library\n");
+ nvmlReturn_t ret;
+ ret = (*h.nvmlShutdown)();
+ if (ret != NVML_SUCCESS) {
+ LOG(1, "error during nvmlShutdown %d", ret);
+ }
+ UNLOAD_LIBRARY(h.handle);
+ h.handle = NULL;
+}
+
+#endif // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_nvml.h b/gpu/gpu_info_nvml.h
new file mode 100644
index 00000000..a661f723
--- /dev/null
+++ b/gpu/gpu_info_nvml.h
@@ -0,0 +1,48 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+ NVML_SUCCESS = 0,
+ // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t; // Opaque is sufficient
+typedef struct nvmlMemory_st {
+ unsigned long long total;
+ unsigned long long free;
+ unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+ NVML_BRAND_UNKNOWN = 0,
+} nvmlBrandType_t;
+
+typedef struct nvml_handle {
+ void *handle;
+ uint16_t verbose;
+ nvmlReturn_t (*nvmlInit_v2)(void);
+ nvmlReturn_t (*nvmlShutdown)(void);
+ nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+ nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml_handle_t;
+
+typedef struct nvml_init_resp {
+ char *err; // If err is non-null handle is invalid
+ nvml_handle_t ch;
+} nvml_init_resp_t;
+
+typedef struct nvml_compute_capability {
+ char *err;
+ int major;
+ int minor;
+} nvml_compute_capability_t;
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_release(nvml_handle_t ch);
+
+#endif // __GPU_INFO_NVML_H__
+#endif // __APPLE__
\ No newline at end of file
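With NVML added, the per-GPU free-memory refresh prefers it and falls back to the CUDA runtime and then the driver library. A condensed, hypothetical restatement of that priority order on the Go side (a fragment assuming the cgo context of gpu.go, not standalone code):

```
// Sketch only: priority order when refreshing free VRAM for each CUDA GPU.
var memInfo C.mem_info_t
for i, g := range cudaGPUs {
    switch {
    case cHandles.nvml != nil:
        // NVML reports free/total/used directly.
        C.nvml_get_free(*cHandles.nvml, C.int(g.index), &memInfo.free, &memInfo.total, &memInfo.used)
    case cHandles.cudart != nil:
        // cudart has no dedicated free-memory entry point yet, so re-run bootstrap.
        C.cudart_bootstrap(*cHandles.cudart, C.int(g.index), &memInfo)
    case cHandles.nvcuda != nil:
        C.nvcuda_get_free(*cHandles.nvcuda, C.int(g.index), &memInfo.free, &memInfo.total)
        memInfo.used = memInfo.total - memInfo.free
    default:
        slog.Warn("no valid cuda library loaded to refresh vram usage")
        continue
    }
    cudaGPUs[i].FreeMemory = uint64(memInfo.free)
}
```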
diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c
index 4be90e80..3ff708ea 100644
--- a/gpu/gpu_info_oneapi.c
+++ b/gpu/gpu_info_oneapi.c
@@ -4,15 +4,17 @@
#include <string.h>
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
-{
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
+ resp->oh.devices = NULL;
+ resp->oh.num_devices = NULL;
+ resp->oh.drivers = NULL;
+ resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
- int i;
- struct lookup
- {
+ int i, d;
+ struct lookup {
char *s;
void **p;
} l[] = {
@@ -28,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
- if (!resp->oh.handle)
- {
+ if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
@@ -44,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
- for (i = 0; l[i].s != NULL; i++)
- {
+ for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
- if (!l[i].p)
- {
+ if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -63,23 +62,70 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
}
}
+ LOG(resp->oh.verbose, "calling zesInit\n");
+
ret = (*resp->oh.zesInit)(0);
- if (ret != ZE_RESULT_SUCCESS)
- {
- LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
- UNLOAD_LIBRARY(resp->oh.handle);
- resp->oh.handle = NULL;
- snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
+ if (ret != ZE_RESULT_SUCCESS) {
+ LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
+ snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
+ oneapi_release(resp->oh);
+ return;
}
- (*resp->oh.zesDriverGet)(&resp->num_devices, NULL);
+ LOG(resp->oh.verbose, "calling zesDriverGet\n");
+ ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
+ if (ret != ZE_RESULT_SUCCESS) {
+ LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
+ snprintf(buf, buflen, "unable to get driver count: %x", ret);
+ resp->err = strdup(buf);
+ oneapi_release(resp->oh);
+ return;
+ }
+ LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
+ resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
+ resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
+ memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
+ resp->oh.devices =
+ malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
+ ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
+ if (ret != ZE_RESULT_SUCCESS) {
+ LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
+ snprintf(buf, buflen, "unable to get driver count: %x", ret);
+ resp->err = strdup(buf);
+ oneapi_release(resp->oh);
+ return;
+ }
+
+ for (d = 0; d < resp->oh.num_drivers; d++) {
+ LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
+ ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
+ &resp->oh.num_devices[d], NULL);
+ if (ret != ZE_RESULT_SUCCESS) {
+ LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
+ snprintf(buf, buflen, "unable to get device count: %x", ret);
+ resp->err = strdup(buf);
+ oneapi_release(resp->oh);
+ return;
+ }
+ resp->oh.devices[d] =
+ malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
+ ret = (*resp->oh.zesDeviceGet)(
+ resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
+ if (ret != ZE_RESULT_SUCCESS) {
+ LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
+ snprintf(buf, buflen, "unable to get device count: %x", ret);
+ resp->err = strdup(buf);
+ oneapi_release(resp->oh);
+ return;
+ }
+ }
return;
}
-void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
-{
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+ mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
@@ -88,127 +134,126 @@ void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
char buf[buflen + 1];
int i, d, m;
- if (h.handle == NULL)
- {
+ if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
- uint32_t driversCount = 0;
- ret = (*h.zesDriverGet)(&driversCount, NULL);
- if (ret != ZE_RESULT_SUCCESS)
- {
- snprintf(buf, buflen, "unable to get driver count: %d", ret);
- resp->err = strdup(buf);
+ if (driver > h.num_drivers || device > h.num_devices[driver]) {
+    resp->err = strdup("driver or device index out of bounds");
return;
}
- LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
-
- zes_driver_handle_t *allDrivers =
- malloc(driversCount * sizeof(zes_driver_handle_t));
- (*h.zesDriverGet)(&driversCount, allDrivers);
resp->total = 0;
resp->free = 0;
- for (d = 0; d < driversCount; d++)
- {
- uint32_t deviceCount = 0;
- ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
- if (ret != ZE_RESULT_SUCCESS)
- {
- snprintf(buf, buflen, "unable to get device count: %d", ret);
+ zes_device_ext_properties_t ext_props;
+ ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
+ ext_props.pNext = NULL;
+
+ zes_device_properties_t props;
+ props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ props.pNext = &ext_props;
+
+ ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
+ if (ret != ZE_RESULT_SUCCESS) {
+ snprintf(buf, buflen, "unable to get device properties: %d", ret);
+ resp->err = strdup(buf);
+ return;
+ }
+
+ snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
+
+ // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
+ // (this is probably wrong...)
+ // TODO - the driver isn't included - what if there are multiple drivers?
+ snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
+
+ if (h.verbose) {
+ // When in verbose mode, report more information about
+ // the card we discover.
+ LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
+ props.modelName);
+ LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
+ props.brandName);
+ LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
+ props.vendorName);
+ LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
+ props.serialNumber);
+ LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
+ props.boardNumber);
+ }
+
+ // TODO
+ // Compute Capability equivalent in resp->major, resp->minor, resp->patch
+
+ uint32_t memCount = 0;
+ ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
+ NULL);
+ if (ret != ZE_RESULT_SUCCESS) {
+ snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
+ ret);
+ resp->err = strdup(buf);
+ return;
+ }
+
+ LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
+
+ zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
+ (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
+
+ for (m = 0; m < memCount; m++) {
+ zes_mem_state_t state;
+ state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
+ state.pNext = NULL;
+ ret = (*h.zesMemoryGetState)(mems[m], &state);
+ if (ret != ZE_RESULT_SUCCESS) {
+ snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
- free(allDrivers);
+ free(mems);
return;
}
- LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
-
- zes_device_handle_t *devices =
- malloc(deviceCount * sizeof(zes_device_handle_t));
- (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
-
- for (i = 0; i < deviceCount; i++)
- {
- zes_device_ext_properties_t ext_props;
- ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
- ext_props.pNext = NULL;
-
- zes_device_properties_t props;
- props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
- props.pNext = &ext_props;
-
- ret = (*h.zesDeviceGetProperties)(devices[i], &props);
- if (ret != ZE_RESULT_SUCCESS)
- {
- snprintf(buf, buflen, "unable to get device properties: %d", ret);
- resp->err = strdup(buf);
- free(allDrivers);
- free(devices);
- return;
- }
-
- if (h.verbose)
- {
- // When in verbose mode, report more information about
- // the card we discover.
- LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
- props.modelName);
- LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
- props.brandName);
- LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
- props.vendorName);
- LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
- props.serialNumber);
- LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
- props.boardNumber);
- }
-
- uint32_t memCount = 0;
- ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
- if (ret != ZE_RESULT_SUCCESS)
- {
- snprintf(buf, buflen,
- "unable to enumerate Level-Zero memory modules: %d", ret);
- resp->err = strdup(buf);
- free(allDrivers);
- free(devices);
- return;
- }
-
- LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
-
- zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
- (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
-
- for (m = 0; m < memCount; m++)
- {
- zes_mem_state_t state;
- state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
- state.pNext = NULL;
- ret = (*h.zesMemoryGetState)(mems[m], &state);
- if (ret != ZE_RESULT_SUCCESS)
- {
- snprintf(buf, buflen, "unable to get memory state: %d", ret);
- resp->err = strdup(buf);
- free(allDrivers);
- free(devices);
- free(mems);
- return;
- }
-
- resp->total += state.size;
- resp->free += state.free;
- }
-
- free(mems);
- }
-
- free(devices);
+ resp->total += state.size;
+ resp->free += state.free;
}
- free(allDrivers);
+ free(mems);
+}
+
+void oneapi_release(oneapi_handle_t h) {
+ int d;
+ LOG(h.verbose, "releasing oneapi library\n");
+ for (d = 0; d < h.num_drivers; d++) {
+ if (h.devices != NULL && h.devices[d] != NULL) {
+ free(h.devices[d]);
+ }
+ }
+ if (h.devices != NULL) {
+ free(h.devices);
+ h.devices = NULL;
+ }
+ if (h.num_devices != NULL) {
+ free(h.num_devices);
+ h.num_devices = NULL;
+ }
+ if (h.drivers != NULL) {
+ free(h.drivers);
+ h.drivers = NULL;
+ }
+ h.num_drivers = 0;
+ UNLOAD_LIBRARY(h.handle);
+ h.handle = NULL;
+}
+
+int oneapi_get_device_count(oneapi_handle_t h, int driver) {
+ if (h.handle == NULL || h.num_devices == NULL) {
+ return 0;
+ }
+ if (driver > h.num_drivers) {
+ return 0;
+ }
+ return (int)h.num_devices[driver];
}
#endif // __APPLE__
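The driver and device arrays populated above are what the Go side walks when listing Intel GPUs. A hypothetical fragment (assuming the cgo context of gpu.go) mirroring the bootstrap loop earlier in this patch:

```
// Sketch only: enumerate oneAPI devices per driver via the new bindings.
oHandles := initOneAPIHandles()
if oHandles.oneapi != nil {
    var memInfo C.mem_info_t
    for d := range oHandles.oneapi.num_drivers {
        devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
        for i := range devCount {
            // Fills gpu_name/gpu_id plus total and free memory for (driver d, device i).
            C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
            slog.Debug("oneapi device", "driver", int(d), "device", int(i),
                "free", format.HumanBytes2(uint64(memInfo.free)))
        }
    }
}
```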
diff --git a/gpu/gpu_info_oneapi.h b/gpu/gpu_info_oneapi.h
index 9db9fae0..97fcecd9 100644
--- a/gpu/gpu_info_oneapi.h
+++ b/gpu/gpu_info_oneapi.h
@@ -9,8 +9,7 @@
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t
-{
+typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
-typedef enum _ze_structure_type_t
-{
+typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
-typedef enum _zes_structure_type_t
-{
+typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
-typedef enum _zes_mem_type_t
-{
+typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
-typedef enum _zes_mem_loc_t
-{
+typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
-typedef enum _zes_mem_health_t
-{
+typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
-typedef struct _ze_device_uuid_t
-{
+typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
-typedef struct _zes_uuid_t
-{
+typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
-typedef enum _ze_device_type_t
-{
+typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
-typedef enum _zes_device_type_t
-{
+typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t
-{
+typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t
-{
+typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
-typedef struct _ze_device_properties_t
-{
+typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
-typedef struct _zes_device_properties_t
-{
+typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
-typedef struct _zes_device_ext_properties_t
-{
+typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
-typedef struct _zes_mem_properties_t
-{
+typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
int32_t numChannels;
} zes_mem_properties_t;
-typedef struct _zes_mem_state_t
-{
+typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
@@ -171,10 +154,19 @@ typedef struct _zes_mem_state_t
uint64_t size;
} zes_mem_state_t;
-typedef struct oneapi_handle
-{
+typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
+
+ uint32_t num_drivers;
+ zes_driver_handle_t *drivers;
+ uint32_t *num_devices;
+ zes_device_handle_t **devices;
+
+ // TODO Driver major, minor information
+ // int driver_major;
+ // int driver_minor;
+
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
@@ -191,21 +183,21 @@ typedef struct oneapi_handle
} oneapi_handle_t;
-typedef struct oneapi_init_resp
-{
+typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
- int num_devices;
oneapi_handle_t oh;
} oneapi_init_resp_t;
-typedef struct oneapi_version_resp
-{
+typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+ mem_info_t *resp);
+void oneapi_release(oneapi_handle_t h);
+int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_INTEL_H__
#endif // __APPLE__
diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go
new file mode 100644
index 00000000..a099bf82
--- /dev/null
+++ b/gpu/gpu_linux.go
@@ -0,0 +1,89 @@
+package gpu
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "strings"
+
+ "github.com/ollama/ollama/format"
+)
+
+var CudartGlobs = []string{
+ "/usr/local/cuda/lib64/libcudart.so*",
+ "/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+ "/usr/lib/x86_64-linux-gnu/libcudart.so*",
+ "/usr/lib/wsl/lib/libcudart.so*",
+ "/usr/lib/wsl/drivers/*/libcudart.so*",
+ "/opt/cuda/lib64/libcudart.so*",
+ "/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+ "/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+ "/usr/lib/aarch64-linux-gnu/libcudart.so*",
+ "/usr/local/cuda/lib*/libcudart.so*",
+ "/usr/lib*/libcudart.so*",
+ "/usr/local/lib*/libcudart.so*",
+}
+
+var NvmlGlobs = []string{}
+
+var NvcudaGlobs = []string{
+ "/usr/local/cuda*/targets/*/lib/libcuda.so*",
+ "/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+ "/usr/lib/*-linux-gnu/libcuda.so*",
+ "/usr/lib/wsl/lib/libcuda.so*",
+ "/usr/lib/wsl/drivers/*/libcuda.so*",
+ "/opt/cuda/lib*/libcuda.so*",
+ "/usr/local/cuda/lib*/libcuda.so*",
+ "/usr/lib*/libcuda.so*",
+ "/usr/local/lib*/libcuda.so*",
+}
+
+var OneapiGlobs = []string{
+ "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+ "/usr/lib*/libze_intel_gpu.so*",
+}
+
+var CudartMgmtName = "libcudart.so*"
+var NvcudaMgmtName = "libcuda.so*"
+var NvmlMgmtName = "" // not currently wired on linux
+var OneapiMgmtName = "libze_intel_gpu.so"
+
+func GetCPUMem() (memInfo, error) {
+ var mem memInfo
+ var total, available, free, buffers, cached uint64
+ f, err := os.Open("/proc/meminfo")
+ if err != nil {
+ return mem, err
+ }
+ defer f.Close()
+ s := bufio.NewScanner(f)
+ for s.Scan() {
+ line := s.Text()
+ switch {
+ case strings.HasPrefix(line, "MemTotal:"):
+ _, err = fmt.Sscanf(line, "MemTotal:%d", &total)
+ case strings.HasPrefix(line, "MemAvailable:"):
+ _, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
+ case strings.HasPrefix(line, "MemFree:"):
+ _, err = fmt.Sscanf(line, "MemFree:%d", &free)
+ case strings.HasPrefix(line, "Buffers:"):
+ _, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
+ case strings.HasPrefix(line, "Cached:"):
+ _, err = fmt.Sscanf(line, "Cached:%d", &cached)
+ default:
+ continue
+ }
+ if err != nil {
+ return mem, err
+ }
+
+ if total > 0 && available > 0 {
+ mem.TotalMemory = total * format.KibiByte
+ mem.FreeMemory = available * format.KibiByte
+ return mem, nil
+ }
+ }
+ mem.TotalMemory = total * format.KibiByte
+ mem.FreeMemory = (free + buffers + cached) * format.KibiByte
+ return mem, nil
+}
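Here FreeMemory is MemAvailable when the kernel exposes it; older kernels fall back to MemFree + Buffers + Cached. A small, hypothetical fragment (inside the gpu package) showing how the result is used:

```
// Sketch only: system memory via the new /proc/meminfo parser.
mem, err := GetCPUMem()
if err != nil {
    slog.Warn("unable to read /proc/meminfo", "error", err)
} else {
    // FreeMemory == MemAvailable on recent kernels,
    // MemFree + Buffers + Cached otherwise.
    slog.Info("system memory",
        "total", format.HumanBytes2(mem.TotalMemory),
        "free", format.HumanBytes2(mem.FreeMemory))
}
```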
diff --git a/gpu/gpu_windows.go b/gpu/gpu_windows.go
new file mode 100644
index 00000000..f8c2e76f
--- /dev/null
+++ b/gpu/gpu_windows.go
@@ -0,0 +1,55 @@
+package gpu
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+)
+
+type MEMORYSTATUSEX struct {
+ length uint32
+ MemoryLoad uint32
+ TotalPhys uint64
+ AvailPhys uint64
+ TotalPageFile uint64
+ AvailPageFile uint64
+ TotalVirtual uint64
+ AvailVirtual uint64
+ AvailExtendedVirtual uint64
+}
+
+var (
+ k32 = syscall.NewLazyDLL("kernel32.dll")
+ globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
+ sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
+)
+
+var CudartGlobs = []string{
+ "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+var NvmlGlobs = []string{
+ "c:\\Windows\\System32\\nvml.dll",
+}
+
+var NvcudaGlobs = []string{
+ "c:\\windows\\system*\\nvcuda.dll",
+}
+
+var OneapiGlobs = []string{
+ "c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var CudartMgmtName = "cudart64_*.dll"
+var NvcudaMgmtName = "nvcuda.dll"
+var NvmlMgmtName = "nvml.dll"
+var OneapiMgmtName = "ze_intel_gpu64.dll"
+
+func GetCPUMem() (memInfo, error) {
+ memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
+ r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
+ if r1 == 0 {
+ return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
+ }
+ return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
+}
diff --git a/gpu/types.go b/gpu/types.go
index af33b896..9920db5f 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -18,7 +18,7 @@ type GpuInfo struct {
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
- Variant string `json:"variant,omitempty"`
+ Variant CPUCapability `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
@@ -26,6 +26,9 @@ type GpuInfo struct {
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
+ // Extra environment variables specific to the GPU as list of [key,value]
+ EnvWorkarounds [][2]string `json:"envs,omitempty"`
+
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
@@ -38,6 +41,30 @@ type GpuInfo struct {
// TODO other performance capability info to help in scheduling decisions
}
+type CPUInfo struct {
+ GpuInfo
+}
+
+type CudaGPUInfo struct {
+ GpuInfo
+ index int //nolint:unused,nolintlint
+}
+type CudaGPUInfoList []CudaGPUInfo
+
+type RocmGPUInfo struct {
+ GpuInfo
+ usedFilepath string //nolint:unused,nolintlint
+ index int //nolint:unused,nolintlint
+}
+type RocmGPUInfoList []RocmGPUInfo
+
+type OneapiGPUInfo struct {
+ GpuInfo
+ driverIndex int //nolint:unused,nolintlint
+ gpuIndex int //nolint:unused,nolintlint
+}
+type OneapiGPUInfoList []OneapiGPUInfo
+
type GpuInfoList []GpuInfo
// Split up the set of gpu info's by Library and variant
@@ -47,8 +74,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
- if info.Variant != "" {
- requested += "_" + info.Variant
+ if info.Variant != CPUCapabilityNone {
+ requested += "_" + info.Variant.String()
}
for i, lib := range libs {
if lib == requested {
@@ -86,3 +113,26 @@ type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
+
+type CPUCapability uint32
+
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+const (
+ CPUCapabilityNone CPUCapability = iota
+ CPUCapabilityAVX
+ CPUCapabilityAVX2
+ // TODO AVX512
+)
+
+func (c CPUCapability) String() string {
+ switch c {
+ case CPUCapabilityAVX:
+ return "avx"
+ case CPUCapabilityAVX2:
+ return "avx2"
+ default:
+ return "no vector extensions"
+ }
+}
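Variant is now a typed CPUCapability rather than a free-form string, so ByLibrary keys on its String() form. A hypothetical, standalone sketch (import path assumed):

```
package main

import (
    "log/slog"

    "github.com/ollama/ollama/gpu" // assumed import path
)

func main() {
    l := gpu.GpuInfoList{
        {Library: "cuda", ID: "0"},
        {Library: "cpu", Variant: gpu.CPUCapabilityAVX2, ID: "0"},
    }
    // Devices are grouped by Library, with the CPU capability appended
    // when it is not CPUCapabilityNone, e.g. "cpu_avx2".
    for _, group := range l.ByLibrary() {
        key := group[0].Library
        if group[0].Variant != gpu.CPUCapabilityNone {
            key += "_" + group[0].Variant.String()
        }
        slog.Info("runner group", "key", key, "devices", len(group))
    }
}
```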
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index f6bdb9d4..d66ba9f0 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
- Model: "orca-mini",
- Prompt: "why is the ocean blue?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "why is the ocean blue?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
- Model: "tinydolphin",
- Prompt: "what is the origin of the us thanksgiving holiday?",
- Stream: &stream,
+ Model: "tinydolphin",
+ Prompt: "what is the origin of the us thanksgiving holiday?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -38,42 +40,64 @@ func TestMultiModelConcurrency(t *testing.T) {
}
resp = [2][]string{
[]string{"sunlight"},
- []string{"england", "english", "massachusetts", "pilgrims"},
+ []string{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
- ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
+ ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
+
+ client, _, cleanup := InitServerConnection(ctx, t)
+ defer cleanup()
+
+ for i := 0; i < len(req); i++ {
+ require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
+ }
+
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
- GenerateTestHelper(ctx, t, req[i], resp[i])
+ DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
+ req, resp := GenerateRequests()
+ reqLimit := len(req)
+ iterLimit := 5
+
+ vram := os.Getenv("OLLAMA_MAX_VRAM")
+ if vram != "" {
+ max, err := strconv.ParseUint(vram, 10, 64)
+ require.NoError(t, err)
+ // Don't hammer on small VRAM cards...
+ if max < 4*1024*1024*1024 {
+ reqLimit = min(reqLimit, 2)
+ iterLimit = 2
+ }
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
- req, resp := GenerateRequests()
// Get the server running (if applicable) warm the model up with a single initial request
- DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
+ DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
- wg.Add(len(req))
- for i := 0; i < len(req); i++ {
+ wg.Add(reqLimit)
+ for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
- for j := 0; j < 5; j++ {
+ for j := 0; j < iterLimit; j++ {
slog.Info("Starting", "req", i, "iter", j)
- // On slower GPUs it can take a while to process the 4 concurrent requests
+ // On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
- DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+ DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
@@ -221,5 +245,23 @@ func TestMultiModelStress(t *testing.T) {
}
}(i)
}
+ go func() {
+ for {
+ time.Sleep(2 * time.Second)
+ select {
+ case <-ctx.Done():
+ return
+ default:
+ models, err := client.ListRunning(ctx)
+ if err != nil {
+ slog.Warn("failed to list running models", "error", err)
+ continue
+ }
+ for _, m := range models.Models {
+ slog.Info("loaded model snapshot", "model", m)
+ }
+ }
+ }
+ }()
wg.Wait()
}
diff --git a/integration/context_test.go b/integration/context_test.go
index 08033125..46fac5ea 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,7 +11,8 @@ import (
)
func TestContextExhaustion(t *testing.T) {
- ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+ // Longer needed for small footprint GPUs
+ ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
diff --git a/integration/llm_image_test.go b/integration/llm_image_test.go
index 77319aef..d0c861cc 100644
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
resp := "the ollam"
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
- GenerateTestHelper(ctx, t, req, []string{resp})
+ client, _, cleanup := InitServerConnection(ctx, t)
+ defer cleanup()
+ require.NoError(t, PullIfMissing(ctx, client, req.Model))
+ // llava models on CPU can be quite slow to start,
+ DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
}
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
diff --git a/integration/utils_test.go b/integration/utils_test.go
index c6f19e98..7e1fcc10 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
showCtx, cancel := context.WithDeadlineCause(
ctx,
- time.Now().Add(5*time.Second),
+ time.Now().Add(10*time.Second),
fmt.Errorf("show for existing model %s took too long", modelName),
)
defer cancel()
@@ -287,41 +287,46 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
func GenerateRequests() ([]api.GenerateRequest, [][]string) {
return []api.GenerateRequest{
{
- Model: "orca-mini",
- Prompt: "why is the ocean blue?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "why is the ocean blue?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
- Model: "orca-mini",
- Prompt: "why is the color of dirt brown?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "why is the color of dirt brown?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
- Model: "orca-mini",
- Prompt: "what is the origin of the us thanksgiving holiday?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "what is the origin of the us thanksgiving holiday?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
- Model: "orca-mini",
- Prompt: "what is the origin of independence day?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "what is the origin of independence day?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
- Model: "orca-mini",
- Prompt: "what is the composition of air?",
- Stream: &stream,
+ Model: "orca-mini",
+ Prompt: "what is the composition of air?",
+ Stream: &stream,
+ KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -331,7 +336,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
[][]string{
[]string{"sunlight"},
[]string{"soil", "organic", "earth", "black", "tan"},
- []string{"england", "english", "massachusetts", "pilgrims"},
+ []string{"england", "english", "massachusetts", "pilgrims", "british"},
[]string{"fourth", "july", "declaration", "independence"},
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
}
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 93e71562..492126a4 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -56,7 +56,6 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
- std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
return true;
}
- void validate_model_chat_template(server_params & sparams) {
- llama_chat_message chat[] = {{"user", "test"}};
- std::vector<char> buf(1);
- int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
- if (res < 0) {
- LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
- }
- }
-
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -2335,9 +2324,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
}
else if (arg == "--tensor-split" || arg == "-ts")
{
@@ -2346,7 +2335,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@@ -2367,8 +2356,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
}
}
#else
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+ LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
}
else if (arg == "--main-gpu" || arg == "-mg")
{
@@ -2377,7 +2366,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
- if (sparams.chat_template.empty()) { // custom chat template is not supplied
- // check if the template comes with the model is supported by us
- llama.validate_model_chat_template(sparams);
- }
-
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 0baf86ff..721a9ae8 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,7 +18,7 @@ sign() {
fi
}
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
case "${GOARCH}" in
"amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index dbf06c19..43ad5b81 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -60,7 +60,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -73,7 +73,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -102,7 +102,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
- COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+ COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -187,7 +187,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
- CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+ CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index e02fb596..b793af56 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -51,7 +51,8 @@ function init_vars {
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
- "-DLLAMA_NATIVE=off"
+ "-DLLAMA_NATIVE=off",
+ "-DLLAMA_OPENMP=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -134,8 +135,13 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
- & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
+ if ($cmakeDefs -contains "-G") {
+ $extra=@("-j8")
+ } else {
+ $extra= @("--", "/p:CL_MPcount=8")
+ }
+ write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
+ & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -215,7 +221,8 @@ function build_static() {
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
- "-DLLAMA_FMA=off")
+ "-DLLAMA_FMA=off",
+ "-DLLAMA_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
@@ -282,7 +289,15 @@ function build_cuda() {
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
- $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+ $script:cmakeDefs += @(
+ "-A", "x64",
+ "-DLLAMA_CUDA=ON",
+ "-DLLAMA_AVX=on",
+ "-DLLAMA_AVX2=off",
+ "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
+ "-DCMAKE_CUDA_FLAGS=-t8",
+ "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+ )
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -292,10 +307,12 @@ function build_cuda() {
sign
install
- write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
+ rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
} else {
write-host "Skipping CUDA generation step"
}
@@ -329,16 +346,18 @@ function build_oneapi() {
sign
install
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+ rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
} else {
Write-Host "Skipping oneAPI generation step"
}
diff --git a/llm/ggml.go b/llm/ggml.go
index 645447d5..f02f0ff6 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -69,6 +69,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
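+// EmbeddingHeadCount returns the per-attention-head embedding size (n_embd / n_head), or 0 when the head count is unset.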
+func (kv KV) EmbeddingHeadCount() uint64 {
+ if heads := kv.HeadCount(); heads > 0 {
+ return kv.EmbeddingLength() / kv.HeadCount()
+ }
+
+ return 0
+}
+
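+// EmbeddingHeadCountK and EmbeddingHeadCountV prefer the model's declared attention key/value lengths, falling back to the per-head embedding size.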
+func (kv KV) EmbeddingHeadCountK() uint64 {
+ if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
+ return k
+ }
+
+ return kv.EmbeddingHeadCount()
+}
+
+func (kv KV) EmbeddingHeadCountV() uint64 {
+ if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
+ return v
+ }
+
+ return kv.EmbeddingHeadCount()
+}
+
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
@@ -299,6 +323,9 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+ embeddingHeads := llm.KV().EmbeddingHeadCount()
+ embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+
layers := llm.Tensors().Layers()
switch llm.KV().Architecture() {
@@ -307,7 +334,8 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
- 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+ // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
+ 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -315,15 +343,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
- 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
- 4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
+ 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+ 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
- 4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+ 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
@@ -366,6 +394,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
+ case "deepseek2":
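+ // rough full and partial offload graph-size estimates for deepseek2, scaled by the key head dimension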
+ fullOffload = max(
+ 4*batch*(3*embedding+vocab),
+ 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
+ )
+
+ partialOffload = max(
+ 4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+ 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
+ )
}
return
diff --git a/llm/llama.cpp b/llm/llama.cpp
index 5921b8f0..7c26775a 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603
+Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
diff --git a/llm/memory.go b/llm/memory.go
index 1c2e476b..19b12cbf 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,9 +3,10 @@ package llm
import (
"fmt"
"log/slog"
+ "strconv"
+ "strings"
"github.com/ollama/ollama/api"
- "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
- layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+ estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+ layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
return true, estimatedVRAM
@@ -30,24 +32,76 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
return false, estimatedVRAM
}
+type MemoryEstimate struct {
+ // How many layers we predict we can load
+ Layers int
+
+ // The size of the graph which occupies the main GPU
+ Graph uint64
+
+ // How much VRAM will be allocated given the number of layers we predict
+ VRAMSize uint64
+
+ // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
+ TotalSize uint64
+
+ // For multi-GPU scenarios, this provides the tensor split parameter
+ TensorSplit string
+
+ // For multi-GPU scenarios, this is the size in bytes per GPU
+ GPUSizes []uint64
+
+ // internal fields for logging purposes
+ inferenceLibrary string
+ layersRequested int
+ layersModel int
+ availableList []string
+ kv uint64
+ allocationsList []string
+ memoryWeights uint64
+ memoryLayerOutput uint64
+ graphFullOffload uint64
+ graphPartialOffload uint64
+}
+
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
- var memoryAvailable uint64
- for _, info := range gpus {
- memoryAvailable += info.FreeMemory
- }
- if envconfig.MaxVRAM > 0 {
- memoryAvailable = envconfig.MaxVRAM
- }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+ // Graph size for a partial offload, applies to all GPUs
+ var graphPartialOffload uint64
- slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+ // Graph size when all layers are offloaded, applies to all GPUs
+ var graphFullOffload uint64
- // TODO - this is probably wrong, first GPU vs secondaries will have different overheads
- memoryMinimum := gpus[0].MinimumMemory
+ // Final graph offload once we know full or partial
+ var graphOffload uint64
+
+ // Projectors loaded into GPU0 only
+ var projectorSize uint64
+
+ // Conditional output size on GPU 0
+ var memoryLayerOutput uint64
+
+ // The sizes of a layer
+ var layerSize uint64
+
+ // The sum of all the layer sizes (just for logging)
+ var memoryWeights uint64
+
+ // True if all the layers are loaded
+ var fullyLoaded bool
+
+ // Overflow that didn't fit into the GPU
+ var overflow uint64
+
+ availableList := make([]string, len(gpus))
+ for i, gpu := range gpus {
+ availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+ }
+ slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
- memoryMinimum += projectorMemoryRequirements(projector)
+ projectorSize += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,127 +110,246 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
layers := ggml.Tensors().Layers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
- memoryMinimum += blk0.size()
+ layerSize = blk0.size()
+ } else {
+ slog.Warn("model missing blk.0 layer size")
}
- // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
- var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
+ // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
+ var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
- graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+ // KV is proportional to the number of layers
+ layerSize += kv / ggml.KV().BlockCount()
+
+ graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
-
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
- graphFullOffload *= uint64(len(gpus))
- graphPartialOffload *= uint64(len(gpus))
-
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload
+ } else if len(gpus) > 1 {
+ // multigpu should always use the partial graph size
+ graphFullOffload = graphPartialOffload
}
- // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
- memoryRequiredTotal := memoryMinimum + graphFullOffload
-
- // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
- memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
- var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.size()
}
-
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.size()
}
- if gpus[0].Library == "metal" && opts.UseMMap {
- // memory is preallocated for output tensors
- memoryRequiredTotal += memoryLayerOutput
- memoryRequiredPartial += memoryLayerOutput
+ // Output layer handled at the end if we have space
+ gpuZeroOverhead := projectorSize
+
+ // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
+ var layerCount int
+ layerCounts := make([]int, len(gpus))
+ gpuAllocations := make([]uint64, len(gpus))
+ type gs struct {
+ i int
+ g *gpu.GpuInfo
+ }
+ gpusWithSpace := []gs{}
+ for i := range gpus {
+ var gzo uint64
+ if len(gpusWithSpace) == 0 {
+ gzo = gpuZeroOverhead
+ }
+ // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
+ if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+ slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+ continue
+ }
+ gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+ gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
}
- var layerCount int
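+ // the first GPU with space ("GPU zero") also hosts the projector overhead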
+ var gpuZeroID int
+ if len(gpusWithSpace) > 0 {
+ gpuZeroID = gpusWithSpace[0].i
+ gpuAllocations[gpuZeroID] += gpuZeroOverhead
+ }
+
+ // For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
+ // Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
- memoryLayer := blk.size()
+ layerSize = blk.size()
+ layerSize += kv / ggml.KV().BlockCount()
+ }
+ memoryWeights += layerSize
- // KV is proportional to the number of layers
- memoryLayer += kv / ggml.KV().BlockCount()
+ if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+ // Stop allocating on GPU(s) once we hit the users target NumGPU
+ continue
+ }
- memoryRequiredTotal += memoryLayer
- if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
- memoryRequiredPartial += memoryLayer
+ // distribute the layers across the GPU(s) that have space
+ for j := len(gpusWithSpace); j > 0; j-- {
+ g := gpusWithSpace[i%j]
+ used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+ if g.g.FreeMemory > used+layerSize {
+ gpuAllocations[g.i] += layerSize
+ layerCounts[g.i]++
layerCount++
+ break
+ } else {
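+ // this GPU can no longer fit another layer; drop it from the candidate list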
+ gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}
}
-
- if gpus[0].Library != "metal" || !opts.UseMMap {
- // memory was not preallocated for output tensors
- memoryRequiredTotal += memoryLayerOutput
+ if layerCount >= int(ggml.KV().BlockCount()) {
+ fullyLoaded = true
+ } else {
+ for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+ overflow += layerSize
+ }
}
- if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
- layerCount = int(ggml.KV().BlockCount()) + 1
- memoryRequiredPartial = memoryRequiredTotal
+ // Determine if we need to consider output then find where it fits
+ if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+ for j := len(gpusWithSpace); j > 0; j-- {
+ g := gpusWithSpace[layerCount%j]
+ used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+ if g.g.FreeMemory > used+memoryLayerOutput {
+ gpuAllocations[g.i] += memoryLayerOutput
+ layerCounts[g.i]++
+ layerCount++
+ break
+ }
+ }
+
+ if layerCount < int(ggml.KV().BlockCount())+1 {
+ fullyLoaded = false
+ overflow += memoryLayerOutput
+ }
}
- memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+ // Add the applicable (full or partial) graph allocations
+ for i := range gpus {
+ if layerCounts[i] <= 0 {
+ continue
+ }
+ if fullyLoaded {
+ gpuAllocations[i] += graphFullOffload
+ } else {
+ gpuAllocations[i] += graphPartialOffload
+ }
+ }
+ if fullyLoaded {
+ graphOffload = graphFullOffload
+ } else {
+ graphOffload = graphPartialOffload
+ }
+ // Summaries for the log
+ var memoryRequiredPartial, memoryRequiredTotal uint64
+ for i := range gpuAllocations {
+ memoryRequiredPartial += gpuAllocations[i]
+ }
+ memoryRequiredTotal = memoryRequiredPartial + overflow
+
+ tensorSplit := ""
+ if len(gpus) > 1 {
+ splits := make([]string, len(gpus))
+ for i, count := range layerCounts {
+ splits[i] = strconv.Itoa(count)
+ }
+ tensorSplit = strings.Join(splits, ",")
+ }
+ allocationsList := []string{}
+ for _, a := range gpuAllocations {
+ allocationsList = append(allocationsList, format.HumanBytes2(a))
+ }
+
+ estimate := MemoryEstimate{
+ TotalSize: memoryRequiredTotal,
+ Layers: 0,
+ Graph: 0,
+ VRAMSize: 0,
+ GPUSizes: []uint64{},
+
+ inferenceLibrary: gpus[0].Library,
+ layersRequested: opts.NumGPU,
+ layersModel: int(ggml.KV().BlockCount()) + 1,
+ availableList: availableList,
+ kv: kv,
+ allocationsList: allocationsList,
+ memoryWeights: memoryWeights,
+ memoryLayerOutput: memoryLayerOutput,
+ graphFullOffload: graphFullOffload,
+ graphPartialOffload: graphPartialOffload,
+ }
+
+ if gpus[0].Library == "cpu" {
+ return estimate
+ }
+ if layerCount == 0 {
+ slog.Debug("insufficient VRAM to load any model layers")
+ return estimate
+ }
+ estimate.Layers = layerCount
+ estimate.Graph = graphOffload
+ estimate.VRAMSize = memoryRequiredPartial
+ estimate.TotalSize = memoryRequiredTotal
+ estimate.TensorSplit = tensorSplit
+ estimate.GPUSizes = gpuAllocations
+ return estimate
+}
+
+func (m MemoryEstimate) log() {
slog.Info(
- "offload to gpu",
+ "offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
- "requested", opts.NumGPU,
+ "requested", m.layersRequested,
+ // The number of layers the model has (including output)
+ "model", m.layersModel,
// estimated number of layers that can be offloaded
- "real", layerCount,
+ "offload", m.Layers,
+ // multi-gpu split for tensors
+ "split", m.TensorSplit,
),
slog.Group(
"memory",
- // memory available for offloading
- "available", format.HumanBytes2(memoryAvailable),
+ // memory available by GPU for offloading
+ "available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
- "full", format.HumanBytes2(memoryRequiredTotal),
+ "full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
- "partial", format.HumanBytes2(memoryRequiredPartial),
+ "partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
- "kv", format.HumanBytes2(kv),
+ "kv", format.HumanBytes2(m.kv),
+ // Allocations across the GPUs
+ "allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
- "total", format.HumanBytes2(memoryWeights),
+ "total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
- "repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+ "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
- "nonrepeating", format.HumanBytes2(memoryLayerOutput),
+ "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
- "full", format.HumanBytes2(graphFullOffload),
+ "full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
- "partial", format.HumanBytes2(graphPartialOffload),
+ "partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
- if gpus[0].Library == "cpu" {
- return 0, 0, memoryRequiredTotal
- }
- if memoryRequiredPartial > memoryAvailable {
- slog.Debug("insufficient VRAM to load any model layers")
- return 0, 0, memoryRequiredTotal
- }
-
- return layerCount, memoryRequiredPartial, memoryRequiredTotal
}
diff --git a/llm/memory_test.go b/llm/memory_test.go
new file mode 100644
index 00000000..8eaa0771
--- /dev/null
+++ b/llm/memory_test.go
@@ -0,0 +1,127 @@
+package llm
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "os"
+ "testing"
+
+ "github.com/ollama/ollama/api"
+ "github.com/ollama/ollama/envconfig"
+ "github.com/ollama/ollama/gpu"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+ envconfig.Debug = true
+ modelName := "dummy"
+ f, err := os.CreateTemp(t.TempDir(), modelName)
+ require.NoError(t, err)
+ defer f.Close()
+ gguf := NewGGUFV3(binary.LittleEndian)
+ inputLayerCount := 5
+ tensors := []Tensor{
+ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ }
+ assert.Len(t, tensors, inputLayerCount+1)
+ err = gguf.Encode(f, KV{
+ "general.architecture": "llama",
+ "general.name": "name",
+ "llama.context_length": uint32(32),
+ "llama.embedding_length": uint32(4096),
+ "llama.block_count": uint32(inputLayerCount),
+ "llama.attention.head_count": uint32(32),
+ "llama.attention.head_count_kv": uint32(32),
+ "tokenizer.ggml.tokens": []string{" "},
+ "tokenizer.ggml.scores": []float32{0},
+ "tokenizer.ggml.token_type": []int32{0},
+ }, tensors)
+ require.NoError(t, err)
+
+ ggml, err := LoadModel(f.Name())
+ require.NoError(t, err)
+
+ // Simple CPU scenario
+ gpus := []gpu.GpuInfo{
+ {
+ Library: "cpu",
+ },
+ }
+ projectors := []string{}
+ opts := api.DefaultOptions()
+ t.Run("cpu", func(t *testing.T) {
+ estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+ assert.Equal(t, 0, estimate.Layers)
+ assert.Equal(t, uint64(0), estimate.Graph)
+ })
+
+ // derived from the dummy ggml file above
+ graphPartialOffload := uint64(202377216)
+ graphFullOffload := uint64(171968512)
+ layerSize := uint64(33554436)
+ projectorSize := uint64(0)
+ memoryLayerOutput := uint64(4)
+
+ // Dual CUDA scenario with asymmetry
+ gpuMinimumMemory := uint64(2048)
+ gpus = []gpu.GpuInfo{
+ {
+ Library: "cuda",
+ MinimumMemory: gpuMinimumMemory,
+ },
+ {
+ Library: "cuda",
+ MinimumMemory: gpuMinimumMemory,
+ },
+ }
+ // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+ for i, s := range []struct {
+ layer0, layer1 uint64
+ expect0, expect1 uint64
+ }{
+ {1, 1, 1, 1},
+ {2, 1, 2, 1},
+ {2, 2, 2, 2},
+ {1, 2, 1, 2},
+ {3, 3, 3, 3},
+ {4, 4, 3, 3},
+ {6, 6, 3, 3},
+ {0, 3, 0, 3},
+ } {
+ t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
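+ // build each GPU's free memory from the components the estimator accounts for: minimum overhead, a buffer layer, per-layer space, the output/projector layers, and the graph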
+ gpus[0].FreeMemory = 0
+ gpus[1].FreeMemory = 0
+ gpus[0].FreeMemory += projectorSize
+ if s.layer0 > 0 {
+ gpus[0].FreeMemory += memoryLayerOutput
+ } else {
+ gpus[1].FreeMemory += memoryLayerOutput
+ }
+ gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
+ gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
+ gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+ gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+ estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+ assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+ assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
+ var layerSums uint64
+ for _, b := range estimate.GPUSizes {
+ layerSums += b
+ }
+ if estimate.Layers < inputLayerCount+1 {
+ assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+ assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+ } else {
+ assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+ assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+ }
+ })
+ }
+}
diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff
index acd44d20..be528609 100644
--- a/llm/patches/01-load-progress.diff
+++ b/llm/patches/01-load-progress.diff
@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
- // multimodal models (see examples/llava)
+@@ -180,6 +180,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
-+
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
- };
-
- void gpt_params_handle_model_default(gpt_params & params);
++
+ // server params
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index 27c8aabc..2a2e7306 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- } else if (
-+ if (
- tokenizer_pre == "default") {
+- } else if (tokenizer_pre == "default") {
++ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
- tokenizer_pre == "smaug-bpe") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ tokenizer_pre == "llama3" ||
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+ tokenizer_pre == "poro-chat") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
diff --git a/llm/payload.go b/llm/payload.go
index a025ee34..9296db33 100644
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
- pattern := filepath.Join(payloadsDir, "*")
+ pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
- servers[filepath.Base(file)] = file
+ servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
- if info.Variant != "" {
- requested += "_" + info.Variant
+ if info.Variant != gpu.CPUCapabilityNone {
+ requested += "_" + info.Variant.String()
}
servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
- variant := gpu.GetCPUVariant()
+ variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
- if variant != "" {
+ if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
- if cmp == "cpu_"+variant {
+ if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
@@ -146,11 +146,11 @@ func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
- variant := gpu.GetCPUVariant()
+ variant := gpu.GetCPUCapability()
availableServers := availableServers()
- if variant != "" {
+ if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
- if cmp == "cpu_"+variant {
+ if cmp == "cpu_"+variant.String() {
return cmp
}
}
diff --git a/llm/server.go b/llm/server.go
index 0a815798..da83416e 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -37,8 +37,9 @@ type LlamaServer interface {
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
- EstimatedVRAM() uint64
+ EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
+ EstimatedVRAMByGPU(gpuID string) uint64
}
// llmServer is an instance of the llama.cpp server
@@ -49,13 +50,12 @@ type llmServer struct {
status *StatusWriter
options api.Options
- // TODO - this should be broken down by GPU
- estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
- estimatedTotal uint64 // Total size of model
- totalLayers uint64
- gpuCount int
- loadDuration time.Duration // Record how long it took the model to load
- loadProgress float32
+ estimate MemoryEstimate
+ totalLayers uint64
+ // gpuCount int
+ gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+ loadDuration time.Duration // Record how long it took the model to load
+ loadProgress float32
sem *semaphore.Weighted
}
@@ -80,43 +80,45 @@ func LoadModel(model string) (*GGML, error) {
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
var err error
var cpuRunner string
- var estimatedVRAM uint64
- var estimatedTotal uint64
- var systemMemory uint64
- gpuCount := len(gpus)
- if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
- // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
+ var estimate MemoryEstimate
+ var systemTotalMemory uint64
+ var systemFreeMemory uint64
- cpuRunner = serverForCpu()
- gpuCount = 0
- _, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+ systemMemInfo, err := gpu.GetCPUMem()
+ if err != nil {
+ slog.Error("failed to lookup system memory", "error", err)
} else {
- if gpus[0].Library == "metal" {
- memInfo, err := gpu.GetCPUMem()
- if err != nil {
- slog.Error("failed to lookup system memory", "error", err)
- } else {
- systemMemory = memInfo.TotalMemory
- slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
- }
- }
- var layers int
- layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+ systemTotalMemory = systemMemInfo.TotalMemory
+ systemFreeMemory = systemMemInfo.FreeMemory
+ slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory))
+ }
+
+ // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
+ if opts.NumGPU == 0 {
+ gpus = gpu.GetCPUInfo()
+ }
+ if len(gpus) == 1 && gpus[0].Library == "cpu" {
+ cpuRunner = serverForCpu()
+ estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+ } else {
+ estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
- case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+ case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
- case gpus[0].Library != "metal" && layers == 0:
+ case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu()
- gpuCount = 0
- case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
- opts.NumGPU = layers
+ gpus = gpu.GetCPUInfo()
+ case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+ opts.NumGPU = estimate.Layers
}
}
+ estimate.log()
+
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -201,7 +203,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
- opts.UseMMap = false
+ opts.UseMMap = api.TriStateFalse
}
}
@@ -209,7 +211,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
- if !opts.UseMMap {
+ // Windows CUDA should not use mmap for best performance
+ // Linux with a model larger than free space, mmap leads to thrashing
+ if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
+ (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+ opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
@@ -232,6 +238,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
+ if estimate.TensorSplit != "" {
+ params = append(params, "--tensor-split", estimate.TensorSplit)
+ }
+
for i := range len(servers) {
dir := availableServers[servers[i]]
if dir == "" {
@@ -242,8 +256,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
if strings.HasPrefix(servers[i], "cpu") {
- // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
- gpuCount = 0
+ gpus = gpu.GetCPUInfo()
}
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -265,8 +278,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
- // prepend the server directory to LD_LIBRARY_PATH/PATH
- libraryPaths := []string{dir}
+ // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
+ libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -299,22 +312,25 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
s := &llmServer{
- port: port,
- cmd: exec.Command(server, finalParams...),
- status: NewStatusWriter(os.Stderr),
- options: opts,
- estimatedVRAM: estimatedVRAM,
- estimatedTotal: estimatedTotal,
- sem: semaphore.NewWeighted(int64(numParallel)),
- totalLayers: ggml.KV().BlockCount() + 1,
- gpuCount: gpuCount,
- done: make(chan error, 1),
+ port: port,
+ cmd: exec.Command(server, finalParams...),
+ status: NewStatusWriter(os.Stderr),
+ options: opts,
+ estimate: estimate,
+ sem: semaphore.NewWeighted(int64(numParallel)),
+ totalLayers: ggml.KV().BlockCount() + 1,
+ gpus: gpus,
+ done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
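+ // collect any GPU-specific environment variable workarounds so they can be applied to the runner's environment below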
+ envWorkarounds := [][2]string{}
+ for _, gpu := range gpus {
+ envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+ }
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
@@ -329,6 +345,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
+ } else if len(envWorkarounds) != 0 {
+ for _, kv := range envWorkarounds {
+ if strings.EqualFold(cmp[0], kv[0]) {
+ s.cmd.Env[i] = kv[0] + "=" + kv[1]
+ }
+ }
}
}
if pathNeeded {
@@ -1004,11 +1026,20 @@ func (s *llmServer) Close() error {
}
func (s *llmServer) EstimatedVRAM() uint64 {
- return s.estimatedVRAM
+ return s.estimate.VRAMSize
}
func (s *llmServer) EstimatedTotal() uint64 {
- return s.estimatedTotal
+ return s.estimate.TotalSize
+}
+
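+// EstimatedVRAMByGPU returns the estimated VRAM allocated on the GPU with the given ID, or 0 if that GPU was not part of the estimate.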
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
+ for i, gpu := range s.gpus {
+ if gpu.ID == gpuID {
+ return s.estimate.GPUSizes[i]
+ }
+ }
+ return 0
}
func parseDurationMs(ms float64) time.Duration {
diff --git a/openai/openai.go b/openai/openai.go
index 310051a5..706d31aa 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -178,9 +178,6 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
if r.Seed != nil {
options["seed"] = *r.Seed
-
- // temperature=0 is required for reproducible outputs
- options["temperature"] = 0.0
}
if r.FrequencyPenalty != nil {
diff --git a/parser/parser.go b/parser/parser.go
index c9afc20b..686a1e69 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -3,15 +3,14 @@ package parser
import (
"bufio"
"bytes"
- "encoding/binary"
"errors"
"fmt"
"io"
- "log/slog"
"strconv"
"strings"
- "unicode/utf16"
- "unicode/utf8"
+
+ "golang.org/x/text/encoding/unicode"
+ "golang.org/x/text/transform"
)
type File struct {
@@ -74,26 +73,14 @@ func ParseFile(r io.Reader) (*File, error) {
var f File
- br := bufio.NewReader(r)
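+ // wrap the reader in a BOM-aware decoder: UTF-16 input (either endianness) is transcoded, otherwise UTF-8 is assumed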
+ tr := unicode.BOMOverride(unicode.UTF8.NewDecoder())
+ br := bufio.NewReader(transform.NewReader(r, tr))
- var sc scannerDecoder = utf8ScannerDecoder{}
- if bom, err := br.Peek(2); err != nil {
- slog.Warn("error reading byte-order mark", "error", err)
- } else if bytes.Equal(bom, []byte{0xFE, 0xFF}) {
- sc = utf16ScannerDecoder{binary.LittleEndian}
- //nolint:errcheck
- br.Discard(2)
- } else if bytes.Equal(bom, []byte{0xFF, 0xFE}) {
- sc = utf16ScannerDecoder{binary.BigEndian}
- //nolint:errcheck
- br.Discard(2)
- }
-
- scanner := bufio.NewScanner(br)
- scanner.Split(sc.ScanBytes)
- for scanner.Scan() {
- r, err := sc.DecodeRune(scanner.Bytes())
- if err != nil {
+ for {
+ r, _, err := br.ReadRune()
+ if errors.Is(err, io.EOF) {
+ break
+ } else if err != nil {
return nil, err
}
@@ -315,39 +302,3 @@ func isValidCommand(cmd string) bool {
return false
}
}
-
-type scannerDecoder interface {
- ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)
- DecodeRune([]byte) (rune, error)
-}
-
-type utf8ScannerDecoder struct{}
-
-func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
- return scanBytesN(data, 1, atEOF)
-}
-
-func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
- r, _ := utf8.DecodeRune(data)
- return r, nil
-}
-
-type utf16ScannerDecoder struct {
- binary.ByteOrder
-}
-
-func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
- return scanBytesN(data, 2, atEOF)
-}
-
-func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
- return utf16.Decode([]uint16{e.ByteOrder.Uint16(data)})[0], nil
-}
-
-func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
- if atEOF && len(data) == 0 {
- return 0, nil, nil
- }
-
- return n, data[:n], nil
-}
diff --git a/parser/parser_test.go b/parser/parser_test.go
index 55660590..7123e53b 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -11,6 +11,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
+ "golang.org/x/text/encoding"
+ "golang.org/x/text/encoding/unicode"
)
func TestParseFileFile(t *testing.T) {
@@ -517,14 +519,6 @@ PARAMETER param1 1
PARAMETER param2 4096
SYSTEM You are a utf16 file.
`
- // simulate a utf16 le file
- utf16File := utf16.Encode(append([]rune{'\ufffe'}, []rune(data)...))
- buf := new(bytes.Buffer)
- err := binary.Write(buf, binary.LittleEndian, utf16File)
- require.NoError(t, err)
-
- actual, err := ParseFile(buf)
- require.NoError(t, err)
expected := []Command{
{Name: "model", Args: "bob"},
@@ -533,14 +527,52 @@ SYSTEM You are a utf16 file.
{Name: "system", Args: "You are a utf16 file."},
}
- assert.Equal(t, expected, actual.Commands)
+ t.Run("le", func(t *testing.T) {
+ var b bytes.Buffer
+ require.NoError(t, binary.Write(&b, binary.LittleEndian, []byte{0xff, 0xfe}))
+ require.NoError(t, binary.Write(&b, binary.LittleEndian, utf16.Encode([]rune(data))))
- // simulate a utf16 be file
- buf = new(bytes.Buffer)
- err = binary.Write(buf, binary.BigEndian, utf16File)
- require.NoError(t, err)
+ actual, err := ParseFile(&b)
+ require.NoError(t, err)
- actual, err = ParseFile(buf)
- require.NoError(t, err)
- assert.Equal(t, expected, actual.Commands)
+ assert.Equal(t, expected, actual.Commands)
+ })
+
+ t.Run("be", func(t *testing.T) {
+ var b bytes.Buffer
+ require.NoError(t, binary.Write(&b, binary.BigEndian, []byte{0xfe, 0xff}))
+ require.NoError(t, binary.Write(&b, binary.BigEndian, utf16.Encode([]rune(data))))
+
+ actual, err := ParseFile(&b)
+ require.NoError(t, err)
+ assert.Equal(t, expected, actual.Commands)
+ })
+}
+
+func TestParseMultiByte(t *testing.T) {
+ input := `FROM test
+ SYSTEM 你好👋`
+
+ expect := []Command{
+ {Name: "model", Args: "test"},
+ {Name: "system", Args: "你好👋"},
+ }
+
+ encodings := []encoding.Encoding{
+ unicode.UTF8,
+ unicode.UTF16(unicode.LittleEndian, unicode.UseBOM),
+ unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
+ }
+
+ for _, encoding := range encodings {
+ t.Run(fmt.Sprintf("%s", encoding), func(t *testing.T) {
+ s, err := encoding.NewEncoder().String(input)
+ require.NoError(t, err)
+
+ actual, err := ParseFile(strings.NewReader(s))
+ require.NoError(t, err)
+
+ assert.Equal(t, expect, actual.Commands)
+ })
+ }
}
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 60de0307..b3991ce1 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
- md "${script:DEPS_DIR}" -ea 0 > $null
+ md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
- foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+ foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
diff --git a/scripts/install.sh b/scripts/install.sh
index a71d921d..2a06c350 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -159,8 +159,8 @@ check_gpu() {
esac ;;
lshw)
case $2 in
- nvidia) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
- amdgpu) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[1002\]' || return 1 ;;
+ nvidia) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
+ amdgpu) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[1002\]' || return 1 ;;
esac ;;
nvidia-smi) available nvidia-smi || return 1 ;;
esac
@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
- fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
+ fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
diff --git a/server/images.go b/server/images.go
index 5fd762ae..98794149 100644
--- a/server/images.go
+++ b/server/images.go
@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
- layers, err := parseFromFile(ctx, temp, "", fn)
+ layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil {
return err
}
- if len(layers) != 1 {
- return errors.New("quantization failed")
+ if _, err := temp.Seek(0, io.SeekStart); err != nil {
+ return err
}
- baseLayer.Layer = layers[0].Layer
- baseLayer.GGML = layers[0].GGML
+ ggml, _, err := llm.DecodeGGML(temp)
+ if err != nil {
+ return err
+ }
+
+ baseLayer.Layer = layer
+ baseLayer.GGML = ggml
}
}
@@ -960,7 +965,6 @@ var errUnauthorized = fmt.Errorf("unauthorized: access denied")
func getTokenSubject(token string) string {
parts := strings.Split(token, ".")
if len(parts) != 3 {
- slog.Error("jwt token does not contain 3 parts")
return ""
}
diff --git a/server/manifest_test.go b/server/manifest_test.go
index b85976fd..ceee31d8 100644
--- a/server/manifest_test.go
+++ b/server/manifest_test.go
@@ -7,6 +7,7 @@ import (
"slices"
"testing"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
@@ -107,6 +108,7 @@ func TestManifests(t *testing.T) {
t.Run(n, func(t *testing.T) {
d := t.TempDir()
t.Setenv("OLLAMA_MODELS", d)
+ envconfig.LoadConfig()
for _, p := range wants.ps {
createManifest(t, d, p)
diff --git a/server/modelpath.go b/server/modelpath.go
index 25a817ca..64f59c29 100644
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -8,6 +8,8 @@ import (
"path/filepath"
"regexp"
"strings"
+
+ "github.com/ollama/ollama/envconfig"
)
type ModelPath struct {
@@ -104,14 +106,7 @@ func (mp ModelPath) GetShortTagname() string {
// modelsDir returns the value of the OLLAMA_MODELS environment variable or the user's home directory if OLLAMA_MODELS is not set.
// The models directory is where Ollama stores its model files and manifests.
func modelsDir() (string, error) {
- if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
- return models, nil
- }
- home, err := os.UserHomeDir()
- if err != nil {
- return "", err
- }
- return filepath.Join(home, ".ollama", "models"), nil
+ return envconfig.ModelsDir, nil
}
// GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
diff --git a/server/modelpath_test.go b/server/modelpath_test.go
index 849e0fa7..6c4dfbee 100644
--- a/server/modelpath_test.go
+++ b/server/modelpath_test.go
@@ -7,6 +7,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
+
+ "github.com/ollama/ollama/envconfig"
)
func TestGetBlobsPath(t *testing.T) {
@@ -61,6 +63,7 @@ func TestGetBlobsPath(t *testing.T) {
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("OLLAMA_MODELS", dir)
+ envconfig.LoadConfig()
got, err := GetBlobsPath(tc.digest)
diff --git a/server/routes.go b/server/routes.go
index 188fe974..3d112e9f 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -646,9 +646,12 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
resp, err := GetModelInfo(req)
if err != nil {
- if os.IsNotExist(err) {
+ switch {
+ case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
- } else {
+ case err.Error() == "invalid model name":
+ c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+ default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
@@ -658,44 +661,55 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
}
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
- model, err := GetModel(req.Model)
+ m, err := GetModel(req.Model)
if err != nil {
return nil, err
}
modelDetails := api.ModelDetails{
- ParentModel: model.ParentModel,
- Format: model.Config.ModelFormat,
- Family: model.Config.ModelFamily,
- Families: model.Config.ModelFamilies,
- ParameterSize: model.Config.ModelType,
- QuantizationLevel: model.Config.FileType,
+ ParentModel: m.ParentModel,
+ Format: m.Config.ModelFormat,
+ Family: m.Config.ModelFamily,
+ Families: m.Config.ModelFamilies,
+ ParameterSize: m.Config.ModelType,
+ QuantizationLevel: m.Config.FileType,
}
if req.System != "" {
- model.System = req.System
+ m.System = req.System
}
if req.Template != "" {
- model.Template = req.Template
+ m.Template = req.Template
}
msgs := make([]api.Message, 0)
- for _, msg := range model.Messages {
+ for _, msg := range m.Messages {
msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
}
+ n := model.ParseName(req.Model)
+ if !n.IsValid() {
+ return nil, fmt.Errorf("invalid model name")
+ }
+
+ manifest, err := ParseNamedManifest(n)
+ if err != nil {
+ return nil, err
+ }
+
resp := &api.ShowResponse{
- License: strings.Join(model.License, "\n"),
- System: model.System,
- Template: model.Template,
- Details: modelDetails,
- Messages: msgs,
+ License: strings.Join(m.License, "\n"),
+ System: m.System,
+ Template: m.Template,
+ Details: modelDetails,
+ Messages: msgs,
+ ModifiedAt: manifest.fi.ModTime(),
}
var params []string
cs := 30
- for k, v := range model.Options {
+ for k, v := range m.Options {
switch val := v.(type) {
case []interface{}:
for _, nv := range val {
@@ -709,20 +723,55 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
for k, v := range req.Options {
if _, ok := req.Options[k]; ok {
- model.Options[k] = v
+ m.Options[k] = v
}
}
var sb strings.Builder
fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
- fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
- fmt.Fprint(&sb, model.String())
+ fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
+ fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
+ kvData, err := getKVData(m.ModelPath, req.Verbose)
+ if err != nil {
+ return nil, err
+ }
+ delete(kvData, "general.name")
+ delete(kvData, "tokenizer.chat_template")
+ resp.ModelInfo = kvData
+
+ if len(m.ProjectorPaths) > 0 {
+ projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
+ if err != nil {
+ return nil, err
+ }
+ resp.ProjectorInfo = projectorData
+ }
+
return resp, nil
}
+func getKVData(digest string, verbose bool) (llm.KV, error) {
+ kvData, err := llm.LoadModel(digest)
+ if err != nil {
+ return nil, err
+ }
+
+ kv := kvData.KV()
+
+ if !verbose {
+ for k := range kv {
+ if t, ok := kv[k].([]any); len(t) > 5 && ok {
+ kv[k] = []any{}
+ }
+ }
+ }
+
+ return kv, nil
+}
+
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
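
The non-verbose branch of getKVData empties any metadata array longer than five entries so the show response stays small. A standalone sketch of that trimming over a plain map, assuming llm.KV behaves like map[string]any:

```go
package main

import "fmt"

// trimLargeArrays mirrors the non-verbose branch of getKVData: any
// metadata value that is a slice with more than 5 entries is replaced
// with an empty slice so the /api/show payload stays small.
func trimLargeArrays(kv map[string]any) {
	for k, v := range kv {
		if t, ok := v.([]any); ok && len(t) > 5 {
			kv[k] = []any{}
		}
	}
}

func main() {
	kv := map[string]any{
		"general.architecture":  "llama",
		"tokenizer.ggml.tokens": []any{"a", "b", "c", "d", "e", "f"},
		"tokenizer.ggml.scores": []any{1, 2},
	}
	trimLargeArrays(kv)
	fmt.Println(kv["tokenizer.ggml.tokens"]) // []
	fmt.Println(kv["tokenizer.ggml.scores"]) // [1 2]
}
```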
diff --git a/server/routes_create_test.go b/server/routes_create_test.go
index a61a618f..34061282 100644
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -15,6 +15,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
)
@@ -86,6 +87,7 @@ func checkFileExists(t *testing.T, p string, expect []string) {
func TestCreateFromBin(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -111,6 +113,7 @@ func TestCreateFromBin(t *testing.T) {
func TestCreateFromModel(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -151,6 +154,7 @@ func TestCreateFromModel(t *testing.T) {
func TestCreateRemovesLayers(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -197,6 +201,7 @@ func TestCreateRemovesLayers(t *testing.T) {
func TestCreateUnsetsSystem(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -252,6 +257,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
func TestCreateMergeParameters(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -354,6 +360,7 @@ func TestCreateMergeParameters(t *testing.T) {
func TestCreateReplacesMessages(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -429,6 +436,7 @@ func TestCreateReplacesMessages(t *testing.T) {
func TestCreateTemplateSystem(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -474,6 +482,7 @@ func TestCreateTemplateSystem(t *testing.T) {
func TestCreateLicenses(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -519,6 +528,7 @@ func TestCreateLicenses(t *testing.T) {
func TestCreateDetectTemplate(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
var s Server
t.Run("matched", func(t *testing.T) {
diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go
index 0f003574..00303bd1 100644
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@@ -9,12 +9,15 @@ import (
"testing"
"github.com/ollama/ollama/api"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
func TestDelete(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
+ envconfig.LoadConfig()
+
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
diff --git a/server/routes_list_test.go b/server/routes_list_test.go
index 97bf8b8f..d04be9d6 100644
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@@ -8,10 +8,12 @@ import (
"testing"
"github.com/ollama/ollama/api"
+ "github.com/ollama/ollama/envconfig"
)
func TestList(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
+ envconfig.LoadConfig()
expectNames := []string{
"mistral:7b-instruct-q4_0",
diff --git a/server/routes_test.go b/server/routes_test.go
index 4e9cfc2a..5a5c0fbb 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -18,6 +18,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
+ "github.com/ollama/ollama/envconfig"
+ "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -211,11 +213,13 @@ func Test_Routes(t *testing.T) {
"top_p 0.9",
}
assert.Equal(t, expectedParams, params)
+ assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
t.Setenv("OLLAMA_MODELS", t.TempDir())
+ envconfig.LoadConfig()
s := &Server{}
router := s.GenerateRoutes()
@@ -246,6 +250,7 @@ func Test_Routes(t *testing.T) {
func TestCase(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
+ envconfig.LoadConfig()
cases := []string{
"mistral",
@@ -322,3 +327,40 @@ func TestCase(t *testing.T) {
})
}
}
+
+func TestShow(t *testing.T) {
+ t.Setenv("OLLAMA_MODELS", t.TempDir())
+ envconfig.LoadConfig()
+
+ var s Server
+
+ createRequest(t, s.CreateModelHandler, api.CreateRequest{
+ Name: "show-model",
+ Modelfile: fmt.Sprintf(
+ "FROM %s\nFROM %s",
+ createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
+ createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
+ ),
+ })
+
+ w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
+ Name: "show-model",
+ })
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected status code 200, actual %d", w.Code)
+ }
+
+ var resp api.ShowResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatal(err)
+ }
+
+ if resp.ModelInfo["general.architecture"] != "test" {
+ t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
+ }
+
+ if resp.ProjectorInfo["general.architecture"] != "clip" {
+ t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
+ }
+}
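
With ModelInfo and ProjectorInfo now populated, a client can read the GGML metadata straight from the show response. A sketch, assuming a local server and that the Go api client exposes a Show method matching this endpoint:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	// Assumes a running Ollama server reachable via the usual env settings.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	resp, err := client.Show(context.Background(), &api.ShowRequest{Model: "llama3"})
	if err != nil {
		log.Fatal(err)
	}

	// ModelInfo carries the GGML key/value metadata added in this change,
	// e.g. general.architecture and general.parameter_count.
	fmt.Println(resp.ModelInfo["general.architecture"])
	if resp.ProjectorInfo != nil {
		fmt.Println(resp.ProjectorInfo["general.architecture"])
	}
}
```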
diff --git a/server/sched.go b/server/sched.go
index c36486f7..42439554 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -7,7 +7,6 @@ import (
"log/slog"
"reflect"
"runtime"
- "slices"
"sort"
"strings"
"sync"
@@ -27,6 +26,7 @@ type LlmRequest struct {
sessionDuration time.Duration
successCh chan *runnerRef
errCh chan error
+ schedAttempts uint
}
type Scheduler struct {
@@ -38,9 +38,11 @@ type Scheduler struct {
loaded map[string]*runnerRef
loadedMu sync.Mutex
- loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
- newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
- getGpuFn func() gpu.GpuInfoList
+ loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
+ newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
+ getGpuFn func() gpu.GpuInfoList
+ getCpuFn func() gpu.GpuInfoList
+ reschedDelay time.Duration
}
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -54,6 +56,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
+ getCpuFn: gpu.GetCPUInfo,
+ reschedDelay: 250 * time.Millisecond,
}
sched.loadFn = sched.load
return sched
@@ -105,6 +109,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
return
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
+ pending.schedAttempts++
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
@@ -131,7 +136,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
} else {
// Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list
- gpus := s.getGpuFn()
+ var gpus gpu.GpuInfoList
+ if pending.opts.NumGPU == 0 {
+ gpus = s.getCpuFn()
+ } else {
+ gpus = s.getGpuFn()
+ }
// Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -140,16 +150,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
- // If we're CPU only mode, just limit by envconfig.MaxRunners above
- // TODO handle system memory exhaustion
- if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
- slog.Debug("cpu mode with existing models, loading")
- s.loadFn(pending, ggml, gpus)
- break
- }
-
- // No models loaded. Load the model but prefer the best fit.
- if loadedCount == 0 {
+ // Evaluate if the model will fit in the available system memory, or if we should unload a model first
+ if len(gpus) == 1 && gpus[0].Library == "cpu" {
+ if loadedCount == 0 {
+ slog.Debug("cpu mode with first model, loading")
+ s.loadFn(pending, ggml, gpus)
+ break
+ }
+ runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+ if runnerToExpire == nil {
+ slog.Debug("cpu mode with available system memory or first model, loading")
+ s.loadFn(pending, ggml, gpus)
+ break
+ }
+ // else we need to expire a runner
+ } else if loadedCount == 0 {
+ // No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus)
if g != nil {
@@ -159,16 +175,44 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
- // More than one loaded model, so we have to see if the new one fits
- // Update free memory from currently loaded models
- s.updateFreeSpace(gpus)
- gpus = pickBestFitGPUs(pending, ggml, gpus)
- if gpus != nil {
- slog.Debug("new model fits with existing models, loading")
- s.loadFn(pending, ggml, gpus)
- break
+ if runnerToExpire == nil {
+ // More than one loaded model, so we have to see if the
+ // new one fits
+ //
+ // We want to avoid loading on any GPUs that have other
+ // models still loading on them to avoid potential races
+ // with VRAM consumption ramping up during load
+ availGpus := s.filterGPUsWithoutLoadingModels(gpus)
+
+ // Update free memory from currently loaded models
+ s.updateFreeSpace(availGpus)
+ fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
+ if fitGpus != nil {
+ slog.Debug("new model fits with existing models, loading")
+ s.loadFn(pending, ggml, fitGpus)
+ break
+ }
+
+ // We couldn't find a set of GPUs to fully load the new
+ // model. If no other models are loading (both GPU lists
+ // are the same) then we need to unload another model to
+ // make room
+ if len(availGpus) < len(gpus) {
+ // There are other requests pending, and this one
+ // needs more time, so put it on the back of the
+ // queue so that we might satisfy other pending
+ // requests that aren't blocked
+ go func() {
+ // Process in a goroutine to avoid deadlocking
+ // the scheduler if our queue is full
+ slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
+ time.Sleep(s.reschedDelay)
+ s.pendingReqCh <- pending
+ }()
+ break
+ }
+ runnerToExpire = s.findRunnerToUnload()
}
- runnerToExpire = s.findRunnerToUnload()
}
if runnerToExpire == nil {
@@ -368,17 +412,9 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
s.loadedMu.Lock()
for _, r := range s.loaded {
r.refMu.Lock()
- gpuIDs := make([]string, 0, len(r.gpus))
if r.llama != nil {
- // TODO this should be broken down by GPU instead of assuming uniform spread
- estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
- for _, gpu := range r.gpus {
- gpuIDs = append(gpuIDs, gpu.ID)
- }
for _, gpu := range allGpus {
- if slices.Contains(gpuIDs, gpu.ID) {
- predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
- }
+ predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
}
} else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -401,11 +437,36 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
// after we start our first runner, then we'll never account for that, so picking the smallest free value seems prudent.
allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
}
- slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
+ slog.Info("updated VRAM based on existing loaded models", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
}
}
}
+// While models are loading, their VRAM consumption is indeterminate, so we have to avoid
+// scheduling another model on the same GPU(s) until those numbers have stabilized.
+// This routine returns the set of GPUs that do not have an active loading model.
+// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
+ ret := append(gpu.GpuInfoList{}, allGpus...)
+ s.loadedMu.Lock()
+ defer s.loadedMu.Unlock()
+ for _, runner := range s.loaded {
+ if runner.loading {
+ slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
+ for _, busyGPU := range runner.gpus {
+ for i := range ret {
+ if ret[i].ID == busyGPU.ID {
+ ret = append(ret[:i], ret[i+1:]...)
+ break
+ }
+ }
+ }
+ }
+ }
+ return ret
+}
+
+// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
@@ -487,8 +548,11 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
finished := make(chan interface{}, 1)
- // CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
- if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
+ // CPU or Metal don't need checking, so no waiting required
+ // Windows can page VRAM; only CUDA currently reports accurate used VRAM
+ if len(runner.gpus) == 0 ||
+ (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+ (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
finished <- struct{}{}
return finished
}
@@ -508,7 +572,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
for {
<-ticker.C
if time.Now().After(expiresAt) {
- slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+ slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath)
finished <- struct{}{}
}
@@ -521,7 +585,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
- slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+ slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath)
finished <- struct{}{}
return
}
@@ -558,10 +622,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
// First attempt to fit the model into a single GPU
- for _, g := range sgl {
- if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
- slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
- return []gpu.GpuInfo{g}
+ if !envconfig.SchedSpread {
+ for _, g := range sgl {
+ if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+ slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+ return []gpu.GpuInfo{g}
+ }
}
}
@@ -586,6 +652,10 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
runnerList = append(runnerList, r)
}
s.loadedMu.Unlock()
+ if len(runnerList) == 0 {
+ slog.Debug("no loaded runner to unload")
+ return nil
+ }
// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
// e.g., if we have multiple options, will one make room for the request?
@@ -616,3 +686,18 @@ func (s *Scheduler) unloadAllRunners() {
}
}
}
+
+// If other runners are loaded, make sure the pending request will fit in system memory.
+// If it won't, pick a runner to unload; otherwise return nil and the request can be loaded.
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+ slog.Debug("evaluating if CPU model load will fit in available system memory")
+ estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+ if estimate.TotalSize <= gpus[0].FreeMemory {
+ slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
+ return nil
+ }
+
+ // TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+
+ return s.findRunnerToUnload()
+}
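
When a pending request cannot be placed because other models are still loading, the scheduler now requeues it from a goroutine after reschedDelay rather than blocking. A minimal sketch of that back-off pattern, with a hypothetical request type standing in for LlmRequest:

```go
package main

import (
	"fmt"
	"time"
)

type request struct {
	model    string
	attempts uint
}

func main() {
	pending := make(chan *request, 1) // small buffer, like the scheduler's queue

	req := &request{model: "example"}
	req.attempts++ // mirrors pending.schedAttempts++ at the top of the loop

	// Requeue from a goroutine after a short delay so a full channel
	// cannot deadlock the scheduling loop while other loads finish.
	reschedDelay := 250 * time.Millisecond
	go func() {
		time.Sleep(reschedDelay)
		pending <- req
	}()

	got := <-pending
	fmt.Println(got.model, got.attempts)
}
```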
diff --git a/server/sched_test.go b/server/sched_test.go
index f7dce6d1..95328834 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible")
- server := &mockLlm{estimatedVRAM: 10}
+ server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
return server, nil
}
@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
})
require.NoError(t, err)
@@ -145,17 +146,17 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
- scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
+ scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return scenario
}
func TestRequests(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), time.Second)
+ ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
defer done()
// Same model, same request
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
- scenario1a.req.sessionDuration = 0
+ scenario1a.req.sessionDuration = 5 * time.Millisecond
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
scenario1b.req.model = scenario1a.req.model
scenario1b.ggml = scenario1a.ggml
@@ -166,6 +167,7 @@ func TestRequests(t *testing.T) {
tmpModel := *scenario1a.req.model
scenario2a.req.model = &tmpModel
scenario2a.ggml = scenario1a.ggml
+ scenario2a.req.sessionDuration = 5 * time.Millisecond
// Multiple loaded models
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -181,6 +183,12 @@ func TestRequests(t *testing.T) {
g.FreeMemory = 12 * format.GigaByte
return []gpu.GpuInfo{g}
}
+ s.getCpuFn = func() gpu.GpuInfoList {
+ g := gpu.GpuInfo{Library: "cpu"}
+ g.TotalMemory = 32 * format.GigaByte
+ g.FreeMemory = 26 * format.GigaByte
+ return []gpu.GpuInfo{g}
+ }
s.newServerFn = scenario1a.newServer
slog.Info("scenario1a")
s.pendingReqCh <- scenario1a.req
@@ -309,7 +317,6 @@ func TestGetRunner(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
- // Same model, same request
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
scenario1a.req.sessionDuration = 0
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -419,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: 2,
}
finished := make(chan *LlmRequest)
- llm1 := &mockLlm{}
+ llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1}
req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount)
@@ -452,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900
- llm1 := &mockLlm{estimatedVRAM: 100}
- llm2 := &mockLlm{estimatedVRAM: 200}
+ llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+ llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus}
r2 := &runnerRef{llama: llm2, gpus: gpus}
@@ -464,8 +471,42 @@ func TestUpdateFreeSpace(t *testing.T) {
s.loadedMu.Unlock()
s.updateFreeSpace(gpus)
- require.Equal(t, uint64(850), gpus[0].FreeMemory)
- require.Equal(t, uint64(1850), gpus[1].FreeMemory)
+ require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
+ require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
+}
+
+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ defer done()
+ gpus := gpu.GpuInfoList{
+ {
+ Library: "cuda",
+ ID: "0",
+ },
+ {
+ Library: "cuda",
+ ID: "1",
+ },
+ }
+ r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+
+ s := InitScheduler(ctx)
+ s.loadedMu.Lock()
+ s.loaded["a"] = r1
+ s.loadedMu.Unlock()
+
+ tmp := s.filterGPUsWithoutLoadingModels(gpus)
+ require.Len(t, tmp, 1)
+ require.Equal(t, "1", tmp[0].ID)
+
+ r1.gpus = gpu.GpuInfoList{gpus[1]}
+ tmp = s.filterGPUsWithoutLoadingModels(gpus)
+ require.Len(t, tmp, 1)
+ require.Equal(t, "0", tmp[0].ID)
+
+ r1.gpus = gpu.GpuInfoList{}
+ tmp = s.filterGPUsWithoutLoadingModels(gpus)
+ require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) {
@@ -492,7 +533,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
- llm := &mockLlm{}
+ llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
do := api.DefaultOptions()
runner := &runnerRef{
model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -535,8 +576,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
- llm1 := &mockLlm{}
- llm2 := &mockLlm{}
+ llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+ llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
s := InitScheduler(ctx)
s.unloadAllRunners()
@@ -554,7 +595,7 @@ func TestUnloadAllRunners(t *testing.T) {
}
func TestUnload(t *testing.T) {
- llm1 := &mockLlm{}
+ llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
r1.unload()
@@ -564,19 +605,20 @@ func TestUnload(t *testing.T) {
}
type mockLlm struct {
- pingResp error
- waitResp error
- completionResp error
- embeddingResp []float64
- embeddingRespErr error
- tokenizeResp []int
- tokenizeRespErr error
- detokenizeResp string
- detonekizeRespErr error
- closeResp error
- closeCalled bool
- estimatedVRAM uint64
- estimatedTotal uint64
+ pingResp error
+ waitResp error
+ completionResp error
+ embeddingResp []float64
+ embeddingRespErr error
+ tokenizeResp []int
+ tokenizeRespErr error
+ detokenizeResp string
+ detonekizeRespErr error
+ closeResp error
+ closeCalled bool
+ estimatedVRAM uint64
+ estimatedTotal uint64
+ estimatedVRAMByGPU map[string]uint64
}
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -597,5 +639,6 @@ func (s *mockLlm) Close() error {
s.closeCalled = true
return s.closeResp
}
-func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
-func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
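
TestUpdateFreeSpace above reflects the switch from an even VRAM split to per-GPU estimates via EstimatedVRAMByGPU. A standalone sketch of the new accounting, using stand-in types and the same numbers as the test:

```go
package main

import "fmt"

// gpuInfo is a stripped-down stand-in for gpu.GpuInfo.
type gpuInfo struct {
	ID          string
	TotalMemory uint64
	FreeMemory  uint64
}

// runner stands in for runnerRef: it reports how much VRAM it holds on
// a specific GPU, mirroring llm.LlamaServer.EstimatedVRAMByGPU.
type runner struct {
	vramByGPU map[string]uint64
}

func (r *runner) estimatedVRAMByGPU(id string) uint64 { return r.vramByGPU[id] }

// updateFreeSpace mirrors the new accounting: sum each runner's per-GPU
// estimate, then derive free memory from the GPU's total memory.
func updateFreeSpace(gpus []gpuInfo, runners []*runner) {
	pred := map[string]uint64{}
	for _, r := range runners {
		for _, g := range gpus {
			pred[g.ID] += r.estimatedVRAMByGPU(g.ID)
		}
	}
	for i := range gpus {
		if p, ok := pred[gpus[i].ID]; ok && p <= gpus[i].TotalMemory {
			gpus[i].FreeMemory = gpus[i].TotalMemory - p
		}
	}
}

func main() {
	gpus := []gpuInfo{
		{ID: "1", TotalMemory: 1000, FreeMemory: 900},
		{ID: "2", TotalMemory: 2000, FreeMemory: 1900},
	}
	runners := []*runner{
		{vramByGPU: map[string]uint64{"1": 50, "2": 50}},
		{vramByGPU: map[string]uint64{"1": 125, "2": 75}},
	}
	updateFreeSpace(gpus, runners)
	fmt.Println(gpus[0].FreeMemory, gpus[1].FreeMemory) // 825 1875
}
```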
diff --git a/types/model/name.go b/types/model/name.go
index d85fd0c6..e645a844 100644
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -4,7 +4,6 @@ package model
import (
"cmp"
- "encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
-
-type DigestType byte
-
-const (
- DigestTypeInvalid DigestType = iota
- DigestTypeSHA256
-)
-
-func (t DigestType) String() string {
- switch t {
- case DigestTypeSHA256:
- return "sha256"
- default:
- return "invalid"
- }
-}
-
-type Digest struct {
- Type DigestType
- Sum [32]byte
-}
-
-func ParseDigest(s string) (Digest, error) {
- i := strings.IndexAny(s, "-:")
- if i < 0 {
- return Digest{}, fmt.Errorf("invalid digest %q", s)
- }
- typ, encSum := s[:i], s[i+1:]
- if typ != "sha256" {
- return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
- }
- d := Digest{
- Type: DigestTypeSHA256,
- }
- n, err := hex.Decode(d.Sum[:], []byte(encSum))
- if err != nil {
- return Digest{}, err
- }
- if n != 32 {
- return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
- }
- return d, nil
-}
-
-func (d Digest) String() string {
- if d.Type == DigestTypeInvalid {
- return ""
- }
- return fmt.Sprintf("sha256-%x", d.Sum)
-}
-
-func (d Digest) IsValid() bool {
- return d.Type != DigestTypeInvalid
-}
diff --git a/types/model/name_test.go b/types/model/name_test.go
index 66ce4c33..008dd586 100644
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
}
}
-const (
- validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
- validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
-)
-
-func TestParseDigest(t *testing.T) {
- cases := []struct {
- in string
- want string
- }{
- {"", ""}, // empty
- {"sha123-12", ""}, // invalid type
- {"sha256-", ""}, // invalid sum
- {"sha256-123", ""}, // invalid odd length sum
-
- {validSha256, validSha256},
- {validSha256Old, validSha256},
- }
- for _, tt := range cases {
- t.Run(tt.in, func(t *testing.T) {
- got, err := ParseDigest(tt.in)
- if err != nil {
- if tt.want != "" {
- t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
- }
- return
- }
- if got.String() != tt.want {
- t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
- }
- })
- }
-}
-
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},