Fix vulkan PCI ID and ID handling (#12775)

* Fix vulkan PCI ID and ID handling

Intel GPUs may not report PCI IDs which was leading to incorrect overlap
detection.  Switch to using the existing PCI IDs, however AMD GPUs claim not to
report PCI IDs, but actually do, so try anyway, as this is required for ADLX to
find the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also
switches Vulkan to use UUID based IDs. The GPU discovery patches have been
squashed into a single patch to simplify future rebases.

* review comments
This commit is contained in:
Daniel Hiltgen
2025-10-28 15:15:35 -07:00
committed by GitHub
parent 29f63f37c8
commit 14977a9350
15 changed files with 418 additions and 447 deletions

View File

@@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
var wg sync.WaitGroup
@@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
if devices[i].Library == "Metal" {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
@@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
envVar: id, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
"id", devices[i].ID,
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
"pci_id", devices[i].PCIID,
"library", devices[i].Library,
)
needsDelete[i] = true
} else {
supportedMu.Lock()
@@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
}(i)
}
wg.Wait()
logutil.Trace("supported GPU library combinations", "supported", supported)
logutil.Trace("supported GPU library combinations before filtering", "supported", supported)
filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
@@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
}
if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
needsDelete[i] = true
slog.Debug("dropping Vulkan duplicate by PCI ID",
"vulkan_id", devices[i].ID,
"vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
slog.Debug("filtering device with duplicate PCI ID",
"id", devices[i].ID,
"library", devices[i].Library,
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
"pci_id", devices[i].PCIID,
"kept_library", devices[j].Library,
"kept_id", devices[j].ID,
"kept_library", devices[j].Library,
)
break
}
@@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need
}
for dev, i := range byLibDirs[libDir] {
if _, found := byLibDirs[newest][dev]; found {
slog.Debug("filtering device with overlapping libraries",
"id", dev,
"library", libDir,
"delete_index", i,
"kept_library", newest,
)
needsDelete[i] = true
}
}

View File

@@ -3,6 +3,7 @@ package discover
import (
"log/slog"
"path/filepath"
"sort"
"strings"
"github.com/ollama/ollama/format"
@@ -26,6 +27,7 @@ type CPU struct {
}
func LogDetails(devices []ml.DeviceInfo) {
sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference
for _, dev := range devices {
var libs []string
for _, dir := range dev.LibraryPath {
@@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) {
}
slog.Info("inference compute",
"id", dev.ID,
"filtered_id", dev.FilteredID,
"library", dev.Library,
"compute", dev.Compute(),
"name", dev.Name,