mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-23 15:08:27 +00:00
merge upstream for 4.0 update
This commit is contained in:
@@ -37,19 +37,6 @@ func GetSupportedGFX(libDir string) ([]string, error) {
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
if info.Library != "rocm" {
|
||||
// TODO shouldn't happen if things are wired correctly...
|
||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, info.ID)
|
||||
}
|
||||
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
}
|
||||
|
||||
func commonAMDValidateLibDir() (string, error) {
|
||||
// Favor our bundled version
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) {
|
||||
return hl, nil
|
||||
}
|
||||
|
||||
// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
|
||||
// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
|
||||
// so we have to unload/reset the library after we do our initial discovery
|
||||
// to make sure our updates to that variable are processed by llama.cpp
|
||||
func (hl *HipLib) Release() {
|
||||
|
||||
@@ -64,16 +64,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
|
||||
var visibleDevices []string
|
||||
hipVD := envconfig.HipVisibleDevices() // zero based index only
|
||||
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
|
||||
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
|
||||
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
|
||||
switch {
|
||||
// TODO is this priorty order right?
|
||||
case hipVD != "":
|
||||
visibleDevices = strings.Split(hipVD, ",")
|
||||
case rocrVD != "":
|
||||
visibleDevices = strings.Split(rocrVD, ",")
|
||||
// TODO - since we don't yet support UUIDs, consider detecting and reporting here
|
||||
// all our test systems show GPU-XX indicating UUID is not supported
|
||||
case hipVD != "":
|
||||
visibleDevices = strings.Split(hipVD, ",")
|
||||
case gpuDO != "":
|
||||
visibleDevices = strings.Split(gpuDO, ",")
|
||||
}
|
||||
@@ -99,7 +96,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
}
|
||||
return a < b
|
||||
})
|
||||
cpuCount := 0
|
||||
gpuCount := 0
|
||||
for _, match := range matches {
|
||||
slog.Debug("evaluating amdgpu node " + match)
|
||||
fp, err := os.Open(match)
|
||||
@@ -108,11 +105,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
|
||||
if err != nil {
|
||||
slog.Debug("failed to parse node ID", "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(fp)
|
||||
isCPU := false
|
||||
@@ -186,20 +178,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
// do reliably report VRAM usage.
|
||||
|
||||
if isCPU {
|
||||
cpuCount++
|
||||
continue
|
||||
}
|
||||
|
||||
// CPUs are always first in the list
|
||||
gpuID := nodeID - cpuCount
|
||||
|
||||
// Shouldn't happen, but just in case...
|
||||
if gpuID < 0 {
|
||||
err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
||||
slog.Error(err.Error())
|
||||
return nil, err
|
||||
// Skip over any GPUs that are masked
|
||||
if major == 0 && minor == 0 && patch == 0 {
|
||||
slog.Debug("skipping gpu with gfx000")
|
||||
continue
|
||||
}
|
||||
|
||||
// Keep track of numeric IDs based on valid GPUs
|
||||
gpuID := gpuCount
|
||||
gpuCount += 1
|
||||
|
||||
// Look up the memory for the current node
|
||||
totalMemory := uint64(0)
|
||||
usedMemory := uint64(0)
|
||||
@@ -273,6 +264,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
name = fmt.Sprintf("%04x:%04x", vendor, device)
|
||||
}
|
||||
|
||||
// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
|
||||
var ID string
|
||||
if uniqueID != 0 {
|
||||
ID = fmt.Sprintf("GPU-%016x", uniqueID)
|
||||
} else {
|
||||
ID = strconv.Itoa(gpuID)
|
||||
}
|
||||
|
||||
gpuInfo := RocmGPUInfo{
|
||||
GpuInfo: GpuInfo{
|
||||
Library: "rocm",
|
||||
@@ -280,7 +279,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
TotalMemory: totalMemory,
|
||||
FreeMemory: (totalMemory - usedMemory),
|
||||
},
|
||||
ID: strconv.Itoa(gpuID),
|
||||
ID: ID,
|
||||
Name: name,
|
||||
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
|
||||
MinimumMemory: rocmMinimumMemory,
|
||||
@@ -288,6 +287,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
DriverMinor: driverMinor,
|
||||
},
|
||||
usedFilepath: usedFile,
|
||||
index: gpuID,
|
||||
}
|
||||
|
||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||
@@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
if len(visibleDevices) > 0 {
|
||||
include := false
|
||||
for _, visible := range visibleDevices {
|
||||
if visible == gpuInfo.ID {
|
||||
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
|
||||
include = true
|
||||
break
|
||||
}
|
||||
@@ -516,3 +516,20 @@ func verifyKFDDriverAccess() error {
|
||||
fd.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
if info.Library != "rocm" {
|
||||
// TODO shouldn't happen if things are wired correctly...
|
||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, info.ID)
|
||||
}
|
||||
// There are 3 potential env vars to use to select GPUs.
|
||||
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
|
||||
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
||||
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
||||
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||
slog.Debug("error looking up amd driver version", "error", err)
|
||||
}
|
||||
|
||||
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
|
||||
// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
|
||||
count := hl.HipGetDeviceCount()
|
||||
if count == 0 {
|
||||
err := fmt.Errorf("no compatible amdgpu devices detected")
|
||||
@@ -201,3 +201,20 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
if info.Library != "rocm" {
|
||||
// TODO shouldn't happen if things are wired correctly...
|
||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, info.ID)
|
||||
}
|
||||
// There are 3 potential env vars to use to select GPUs.
|
||||
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
|
||||
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
||||
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
||||
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
}
|
||||
|
||||
@@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
|
||||
// query the management library as well so we can record any skew between the two
|
||||
// which represents overhead on the GPU we must set aside on subsequent updates
|
||||
if cHandles.nvml != nil {
|
||||
C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
|
||||
uuid := C.CString(gpuInfo.ID)
|
||||
defer C.free(unsafe.Pointer(uuid))
|
||||
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
||||
if memInfo.err != nil {
|
||||
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
@@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
|
||||
}
|
||||
for i, gpu := range cudaGPUs {
|
||||
if cHandles.nvml != nil {
|
||||
C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
|
||||
uuid := C.CString(gpu.ID)
|
||||
defer C.free(unsafe.Pointer(uuid))
|
||||
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
||||
} else if cHandles.cudart != nil {
|
||||
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
|
||||
} else if cHandles.nvcuda != nil {
|
||||
|
||||
@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||
} l[] = {
|
||||
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
|
||||
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
|
||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
|
||||
{"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
|
||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
|
||||
{NULL, NULL},
|
||||
};
|
||||
@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||
}
|
||||
|
||||
|
||||
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
|
||||
void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
|
||||
nvmlDevice_t device;
|
||||
nvmlMemory_t memInfo = {0};
|
||||
nvmlReturn_t ret;
|
||||
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
|
||||
ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(1, "unable to get device handle %d: %d", device_id, ret);
|
||||
LOG(1, "unable to get device handle %s: %d", uuid, ret);
|
||||
*free = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
|
||||
LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
|
||||
*free = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ typedef struct nvml_handle {
|
||||
uint16_t verbose;
|
||||
nvmlReturn_t (*nvmlInit_v2)(void);
|
||||
nvmlReturn_t (*nvmlShutdown)(void);
|
||||
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
|
||||
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
|
||||
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||
} nvml_handle_t;
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
|
||||
} nvml_compute_capability_t;
|
||||
|
||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
||||
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
|
||||
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
|
||||
void nvml_release(nvml_handle_t ch);
|
||||
|
||||
#endif // __GPU_INFO_NVML_H__
|
||||
|
||||
@@ -3,9 +3,11 @@ package discover
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
@@ -109,6 +111,10 @@ func GetCPUDetails() ([]CPU, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return linuxCPUDetails(file)
|
||||
}
|
||||
|
||||
func linuxCPUDetails(file io.Reader) ([]CPU, error) {
|
||||
reColumns := regexp.MustCompile("\t+: ")
|
||||
scanner := bufio.NewScanner(file)
|
||||
cpuInfos := []linuxCpuInfo{}
|
||||
@@ -131,6 +137,9 @@ func GetCPUDetails() ([]CPU, error) {
|
||||
cpu = &linuxCpuInfo{}
|
||||
}
|
||||
}
|
||||
if cpu.ID != "" {
|
||||
cpuInfos = append(cpuInfos, *cpu)
|
||||
}
|
||||
|
||||
// Process the sockets/cores/threads
|
||||
socketByID := map[string]*CPU{}
|
||||
@@ -177,10 +186,14 @@ func GetCPUDetails() ([]CPU, error) {
|
||||
s.EfficiencyCoreCount = efficiencyCoreCount
|
||||
}
|
||||
}
|
||||
|
||||
result := []CPU{}
|
||||
for _, c := range socketByID {
|
||||
result = append(result, *c)
|
||||
keys := make([]string, 0, len(socketByID))
|
||||
result := make([]CPU, 0, len(socketByID))
|
||||
for k := range socketByID {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, k := range keys {
|
||||
result = append(result, *socketByID[k])
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
2097
discover/gpu_linux_test.go
Normal file
2097
discover/gpu_linux_test.go
Normal file
File diff suppressed because it is too large
Load Diff
@@ -175,6 +175,11 @@ func (si SystemInfo) GetOptimalThreadCount() int {
|
||||
if len(si.System.CPUs) == 0 {
|
||||
return 0
|
||||
}
|
||||
// Allocate thread count matching the performance cores on a single socket
|
||||
return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
|
||||
|
||||
coreCount := 0
|
||||
for _, c := range si.System.CPUs {
|
||||
coreCount += c.CoreCount - c.EfficiencyCoreCount
|
||||
}
|
||||
|
||||
return coreCount
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user