llamarunner: Respect device ordering for offloaded layers

We used to control which devices llama.cpp saw, and in what order,
using CUDA_VISIBLE_DEVICES or similar environment variables. This
ensured that the layers offloaded to a device were actually the
ones intended, which matters because we may reorder devices based
on free memory or performance.

When we started explicitly scheduling layers, that logic went away
and the llamarunner had no way to set the device order. As a result,
the correct number of layers would be assigned to each device, but
not necessarily the layers that were expected. This change sets up
the devices in the right order based on the offload information.
Author: Jesse Gross
Date:   2025-11-10 14:49:46 -08:00
Commit: 4372d0bfef (parent: 31361c4d3c)

3 changed files with 69 additions and 11 deletions
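
To make the mechanics concrete, here is a minimal sketch (not part of
this commit) of how a caller could translate a preferred ml.DeviceID
ordering into the llama.cpp enumeration indices that the new
ModelParams.Devices field carries in the diff below. deviceOrder and
preferred are hypothetical names, and the import paths assume the
ollama module layout:

	import (
		"github.com/ollama/ollama/llama"
		"github.com/ollama/ollama/ml"
	)

	// deviceOrder maps the scheduler's preferred device ordering onto the
	// indices that llama.cpp assigned during backend enumeration.
	func deviceOrder(preferred []ml.DeviceID) []uint64 {
		gpus := llama.EnumerateGPUs()
		var order []uint64
		for _, want := range preferred {
			for _, gpu := range gpus {
				// ml.DeviceID is a comparable struct (ID and Library
				// strings), so direct equality works here.
				if gpu.DeviceID == want {
					order = append(order, gpu.LlamaID)
					break
				}
			}
		}
		return order
	}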


@@ -63,8 +63,13 @@ func BackendInit() {
 	C.llama_backend_init()
 }
 
-func EnumerateGPUs() []ml.DeviceID {
-	var ids []ml.DeviceID
+type Devices struct {
+	ml.DeviceID
+	LlamaID uint64
+}
+
+func EnumerateGPUs() []Devices {
+	var ids []Devices
 
 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
@@ -74,9 +79,12 @@ func EnumerateGPUs() []ml.DeviceID {
 		C.GGML_BACKEND_DEVICE_TYPE_IGPU:
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, ml.DeviceID{
-				ID:      C.GoString(props.id),
-				Library: C.GoString(props.library),
+			ids = append(ids, Devices{
+				DeviceID: ml.DeviceID{
+					ID:      C.GoString(props.id),
+					Library: C.GoString(props.library),
+				},
+				LlamaID: uint64(i),
 			})
 		}
 	}
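
Since Devices embeds ml.DeviceID, its fields are promoted, so existing
callers of EnumerateGPUs keep working unchanged. A small sketch
(hypothetical, reusing the imports above plus fmt):

	for _, gpu := range llama.EnumerateGPUs() {
		// ID and Library are promoted from the embedded ml.DeviceID;
		// LlamaID is the new llama.cpp enumeration index.
		fmt.Printf("%s/%s -> llama.cpp device %d\n", gpu.Library, gpu.ID, gpu.LlamaID)
	}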
@@ -231,6 +239,7 @@ func (c *Context) GetLogitsIth(i int) []float32 {
 }
 
 type ModelParams struct {
+	Devices      []uint64
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
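
A hedged sketch of a call site using the new field (the values and the
deviceOrder helper from above are illustrative, not from this commit):

	params := llama.ModelParams{
		// Ordered llama.cpp device indices; offload follows this order.
		Devices:      deviceOrder(preferred),
		NumGpuLayers: 32,
		UseMmap:      true,
	}
	model, err := llama.LoadModelFromFile(modelPath, params)
	if err != nil {
		return err
	}

As the next hunk shows, LoadModelFromFile turns these indices back into
ggml_backend_dev_t handles, NULL-terminates the array, and pins the
backing storage so the pointer stays valid while C code uses it.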
@@ -254,6 +263,21 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.use_mmap = C.bool(params.UseMmap)
 	cparams.vocab_only = C.bool(params.VocabOnly)
 
+	var devices []C.ggml_backend_dev_t
+	for _, llamaID := range params.Devices {
+		devices = append(devices, C.ggml_backend_dev_get(C.size_t(llamaID)))
+	}
+
+	if len(devices) > 0 {
+		devices = append(devices, C.ggml_backend_dev_t(C.NULL))
+
+		devicesData := &devices[0]
+		var devicesPin runtime.Pinner
+		devicesPin.Pin(devicesData)
+		defer devicesPin.Unpin()
+
+		cparams.devices = devicesData
+	}
+
 	if len(params.TensorSplit) > 0 {
 		tensorSplitData := &params.TensorSplit[0]