From 6ed88985903be474ecd59992f7191c2f0fa87e36 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Fri, 25 Apr 2025 16:16:15 -0700 Subject: [PATCH 01/24] ggml: fix crash for array head counts If it's an array, it uses the max value in the array If array values for head counts becomes more popular, we can consider a more invasive change like #10225 to calculate more accurate estimates. Fixes: #9984 --- fs/ggml/ggml.go | 103 +++++++++++++++++++++++++++++++------------ fs/ggml/ggml_test.go | 30 +++++++++++++ llm/memory.go | 7 ++- 3 files changed, 110 insertions(+), 30 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 427a43ae..0d38f29e 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -33,7 +33,8 @@ func (kv KV) Kind() string { } func (kv KV) ParameterCount() uint64 { - return keyValue(kv, "general.parameter_count", uint64(0)) + val, _ := keyValue(kv, "general.parameter_count", uint64(0)) + return val } func (kv KV) FileType() fileType { @@ -52,16 +53,27 @@ func (kv KV) EmbeddingLength() uint64 { return uint64(kv.Uint("embedding_length")) } -func (kv KV) HeadCount() uint64 { - return uint64(kv.Uint("attention.head_count")) +func (kv KV) HeadCountMax() uint64 { + // TODO(drifkin): using the max value can cause an overestimation. In the + // future if array values become more popular, we can adapt the more invasive + // + return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1)) } -func (kv KV) HeadCountKV() uint64 { - return uint64(kv.Uint("attention.head_count_kv", 1)) +func (kv KV) HeadCountMin() uint64 { + return uint64(kv.UintOrMinArrayValue("attention.head_count", 1)) } -func (kv KV) EmbeddingHeadCount() uint64 { - if heads := kv.HeadCount(); heads > 0 { +func (kv KV) HeadCountKVMax() uint64 { + return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1)) +} + +func (kv KV) HeadCountKVMin() uint64 { + return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1)) +} + +func (kv KV) EmbeddingHeadCountMax() uint64 { + if heads := kv.HeadCountMin(); heads > 0 { return kv.EmbeddingLength() / heads } @@ -69,15 +81,11 @@ func (kv KV) EmbeddingHeadCount() uint64 { } func (kv KV) EmbeddingHeadCountK() uint64 { - return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount()))) + return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax()))) } func (kv KV) EmbeddingHeadCountV() uint64 { - return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount()))) -} - -func (kv KV) GQA() uint64 { - return kv.HeadCount() / kv.HeadCountKV() + return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax()))) } func (kv KV) ContextLength() uint64 { @@ -89,35 +97,72 @@ func (kv KV) ChatTemplate() string { } func (kv KV) String(key string, defaultValue ...string) string { - return keyValue(kv, key, append(defaultValue, "")...) + val, _ := keyValue(kv, key, append(defaultValue, "")...) + return val } func (kv KV) Uint(key string, defaultValue ...uint32) uint32 { - return keyValue(kv, key, append(defaultValue, 0)...) + val, _ := keyValue(kv, key, append(defaultValue, 0)...) + return val } func (kv KV) Float(key string, defaultValue ...float32) float32 { - return keyValue(kv, key, append(defaultValue, 0)...) + val, _ := keyValue(kv, key, append(defaultValue, 0)...) + return val } func (kv KV) Bool(key string, defaultValue ...bool) bool { - return keyValue(kv, key, append(defaultValue, false)...) + val, _ := keyValue(kv, key, append(defaultValue, false)...) 
+ return val +} + +func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 { + _, max := kv.UintOrArrayValue(key, defaultValue) + return max +} + +func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 { + min, _ := kv.UintOrArrayValue(key, defaultValue) + return min +} + +func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) { + if u32, ok := keyValue(kv, key, uint32(0)); ok { + return u32, u32 + } else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok { + min := slices.Min(u32s.values) + max := slices.Max(u32s.values) + return min, max + } else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok { + min := slices.Min(i32s.values) + max := slices.Max(i32s.values) + if min < 0 || max < 0 { + slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max) + } + return uint32(min), uint32(max) + } + + return defaultValue, defaultValue } func (kv KV) Strings(key string, defaultValue ...[]string) []string { - return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values + val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}) + return val.values } func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 { - return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values + val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}) + return val.values } func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 { - return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values + val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}) + return val.values } func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 { - return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values + val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}) + return val.values } func (kv KV) OllamaEngineRequired() bool { @@ -140,17 +185,17 @@ type arrayValueTypes interface { *array[string] | *array[float32] | *array[float64] | *array[bool] } -func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T { +func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) { if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") { key = kv.Architecture() + "." 
+ key } - if val, ok := kv[key]; ok { - return val.(T) + if val, ok := kv[key].(T); ok { + return val, true } - slog.Warn("key not found", "key", key, "default", defaultValue[0]) - return defaultValue[0] + slog.Warn("key with type not found", "key", key, "default", defaultValue[0]) + return defaultValue[0], false } type Tensors struct { @@ -413,11 +458,11 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) { embedding := f.KV().EmbeddingLength() - heads := f.KV().HeadCount() - headsKV := f.KV().HeadCountKV() + heads := f.KV().HeadCountMax() + headsKV := f.KV().HeadCountKVMax() vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size) - embeddingHeads := f.KV().EmbeddingHeadCount() + embeddingHeads := f.KV().EmbeddingHeadCountMax() embeddingHeadsK := f.KV().EmbeddingHeadCountK() embeddingHeadsV := f.KV().EmbeddingHeadCountV() diff --git a/fs/ggml/ggml_test.go b/fs/ggml/ggml_test.go index c1c1b43b..225e7484 100644 --- a/fs/ggml/ggml_test.go +++ b/fs/ggml/ggml_test.go @@ -269,3 +269,33 @@ func TestKeyValue(t *testing.T) { t.Errorf("unexpected uint8s (-got +want):\n%s", diff) } } + +func TestHeadCount(t *testing.T) { + valuesArray := []int32{1, 5, 3, 4} + cases := []struct { + kv KV + want uint64 + }{ + { + kv: KV{ + "general.architecture": "abc", + "abc.attention.head_count": &array[int32]{values: valuesArray, size: len(valuesArray)}, + }, + want: uint64(5), + }, + { + kv: KV{ + "general.architecture": "abc", + "abc.attention.head_count": uint32(3), + }, + want: uint64(3), + }, + } + + for _, tt := range cases { + got := tt.kv.HeadCountMax() + if got != tt.want { + t.Errorf("unexpected max value: got=%d want=%d", got, tt.want) + } + } +} diff --git a/llm/memory.go b/llm/memory.go index e05327f7..d029e4d3 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -149,7 +149,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } if graphPartialOffload == 0 { - graphPartialOffload = f.KV().GQA() * kvTotal / 6 + headsKV := f.KV().HeadCountKVMin() + if headsKV == 0 { + headsKV = 1 + } + gqa := f.KV().HeadCountMax() / headsKV + graphPartialOffload = gqa * kvTotal / 6 } if graphFullOffload == 0 { graphFullOffload = graphPartialOffload From d2ee599dcf9746a3af0433463f9ce7dbc8cbc693 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Sun, 27 Apr 2025 13:45:13 -0700 Subject: [PATCH 02/24] load arrays with up to 1024 elements when estimating This mirrors the old behavior before #10382 --- server/sched.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/sched.go b/server/sched.go index d5b19fbf..9c13f6cf 100644 --- a/server/sched.go +++ b/server/sched.go @@ -201,7 +201,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Load model for fitting - ggml, err := llm.LoadModel(pending.model.ModelPath, 0) + ggml, err := llm.LoadModel(pending.model.ModelPath, 1024) if err != nil { pending.errCh <- err break From f2527b08fba57d606e12cb21b583249c11724d7a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 19 Jun 2025 12:10:19 -0700 Subject: [PATCH 03/24] int: add coverage for older models (#11137) Verified these fail on 0.9.1 and pass on HEAD. 
--- integration/model_arch_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go index 6ce183d7..628b0231 100644 --- a/integration/model_arch_test.go +++ b/integration/model_arch_test.go @@ -45,6 +45,8 @@ var ( "qwen2.5-coder:latest", "qwen:latest", "solar-pro:latest", + "codellama:latest", + "nous-hermes:latest", } ) From 87b7af6ceef2b4d96374dbff5070b41b17d3f138 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 19 Jun 2025 14:39:20 -0700 Subject: [PATCH 04/24] ggml: Check return status for computation. We don't check the return status after computing the graph, which can silently lead to bad outputs if we try to keep going and future computation succeeds. This appears to happens in certain cases on Apple M2 devices. Fixes #11070 --- ml/backend/ggml/ggml.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 5a9fe67e..8aadad86 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -602,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { } func (c *Context) Compute(tensors ...ml.Tensor) { - C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph) + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { + panic(fmt.Errorf("error computing ggml graph: %v", status)) + } C.ggml_backend_sched_reset(c.b.sched) needSync := true From 0a066cfd91abdddc6ee172776974a6720a3072d3 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 20 Jun 2025 11:11:40 -0700 Subject: [PATCH 05/24] Reapply "feat: incremental gguf parser (#10822)" (#11114) (#11119) * Reapply "feat: incremental gguf parser (#10822)" (#11114) This reverts commit a6e64fbdf28f0d6cb97cc7f022ca493b905fe895. 
* fix older ggufs --- fs/gguf/gguf.go | 347 ++++++++++++++++++++++++++++++++++++ fs/gguf/gguf_test.go | 249 ++++++++++++++++++++++++++ fs/gguf/keyvalue.go | 90 ++++++++++ fs/gguf/keyvalue_test.go | 208 +++++++++++++++++++++ fs/gguf/lazy.go | 89 +++++++++ fs/gguf/reader.go | 23 +++ fs/gguf/tensor.go | 288 ++++++++++++++++++++++++++++++ go.mod | 2 +- go.sum | 4 +- server/images.go | 24 ++- server/images_test.go | 165 +++++------------ server/quantization_test.go | 12 +- server/sched_test.go | 20 +-- 13 files changed, 1357 insertions(+), 164 deletions(-) create mode 100644 fs/gguf/gguf.go create mode 100644 fs/gguf/gguf_test.go create mode 100644 fs/gguf/keyvalue.go create mode 100644 fs/gguf/keyvalue_test.go create mode 100644 fs/gguf/lazy.go create mode 100644 fs/gguf/reader.go create mode 100644 fs/gguf/tensor.go diff --git a/fs/gguf/gguf.go b/fs/gguf/gguf.go new file mode 100644 index 00000000..bbb9bb41 --- /dev/null +++ b/fs/gguf/gguf.go @@ -0,0 +1,347 @@ +package gguf + +import ( + "bytes" + "cmp" + "encoding/binary" + "errors" + "fmt" + "io" + "iter" + "os" + "slices" + "strings" +) + +const ( + typeUint8 uint32 = iota + typeInt8 + typeUint16 + typeInt16 + typeUint32 + typeInt32 + typeFloat32 + typeBool + typeString + typeArray + typeUint64 + typeInt64 + typeFloat64 +) + +var ErrUnsupported = errors.New("unsupported") + +type File struct { + Magic [4]byte + Version uint32 + + keyValues *lazy[KeyValue] + tensors *lazy[TensorInfo] + offset int64 + + file *os.File + reader *bufferedReader + bts []byte +} + +func Open(path string) (f *File, err error) { + f = &File{bts: make([]byte, 4096)} + f.file, err = os.Open(path) + if err != nil { + return nil, err + } + + f.reader = newBufferedReader(f.file, 32<<10) + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil { + return nil, err + } + + if bytes.Equal(f.Magic[:], []byte("gguf")) { + return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic) + } + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil { + return nil, err + } + + if f.Version < 2 { + return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version) + } + + f.tensors, err = newLazy(f, f.readTensor) + if err != nil { + return nil, err + } + + f.tensors.successFunc = func() error { + offset := f.reader.offset + + alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32) + f.offset = offset + (alignment-offset%alignment)%alignment + return nil + } + + f.keyValues, err = newLazy(f, f.readKeyValue) + if err != nil { + return nil, err + } + + return f, nil +} + +func (f *File) readTensor() (TensorInfo, error) { + name, err := readString(f) + if err != nil { + return TensorInfo{}, err + } + + dims, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + shape := make([]uint64, dims) + for i := range dims { + shape[i], err = read[uint64](f) + if err != nil { + return TensorInfo{}, err + } + } + + type_, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + offset, err := read[uint64](f) + if err != nil { + return TensorInfo{}, err + } + + return TensorInfo{ + Name: name, + Offset: offset, + Shape: shape, + Type: TensorType(type_), + }, nil +} + +func (f *File) readKeyValue() (KeyValue, error) { + key, err := readString(f) + if err != nil { + return KeyValue{}, err + } + + t, err := read[uint32](f) + if err != nil { + return KeyValue{}, err + } + + value, err := func() (any, error) { + switch t { + case typeUint8: + return read[uint8](f) + case typeInt8: + return read[int8](f) + 
case typeUint16: + return read[uint16](f) + case typeInt16: + return read[int16](f) + case typeUint32: + return read[uint32](f) + case typeInt32: + return read[int32](f) + case typeUint64: + return read[uint64](f) + case typeInt64: + return read[int64](f) + case typeFloat32: + return read[float32](f) + case typeFloat64: + return read[float64](f) + case typeBool: + return read[bool](f) + case typeString: + return readString(f) + case typeArray: + return readArray(f) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } + }() + if err != nil { + return KeyValue{}, err + } + + return KeyValue{ + Key: key, + Value: Value{value}, + }, nil +} + +func read[T any](f *File) (t T, err error) { + err = binary.Read(f.reader, binary.LittleEndian, &t) + return t, err +} + +func readString(f *File) (string, error) { + n, err := read[uint64](f) + if err != nil { + return "", err + } + + if int(n) > len(f.bts) { + f.bts = make([]byte, n) + } + + bts := f.bts[:n] + if _, err := io.ReadFull(f.reader, bts); err != nil { + return "", err + } + defer clear(bts) + + return string(bts), nil +} + +func readArray(f *File) (any, error) { + t, err := read[uint32](f) + if err != nil { + return nil, err + } + + n, err := read[uint64](f) + if err != nil { + return nil, err + } + + switch t { + case typeUint8: + return readArrayData[uint8](f, n) + case typeInt8: + return readArrayData[int8](f, n) + case typeUint16: + return readArrayData[uint16](f, n) + case typeInt16: + return readArrayData[int16](f, n) + case typeUint32: + return readArrayData[uint32](f, n) + case typeInt32: + return readArrayData[int32](f, n) + case typeUint64: + return readArrayData[uint64](f, n) + case typeInt64: + return readArrayData[int64](f, n) + case typeFloat32: + return readArrayData[float32](f, n) + case typeFloat64: + return readArrayData[float64](f, n) + case typeBool: + return readArrayData[bool](f, n) + case typeString: + return readArrayString(f, n) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } +} + +func readArrayData[T any](f *File, n uint64) (s []T, err error) { + s = make([]T, n) + for i := range n { + e, err := read[T](f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func readArrayString(f *File, n uint64) (s []string, err error) { + s = make([]string, n) + for i := range n { + e, err := readString(f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func (f *File) Close() error { + f.keyValues.stop() + f.tensors.stop() + return f.file.Close() +} + +func (f *File) KeyValue(key string) KeyValue { + if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") { + key = f.KeyValue("general.architecture").String() + "." 
+ key + } + + if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool { + return kv.Key == key + }); index >= 0 { + return f.keyValues.values[index] + } + + for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() { + if keyValue.Key == key { + return keyValue + } + } + + return KeyValue{} +} + +func (f *File) NumKeyValues() int { + return int(f.keyValues.count) +} + +func (f *File) KeyValues() iter.Seq2[int, KeyValue] { + return f.keyValues.All() +} + +func (f *File) TensorInfo(name string) TensorInfo { + if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool { + return t.Name == name + }); index >= 0 { + return f.tensors.values[index] + } + + // fast-forward through key values if we haven't already + _ = f.keyValues.rest() + for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() { + if tensor.Name == name { + return tensor + } + } + + return TensorInfo{} +} + +func (f *File) NumTensors() int { + return int(f.tensors.count) +} + +func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] { + // fast forward through key values if we haven't already + f.keyValues.rest() + return f.tensors.All() +} + +func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) { + t := f.TensorInfo(name) + if t.NumBytes() == 0 { + return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name) + } + + // fast forward through tensor info if we haven't already + _ = f.tensors.rest() + return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil +} diff --git a/fs/gguf/gguf_test.go b/fs/gguf/gguf_test.go new file mode 100644 index 00000000..eea28a48 --- /dev/null +++ b/fs/gguf/gguf_test.go @@ -0,0 +1,249 @@ +package gguf_test + +import ( + "bytes" + "os" + "strconv" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" +) + +func createBinFile(tb testing.TB) string { + tb.Helper() + f, err := os.CreateTemp(tb.TempDir(), "") + if err != nil { + tb.Fatal(err) + } + defer f.Close() + + kv := ggml.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(8), + "llama.embedding_length": uint32(3), + "llama.attention.head_count": uint32(2), + "llama.attention.head_count_kv": uint32(2), + "llama.attention.key_length": uint32(3), + "llama.rope.dimension_count": uint32(4), + "llama.rope.freq_base": float32(10000.0), + "llama.rope.freq_scale": float32(1.0), + "llama.attention.layer_norm_rms_epsilon": float32(1e-6), + "tokenizer.ggml.eos_token_id": uint32(0), + "tokenizer.ggml.eos_token_ids": []int32{1, 2, 3}, + "tokenizer.ggml.tokens": []string{"hello", "world"}, + "tokenizer.ggml.scores": []float32{0, 1}, + } + + tensors := []*ggml.Tensor{ + { + Name: "token_embd.weight", + Kind: 0, + Shape: []uint64{2, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)), + }, + { + Name: "output.weight", + Kind: 0, + Shape: []uint64{3, 2}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)), + }, + } + + for i := range 8 { + tensors = append(tensors, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_q.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_k.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." 
+ strconv.Itoa(i) + ".attn_v.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_output.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }) + } + + if err := ggml.WriteGGUF(f, kv, tensors); err != nil { + tb.Fatal(err) + } + + return f.Name() +} + +func TestRead(t *testing.T) { + f, err := gguf.Open(createBinFile(t)) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + if got := f.KeyValue("does.not.exist").Valid(); got { + t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama") + } + + if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" { + t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight") + } else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" { + t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if got.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32) + } + + if got := f.KeyValue("block_count").Uint(); got != 8 { + t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff) + } + + var kvs []string + for _, kv := range f.KeyValues() { + if !kv.Valid() { + t.Error("found invalid key-value pair:", kv) + } + + kvs = append(kvs, kv.Key) + } + + if len(kvs) != f.NumKeyValues() { + t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues()) + } + + if diff := cmp.Diff(kvs, []string{ + "general.architecture", + "llama.block_count", + "llama.embedding_length", + "llama.attention.head_count", + "llama.attention.head_count_kv", + "llama.attention.key_length", + "llama.rope.dimension_count", + "llama.rope.freq_base", + "llama.rope.freq_scale", + "llama.attention.layer_norm_rms_epsilon", + "tokenizer.ggml.eos_token_id", + "tokenizer.ggml.eos_token_ids", + "tokenizer.ggml.tokens", + "tokenizer.ggml.scores", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff) + } + + var tis []string + for _, ti := range f.TensorInfos() { + if !ti.Valid() { + t.Error("found invalid tensor info:", ti) + } + + tis = append(tis, ti.Name) + } + + if len(tis) != f.NumTensors() { + t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors()) + } + + if diff := cmp.Diff(tis, []string{ + "token_embd.weight", + "output.weight", + "blk.0.attn_q.weight", + "blk.0.attn_k.weight", + "blk.0.attn_v.weight", + "blk.0.attn_output.weight", + "blk.1.attn_q.weight", + "blk.1.attn_k.weight", + "blk.1.attn_v.weight", + "blk.1.attn_output.weight", + "blk.2.attn_q.weight", + "blk.2.attn_k.weight", + "blk.2.attn_v.weight", + "blk.2.attn_output.weight", + "blk.3.attn_q.weight", + "blk.3.attn_k.weight", + "blk.3.attn_v.weight", + "blk.3.attn_output.weight", + "blk.4.attn_q.weight", + "blk.4.attn_k.weight", + "blk.4.attn_v.weight", 
+ "blk.4.attn_output.weight", + "blk.5.attn_q.weight", + "blk.5.attn_k.weight", + "blk.5.attn_v.weight", + "blk.5.attn_output.weight", + "blk.6.attn_q.weight", + "blk.6.attn_k.weight", + "blk.6.attn_v.weight", + "blk.6.attn_output.weight", + "blk.7.attn_q.weight", + "blk.7.attn_k.weight", + "blk.7.attn_v.weight", + "blk.7.attn_output.weight", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff) + } + + ti, r, err := f.TensorReader("output.weight") + if err != nil { + t.Fatalf(`TensorReader("output.weight") error: %v`, err) + } + + if ti.Name != "output.weight" { + t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight") + } else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" { + t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if ti.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32) + } + + var b bytes.Buffer + if _, err := b.ReadFrom(r); err != nil { + t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err) + } + + if b.Len() != int(ti.NumBytes()) { + t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes()) + } +} + +func BenchmarkRead(b *testing.B) { + b.ReportAllocs() + + p := createBinFile(b) + for b.Loop() { + f, err := gguf.Open(p) + if err != nil { + b.Fatal(err) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + b.Errorf("got = %q, want %q", got, "llama") + } + + // Iterate through some tensors + for range f.TensorInfos() { + } + + f.Close() + } +} diff --git a/fs/gguf/keyvalue.go b/fs/gguf/keyvalue.go new file mode 100644 index 00000000..5843326c --- /dev/null +++ b/fs/gguf/keyvalue.go @@ -0,0 +1,90 @@ +package gguf + +import ( + "reflect" + "slices" +) + +type KeyValue struct { + Key string + Value +} + +func (kv KeyValue) Valid() bool { + return kv.Key != "" && kv.Value.value != nil +} + +type Value struct { + value any +} + +func value[T any](v Value, kinds ...reflect.Kind) (t T) { + vv := reflect.ValueOf(v.value) + if slices.Contains(kinds, vv.Kind()) { + t = vv.Convert(reflect.TypeOf(t)).Interface().(T) + } + return +} + +func values[T any](v Value, kinds ...reflect.Kind) (ts []T) { + switch vv := reflect.ValueOf(v.value); vv.Kind() { + case reflect.Slice: + if slices.Contains(kinds, vv.Type().Elem().Kind()) { + ts = make([]T, vv.Len()) + for i := range vv.Len() { + ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T) + } + } + } + return +} + +// Int returns Value as a signed integer. If it is not a signed integer, it returns 0. +func (v Value) Int() int64 { + return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil. +func (v Value) Ints() (i64s []int64) { + return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0. +func (v Value) Uint() uint64 { + return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil. 
+func (v Value) Uints() (u64s []uint64) { + return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Float returns Value as a float. If it is not a float, it returns 0. +func (v Value) Float() float64 { + return value[float64](v, reflect.Float32, reflect.Float64) +} + +// Floats returns Value as a float slice. If it is not a float slice, it returns nil. +func (v Value) Floats() (f64s []float64) { + return values[float64](v, reflect.Float32, reflect.Float64) +} + +// Bool returns Value as a boolean. If it is not a boolean, it returns false. +func (v Value) Bool() bool { + return value[bool](v, reflect.Bool) +} + +// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil. +func (v Value) Bools() (bools []bool) { + return values[bool](v, reflect.Bool) +} + +// String returns Value as a string. If it is not a string, it returns an empty string. +func (v Value) String() string { + return value[string](v, reflect.String) +} + +// Strings returns Value as a string slice. If it is not a string slice, it returns nil. +func (v Value) Strings() (strings []string) { + return values[string](v, reflect.String) +} diff --git a/fs/gguf/keyvalue_test.go b/fs/gguf/keyvalue_test.go new file mode 100644 index 00000000..2caacc53 --- /dev/null +++ b/fs/gguf/keyvalue_test.go @@ -0,0 +1,208 @@ +package gguf + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +func split(name string, values map[string][]any) (matched []any, unmatched []any) { + for key, value := range values { + if key == name { + matched = value + } else { + unmatched = append(unmatched, value...) + } + } + return +} + +func TestValue(t *testing.T) { + values := map[string][]any{ + "int64": {int(42), int8(42), int16(42), int32(42), int64(42)}, + "uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)}, + "float64": {float32(42), float64(42)}, + "string": {"42", "hello"}, + "bool": {true, false}, + } + + t.Run("int64", func(t *testing.T) { + matched, unmatched := split("int64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 42 { + t.Errorf("expected 42, got %d", i64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 0 { + t.Errorf("expected 42, got %d", i64) + } + } + }) + + t.Run("uint64", func(t *testing.T) { + matched, unmatched := split("uint64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 42 { + t.Errorf("expected 42, got %d", u64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 0 { + t.Errorf("expected 42, got %d", u64) + } + } + }) + + t.Run("float64", func(t *testing.T) { + matched, unmatched := split("float64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 42 { + t.Errorf("expected 42, got %f", f64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 0 { + t.Errorf("expected 42, got %f", f64) + } + } + }) + + t.Run("string", func(t *testing.T) { + matched, unmatched := split("string", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != v { + t.Errorf("expected 42, got %s", s) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != "" { + t.Errorf("expected 42, got %s", s) + } + } + }) + + t.Run("bool", func(t 
*testing.T) { + matched, unmatched := split("bool", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != v { + t.Errorf("expected true, got %v", b) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != false { + t.Errorf("expected false, got %v", b) + } + } + }) +} + +func TestValues(t *testing.T) { + values := map[string][]any{ + "int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}}, + "uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}}, + "float64s": {[]float32{42}, []float64{42}}, + "strings": {[]string{"42"}, []string{"hello"}}, + "bools": {[]bool{true}, []bool{false}}, + } + + t.Run("int64s", func(t *testing.T) { + matched, unmatched := split("int64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64s := kv.Ints(); i64s != nil { + t.Errorf("expected nil, got %v", i64s) + } + } + }) + + t.Run("uint64s", func(t *testing.T) { + matched, unmatched := split("uint64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64s := kv.Uints(); u64s != nil { + t.Errorf("expected nil, got %v", u64s) + } + } + }) + + t.Run("float64s", func(t *testing.T) { + matched, unmatched := split("float64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64s := kv.Floats(); f64s != nil { + t.Errorf("expected nil, got %v", f64s) + } + } + }) + + t.Run("strings", func(t *testing.T) { + matched, unmatched := split("strings", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Strings(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.Strings(); s != nil { + t.Errorf("expected nil, got %v", s) + } + } + }) + + t.Run("bools", func(t *testing.T) { + matched, unmatched := split("bools", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Bools(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bools(); b != nil { + t.Errorf("expected nil, got %v", b) + } + } + }) +} diff --git a/fs/gguf/lazy.go b/fs/gguf/lazy.go new file mode 100644 index 00000000..16ab9909 --- /dev/null +++ b/fs/gguf/lazy.go @@ -0,0 +1,89 @@ +package gguf + +import ( + "encoding/binary" + "iter" + "log/slog" +) + +type lazy[T any] struct { + count uint64 + next func() (T, bool) + stop func() + values []T + + // successFunc is called when all values have been successfully read. 
+ successFunc func() error +} + +func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) { + it := lazy[T]{} + if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil { + return nil, err + } + + it.values = make([]T, 0) + it.next, it.stop = iter.Pull(func(yield func(T) bool) { + for i := range it.count { + t, err := fn() + if err != nil { + slog.Error("error reading tensor", "index", i, "error", err) + return + } + + it.values = append(it.values, t) + if !yield(t) { + break + } + } + + if it.successFunc != nil { + it.successFunc() + } + }) + + return &it, nil +} + +func (g *lazy[T]) Values() iter.Seq[T] { + return func(yield func(T) bool) { + for _, v := range g.All() { + if !yield(v) { + break + } + } + } +} + +func (g *lazy[T]) All() iter.Seq2[int, T] { + return func(yield func(int, T) bool) { + for i := range int(g.count) { + if i < len(g.values) { + if !yield(i, g.values[i]) { + break + } + } else { + t, ok := g.next() + if !ok { + break + } + + if !yield(i, t) { + break + } + } + } + } +} + +func (g *lazy[T]) rest() (collected bool) { + for { + _, ok := g.next() + collected = collected || ok + if !ok { + break + } + } + + return collected +} diff --git a/fs/gguf/reader.go b/fs/gguf/reader.go new file mode 100644 index 00000000..0bd76184 --- /dev/null +++ b/fs/gguf/reader.go @@ -0,0 +1,23 @@ +package gguf + +import ( + "bufio" + "io" +) + +type bufferedReader struct { + offset int64 + *bufio.Reader +} + +func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader { + return &bufferedReader{ + Reader: bufio.NewReaderSize(rs, size), + } +} + +func (rs *bufferedReader) Read(p []byte) (n int, err error) { + n, err = rs.Reader.Read(p) + rs.offset += int64(n) + return n, err +} diff --git a/fs/gguf/tensor.go b/fs/gguf/tensor.go new file mode 100644 index 00000000..194c1d73 --- /dev/null +++ b/fs/gguf/tensor.go @@ -0,0 +1,288 @@ +package gguf + +import ( + "log/slog" + "strings" +) + +type TensorInfo struct { + Name string + Offset uint64 + Shape []uint64 + Type TensorType +} + +func (ti TensorInfo) Valid() bool { + return ti.Name != "" && ti.NumBytes() > 0 +} + +func (ti TensorInfo) NumValues() int64 { + var numItems int64 = 1 + for _, dim := range ti.Shape { + numItems *= int64(dim) + } + return numItems +} + +// NumBytes returns the number of bytes in the tensor. 
+func (ti TensorInfo) NumBytes() int64 { + return int64(float64(ti.NumValues()) * ti.Type.NumBytes()) +} + +func (ti TensorInfo) LogValue() slog.Value { + return slog.GroupValue( + slog.String("name", ti.Name), + slog.Int64("offset", int64(ti.Offset)), + slog.Any("shape", ti.Shape), + slog.Int64("num_values", ti.NumValues()), + slog.Int64("num_bytes", ti.NumBytes()), + slog.Any("type", ti.Type), + ) +} + +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + + // unexported // unused in gguf + tensorTypeQ4_2 + tensorTypeQ4_3 + + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + + // unexported // unquantizable by ollama + tensorTypeIQ2_XXS + tensorTypeIQ2_XS + tensorTypeIQ3_XXS + tensorTypeIQ1_S + tensorTypeIQ4_NL + tensorTypeIQ3_S + tensorTypeIQ2_S + tensorTypeIQ4_XS + + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + + // unexported // unquantizable by ollama + tensorTypeIQ1_M + + TensorTypeBF16 + + // unexported // unused in gguf + tensorTypeQ4_0_4_4 + tensorTypeQ4_0_4_8 + tensorTypeQ4_0_8_8 + + // unexported // unquantizable by ollama + tensorTypeTQ1_0 + tensorTypeTQ2_0 + + // unexported // unused in gguf + tensorTypeIQ4_NL_4_4 + tensorTypeIQ4_NL_4_8 + tensorTypeIQ4_NL_8_8 +) + +func (tt TensorType) NumBytes() float64 { + return float64(tt.typeSize()) / float64(tt.blockSize()) +} + +func (tt TensorType) typeSize() int64 { + switch tt { + case TensorTypeF32: + return 4 + case TensorTypeF16: + return 2 + case TensorTypeQ4_0: + return 2 + tt.blockSize()/2 + case TensorTypeQ4_1: + return 2 + 2 + tt.blockSize()/2 + case TensorTypeQ5_0: + return 2 + 4 + tt.blockSize()/2 + case TensorTypeQ5_1: + return 2 + 2 + 4 + tt.blockSize()/2 + case TensorTypeQ8_0: + return 2 + tt.blockSize() + case TensorTypeQ8_1: + return 2 + 2 + tt.blockSize() + case TensorTypeQ2_K: + return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2 + case TensorTypeQ3_K: + return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2 + case TensorTypeQ4_K: + return 2 + 2 + 12 + tt.blockSize()/2 + case TensorTypeQ5_K: + return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2 + case TensorTypeQ6_K: + return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2 + case TensorTypeQ8_K: + return 4 + tt.blockSize() + 2*tt.blockSize()/16 + case tensorTypeIQ2_XXS: + return 2 + 2*tt.blockSize()/8 + case tensorTypeIQ2_XS: + return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32 + case tensorTypeIQ3_XXS: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + case tensorTypeIQ1_S: + return 2 + tt.blockSize()/8 + tt.blockSize()/16 + case tensorTypeIQ4_NL: + return 2 + tt.blockSize()/2 + case tensorTypeIQ3_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4 + case tensorTypeIQ2_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/16 + case tensorTypeIQ4_XS: + return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64 + case TensorTypeI8: + return 1 + case TensorTypeI16: + return 2 + case TensorTypeI32: + return 4 + case TensorTypeI64: + return 8 + case TensorTypeF64: + return 8 + case tensorTypeIQ1_M: + return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32 + case TensorTypeBF16: + return 2 + default: + return 0 + } +} + +func (tt TensorType) blockSize() int64 { + switch tt { + case TensorTypeF32, + TensorTypeF16, + TensorTypeI8, + TensorTypeI16, + TensorTypeI32, + TensorTypeI64, + TensorTypeF64, + TensorTypeBF16: + return 
1 + case TensorTypeQ4_0, + TensorTypeQ4_1, + TensorTypeQ5_0, + TensorTypeQ5_1, + TensorTypeQ8_0, + TensorTypeQ8_1, + tensorTypeIQ4_NL: + return 32 + default: + return 256 + } +} + +func (tt TensorType) String() string { + switch tt { + case TensorTypeF32: + return "f32" + case TensorTypeF16: + return "f16" + case TensorTypeQ4_0: + return "q4_0" + case TensorTypeQ4_1: + return "q4_1" + case tensorTypeQ4_2: + return "q4_2" + case tensorTypeQ4_3: + return "q4_3" + case TensorTypeQ5_0: + return "q5_0" + case TensorTypeQ5_1: + return "q5_1" + case TensorTypeQ8_0: + return "q8_0" + case TensorTypeQ8_1: + return "q8_1" + case TensorTypeQ2_K: + return "q2_k" + case TensorTypeQ3_K: + return "q3_k" + case TensorTypeQ4_K: + return "q4_k" + case TensorTypeQ5_K: + return "q5_k" + case TensorTypeQ6_K: + return "q6_k" + case TensorTypeQ8_K: + return "q8_k" + case tensorTypeIQ2_XXS: + return "iq2_xxs" + case tensorTypeIQ2_XS: + return "iq2_xs" + case tensorTypeIQ3_XXS: + return "iq3_xxs" + case tensorTypeIQ1_S: + return "iq1_s" + case tensorTypeIQ4_NL: + return "iq4_nl" + case tensorTypeIQ3_S: + return "iq3_s" + case tensorTypeIQ2_S: + return "iq2_s" + case tensorTypeIQ4_XS: + return "iq4_xs" + case TensorTypeI8: + return "i8" + case TensorTypeI16: + return "i16" + case TensorTypeI32: + return "i32" + case TensorTypeI64: + return "i64" + case TensorTypeF64: + return "f64" + case tensorTypeIQ1_M: + return "iq1_m" + case TensorTypeBF16: + return "bf16" + case tensorTypeQ4_0_4_4: + return "q4_0_4_4" + case tensorTypeQ4_0_4_8: + return "q4_0_4_8" + case tensorTypeQ4_0_8_8: + return "q4_0_8_8" + case tensorTypeTQ1_0: + return "tq1_0" + case tensorTypeTQ2_0: + return "tq2_0" + case tensorTypeIQ4_NL_4_4: + return "iq4_nl_4_4" + case tensorTypeIQ4_NL_4_8: + return "iq4_nl_4_8" + case tensorTypeIQ4_NL_8_8: + return "iq4_nl_8_8" + default: + return "unknown" + } +} + +func (tt TensorType) LogValue() slog.Value { + return slog.GroupValue( + slog.Uint64("value", uint64(tt)), + slog.String("name", strings.ToUpper(tt.String())), + slog.Int64("size", tt.typeSize()), + slog.Int64("block_size", tt.blockSize()), + slog.Float64("num_bytes", tt.NumBytes()), + ) +} diff --git a/go.mod b/go.mod index 283286b7..6de5959b 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/dlclark/regexp2 v1.11.4 github.com/emirpasic/gods/v2 v2.0.0-alpha - github.com/google/go-cmp v0.6.0 + github.com/google/go-cmp v0.7.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c diff --git a/go.sum b/go.sum index 5755616f..c0ab53aa 100644 --- a/go.sum +++ b/go.sum @@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= diff --git a/server/images.go b/server/images.go index d6cceff4..38505cc5 100644 --- a/server/images.go +++ b/server/images.go @@ -23,7 +23,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" "github.com/ollama/ollama/thinking" @@ -73,22 +73,18 @@ func (m *Model) Capabilities() []model.Capability { capabilities := []model.Capability{} // Check for completion capability - r, err := os.Open(m.ModelPath) + f, err := gguf.Open(m.ModelPath) if err == nil { - defer r.Close() + defer f.Close() - f, err := ggml.Decode(r, 1024) - if err == nil { - if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityEmbedding) - } else { - capabilities = append(capabilities, model.CapabilityCompletion) - } - if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityVision) - } + if f.KeyValue("pooling_type").Valid() { + capabilities = append(capabilities, model.CapabilityEmbedding) } else { - slog.Error("couldn't decode ggml", "error", err) + // If no embedding is specified, we assume the model supports completion + capabilities = append(capabilities, model.CapabilityCompletion) + } + if f.KeyValue("vision.block_count").Valid() { + capabilities = append(capabilities, model.CapabilityVision) } } else { slog.Error("couldn't open model file", "error", err) diff --git a/server/images_test.go b/server/images_test.go index 363b298e..a2fba8d9 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -1,123 +1,42 @@ package server import ( - "bytes" - "encoding/binary" - "errors" - "os" - "path/filepath" "strings" "testing" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" ) -// Constants for GGUF magic bytes and version -var ( - ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF" - ggufVer = uint32(3) // Version 3 -) - -// Helper function to create mock GGUF data -func createMockGGUFData(architecture string, vision bool) []byte { - var buf bytes.Buffer - - // Write GGUF header - buf.Write(ggufMagic) - binary.Write(&buf, binary.LittleEndian, ggufVer) - - // Write tensor count (0 for our test) - var numTensors uint64 = 0 - binary.Write(&buf, binary.LittleEndian, numTensors) - - // Calculate number of metadata entries - numMetaEntries := uint64(1) // architecture entry - if vision { - numMetaEntries++ - } - // Add embedding entry if architecture is "bert" - if architecture == "bert" { - numMetaEntries++ - } - binary.Write(&buf, binary.LittleEndian, numMetaEntries) - - // Write architecture metadata - archKey := "general.architecture" - keyLen := uint64(len(archKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(archKey) - - // String type (8) - var strType uint32 = 8 - binary.Write(&buf, binary.LittleEndian, strType) - - // String length - strLen := uint64(len(architecture)) - binary.Write(&buf, binary.LittleEndian, strLen) - buf.WriteString(architecture) - - if vision { - visionKey := architecture + ".vision.block_count" - keyLen = uint64(len(visionKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(visionKey) - - // uint32 type (4) - var uint32Type uint32 
= 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var countVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, countVal) - } - // Write embedding metadata if architecture is "bert" - if architecture == "bert" { - poolKey := architecture + ".pooling_type" - keyLen = uint64(len(poolKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(poolKey) - - // uint32 type (4) - var uint32Type uint32 = 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var poolingVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, poolingVal) - } - - return buf.Bytes() -} - func TestModelCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + // Create completion model (llama architecture without vision) + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - // Create different types of mock model files - completionModelPath := filepath.Join(tempDir, "model.bin") - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - // Create a simple model file for tests that don't depend on GGUF content - simpleModelPath := filepath.Join(tempDir, "simple_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -145,21 +64,13 @@ func TestModelCapabilities(t *testing.T) { }, expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert}, }, - { - name: "model with tools and insert capability", - model: Model{ - ModelPath: simpleModelPath, - Template: toolsInsertTemplate, - }, - expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, - }, { name: "model with tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, - expectedCaps: []model.Capability{model.CapabilityTools}, + expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools}, }, { name: "model with vision capability", @@ -224,29 +135,33 @@ func TestModelCapabilities(t *testing.T) { } func TestModelCheckCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + 
// Create simple model file for tests that don't depend on GGUF content + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - simpleModelPath := filepath.Join(tempDir, "model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -261,7 +176,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "completion model without tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{model.CapabilityTools}, @@ -270,7 +185,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model with all needed capabilities", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsInsertTemplate, }, checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, @@ -278,7 +193,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing insert capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityInsert}, @@ -287,7 +202,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing vision capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityVision}, @@ -312,7 +227,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "unknown capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{"unknown"}, diff --git a/server/quantization_test.go b/server/quantization_test.go index 4f717c2c..8b726c83 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -257,16 +257,8 @@ func TestQuantizeModel(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - f, err := os.CreateTemp(t.TempDir(), tt.name) - if err != nil { - t.Fatal(err.Error()) - } - defer f.Close() - err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) - if err != nil { - t.Fatalf("failed to create initial model: %s", err) - } - fp, err := os.Open(f.Name()) + p, _ := createBinFile(t, 
tt.kv, tt.tensors) + fp, err := os.Open(p) if err != nil { t.Fatal(err.Error()) } diff --git a/server/sched_test.go b/server/sched_test.go index 01fb9a70..3892fbba 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -112,11 +112,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est b.ctx, b.ctxDone = context.WithCancel(ctx) t.Helper() - f, err := os.CreateTemp(t.TempDir(), modelName) - require.NoError(t, err) - defer f.Close() - - require.NoError(t, ggml.WriteGGUF(f, ggml.KV{ + p, _ := createBinFile(t, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), @@ -129,14 +125,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - })) - require.NoError(t, err) - - fname := f.Name() - model := &Model{Name: modelName, ModelPath: fname} - b.f, err = llm.LoadModel(model.ModelPath, 0) - require.NoError(t, err) + }) + model := &Model{Name: modelName, ModelPath: p} + f, err := llm.LoadModel(model.ModelPath, 0) + if err != nil { + t.Fatal(err) + } + b.f = f if duration == nil { duration = &api.Duration{Duration: 5 * time.Millisecond} } From c088ac0e79a4a995e8a5a3733f7db2a981ac3364 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 20 Jun 2025 11:12:01 -0700 Subject: [PATCH 06/24] convert: utility for merging tensors (#11069) --- convert/convert_mixtral.go | 76 +++++++++-------------------- convert/tensor.go | 53 +++++++++++++++++++++ convert/tensor_test.go | 98 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 53 deletions(-) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 17580ff8..7d60146b 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -2,9 +2,6 @@ package convert import ( "fmt" - "io" - "slices" - "strings" "github.com/ollama/ollama/fs/ggml" ) @@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV { } func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor { - oldnew := []string{ - "model.layers", "blk", - "w1", "ffn_gate_exps", - "w2", "ffn_down_exps", - "w3", "ffn_up_exps", - } - - for i := range p.NumLocalExperts { - oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".") - } - - // group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor - namer := strings.NewReplacer(oldnew...) 
- experts := make(map[string]experts) - - // merge experts into a single tensor while removing them from ts - ts = slices.DeleteFunc(ts, func(t Tensor) bool { - if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") { - return false - } - - name := namer.Replace(t.Name()) - experts[name] = append(experts[name], t) - return true - }) - - var out []*ggml.Tensor - for n, e := range experts { - // TODO(mxyng): sanity check experts - out = append(out, &ggml.Tensor{ - Name: n, - Kind: e[0].Kind(), - Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), - WriterTo: e, + merges := make([]merge, 0, p.NumHiddenLayers*6) + for i := range p.NumHiddenLayers { + merges = append(merges, merge{ + fmt.Sprintf("blk.%d.*.w1.weight", i), + fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w1.bias", i), + fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w2.weight", i), + fmt.Sprintf("blk.%d.ffn_up_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w2.bias", i), + fmt.Sprintf("blk.%d.ffn_up_exps.bias", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w3.weight", i), + fmt.Sprintf("blk.%d.ffn_down_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w3.bias", i), + fmt.Sprintf("blk.%d.ffn_down_exps.bias", i), }) } + out, ts := mergeTensors(ts, merges...) return append(out, p.llamaModel.Tensors(ts)...) } func (p *mixtralModel) Replacements() []string { return append( p.llamaModel.Replacements(), + "model.layers", "blk", "block_sparse_moe.gate", "ffn_gate_inp", + "block_sparse_moe.experts.", ".", ) } - -type experts []Tensor - -func (e experts) WriteTo(w io.Writer) (int64, error) { - // TODO(mxyng): experts _should_ be numerically sorted by expert but this should check - for _, t := range e { - // the canonical merged experts tensor stacks all experts along a new, 0 axis, - // e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers - // this accomplishes the same thing by writing each expert tensor in sequence - if _, err := t.WriteTo(w); err != nil { - return 0, err - } - } - - return 0, nil -} diff --git a/convert/tensor.go b/convert/tensor.go index 9d6919e3..c9565ed4 100644 --- a/convert/tensor.go +++ b/convert/tensor.go @@ -2,7 +2,9 @@ package convert import ( "cmp" + "io" "iter" + "path" "slices" "strings" @@ -74,3 +76,54 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] { } } } + +type merge struct { + pattern, name string +} + +// mergeTensors merges tensors that match a given pattern into a single tensor. +func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) { + var matched []Tensor + for i := range merges { + matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool { + matched, _ := path.Match(merges[i].pattern, t.Name()) + return matched + }) + + if len(matched) > 0 { + out = append(out, &ggml.Tensor{ + Name: merges[i].name, + Kind: matched[0].Kind(), + Shape: append([]uint64{uint64(len(matched))}, matched[0].Shape()...), + WriterTo: mergeGroup(matched), + }) + } + } + + return out, unmatched +} + +// slicesSplitFunc splits a slice into two slices based on a predicate function. 
+func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) { + for _, e := range s { + if fn(e) { + matched = append(matched, e) + } else { + unmatched = append(unmatched, e) + } + } + + return matched, unmatched +} + +type mergeGroup []Tensor + +func (g mergeGroup) WriteTo(w io.Writer) (int64, error) { + for _, t := range g { + if _, err := t.WriteTo(w); err != nil { + return 0, err + } + } + + return 0, nil +} diff --git a/convert/tensor_test.go b/convert/tensor_test.go index ea12d0f5..0b2db5ba 100644 --- a/convert/tensor_test.go +++ b/convert/tensor_test.go @@ -9,6 +9,8 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/fs/ggml" "github.com/pdevine/tensor" ) @@ -302,3 +304,99 @@ func TestSplitDim(t *testing.T) { } }) } + +func TestMerge(t *testing.T) { + unmatched := []Tensor{ + &fakeTensor{ + name: "a.0.b", + shape: []uint64{5, 2}, + data: []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + }, + &fakeTensor{ + name: "a.1.b", + shape: []uint64{5, 2}, + data: []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29}, + }, + &fakeTensor{ + name: "c.0.d", + shape: []uint64{5, 2}, + data: []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39}, + }, + &fakeTensor{ + name: "c.1.d", + shape: []uint64{5, 2}, + data: []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49}, + }, + &fakeTensor{ + name: "e.0.f", + shape: []uint64{5, 2}, + data: []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59}, + }, + } + + checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) { + for i := range n { + got := matched[i] + if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" { + t.Errorf("unexpected (-want +got):\n%s", diff) + } + + var b bytes.Buffer + if _, err := got.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, 20) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + offset := 10 + (i * 20) + want := make([]float32, 20) + for j := range 20 { + want[j] = float32(offset + j) + } + + if diff := cmp.Diff(want, f32s); diff != "" { + t.Errorf("unexpected data (-want +got):\n%s", diff) + } + } + } + + t.Run("single merge", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}) + if len(unmatched) != 3 { + t.Error("expected 3 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 1 { + t.Error("expected 1 merged tensor, got", len(matched)) + } + + checkMatched(t, 1, matched) + }) + + t.Run("multiple merges", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"}) + if len(unmatched) != 1 { + t.Error("expected 1 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 2 { + t.Error("expected 2 merged tensor, got", len(matched)) + } + + checkMatched(t, 2, matched) + }) + + t.Run("no match", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"}) + if len(unmatched) != 5 { + t.Error("expected 5 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 0 { + t.Error("expected no merged tensors, got", len(matched)) + } + }) +} From 65bff664cb39ed16a1fa814b0228e4e48d7234ba Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Jun 2025 12:32:51 -0700 Subject: [PATCH 07/24] build speedups (#11142) Enable parallel building of the GPU architectures. 
--- CMakePresets.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 0b70d8ba..3234ce2c 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,7 +22,7 @@ "inherits": [ "CUDA" ], "cacheVariables": { "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, { @@ -30,7 +30,7 @@ "inherits": [ "CUDA" ], "cacheVariables": { "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, { @@ -58,6 +58,7 @@ "name": "ROCm 6", "inherits": [ "ROCm" ], "cacheVariables": { + "CMAKE_HIP_FLAGS": "-parallel-jobs=4", "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-" } } From 2bb69b40c7f6f2783290f0b7e4f7d5ec0a41f69c Mon Sep 17 00:00:00 2001 From: AJ Date: Mon, 23 Jun 2025 21:51:12 +0530 Subject: [PATCH 08/24] readme: add ai-hub to community integrations (#11169) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e148f9af..366fe94b 100644 --- a/README.md +++ b/README.md @@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.) +- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.) ### Cloud From 1c6669e64cc8a482fbf1e35c0249f17b35a4e87a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 23 Jun 2025 14:07:00 -0700 Subject: [PATCH 09/24] Re-remove cuda v11 (#10694) * Re-remove cuda v11 Revert the revert - drop v11 support requiring drivers newer than Feb 23 This reverts commit c6bcdc4223c50071b59a19c42cc54ec9932f696f. * Simplify layout With only one version of the GPU libraries, we can simplify things down somewhat. (Jetsons still require special handling) * distinct sbsa variant for linux arm64 This avoids accidentally trying to load the sbsa cuda libraries on a jetson system which results in crashes. 
* temporary prevent rocm+cuda mixed loading --- .github/workflows/release.yaml | 7 ---- .github/workflows/test.yaml | 6 ++-- CMakeLists.txt | 11 ++++--- CMakePresets.json | 13 -------- Dockerfile | 24 ++++---------- discover/cuda_common.go | 4 +++ discover/path.go | 2 +- docs/gpu.md | 2 +- docs/troubleshooting.md | 2 +- ...rary-prevent-rocm-cuda-mixed-loading.patch | 32 +++++++++++++++++++ llm/server.go | 2 +- ml/backend/ggml/ggml/src/ggml-backend-reg.cpp | 12 +++++-- scripts/build_windows.ps1 | 14 -------- scripts/env.sh | 2 -- 14 files changed, 67 insertions(+), 66 deletions(-) create mode 100644 llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f423106e..4e5a5d47 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -103,11 +103,6 @@ jobs: arch: [amd64] preset: ['CPU'] include: - - os: windows - arch: amd64 - preset: 'CUDA 11' - install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe - cuda-version: '11.3' - os: windows arch: amd64 preset: 'CUDA 12' @@ -324,8 +319,6 @@ jobs: case "$COMPONENT" in bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 27e229fc..2e709339 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,7 +46,7 @@ jobs: include: - preset: CPU - preset: CUDA - container: nvidia/cuda:11.8.0-devel-ubuntu22.04 + container: nvidia/cuda:12.8.1-devel-ubuntu22.04 flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm container: rocm/dev-ubuntu-22.04:6.1.2 @@ -78,7 +78,7 @@ jobs: include: - preset: CPU - preset: CUDA - install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe + install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe @@ -102,7 +102,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait + Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path diff --git a/CMakeLists.txt b/CMakeLists.txt index c005d014..b3b5438a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,14 +78,13 @@ if(CMAKE_CUDA_COMPILER) find_package(CUDAToolkit) 
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) - set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR}) install(TARGETS ggml-cuda RUNTIME_DEPENDENCIES DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR} PRE_INCLUDE_REGEXES cublas cublasLt cudart PRE_EXCLUDE_REGEXES ".*" - RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA - LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA + RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA + LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA ) endif() @@ -116,7 +115,11 @@ if(CMAKE_HIP_COMPILER) set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm) install(TARGETS ggml-hip - RUNTIME_DEPENDENCIES + RUNTIME_DEPENDENCY_SET rocm + RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP + LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP + ) + install(RUNTIME_DEPENDENCY_SET rocm DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR} PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf PRE_EXCLUDE_REGEXES ".*" diff --git a/CMakePresets.json b/CMakePresets.json index 3234ce2c..9a4dcc06 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -17,14 +17,6 @@ "name": "CUDA", "inherits": [ "Default" ] }, - { - "name": "CUDA 11", - "inherits": [ "CUDA" ], - "cacheVariables": { - "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" - } - }, { "name": "CUDA 12", "inherits": [ "CUDA" ], @@ -79,11 +71,6 @@ "configurePreset": "CUDA", "targets": [ "ggml-cuda" ] }, - { - "name": "CUDA 11", - "inherits": [ "CUDA" ], - "configurePreset": "CUDA 11" - }, { "name": "CUDA 12", "inherits": [ "CUDA" ], diff --git a/Dockerfile b/Dockerfile index 4c6619e7..da2ae3db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,12 +7,13 @@ ARG JETPACK5VERSION=r35.4.1 ARG JETPACK6VERSION=r36.4.0 ARG CMAKEVERSION=3.31.2 -# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version +# We require gcc v10 minimum. 
v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64 RUN yum install -y yum-utils \ && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \ && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \ && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \ + && dnf install -y ccache \ && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH @@ -38,15 +39,6 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --build --parallel --preset 'CPU' \ && cmake --install build --component CPU --strip --parallel 8 -FROM base AS cuda-11 -ARG CUDA11VERSION=11.3 -RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-} -ENV PATH=/usr/local/cuda-11/bin:$PATH -RUN --mount=type=cache,target=/root/.ccache \ - cmake --preset 'CUDA 11' \ - && cmake --build --parallel --preset 'CUDA 11' \ - && cmake --install build --component CUDA --strip --parallel 8 - FROM base AS cuda-12 ARG CUDA12VERSION=12.8 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-} @@ -98,17 +90,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go build -trimpath -buildmode=pie -o /bin/ollama . FROM --platform=linux/amd64 scratch AS amd64 -COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 -COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 +COPY --from=cuda-12 dist/lib/ollama /lib/ollama FROM --platform=linux/arm64 scratch AS arm64 -COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 -COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 -COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5 -COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6 +COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa +COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5 +COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6 FROM scratch AS rocm -COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm +COPY --from=rocm-6 dist/lib/ollama /lib/ollama FROM ${FLAVOR} AS archive COPY --from=cpu dist/lib/ollama /lib/ollama diff --git a/discover/cuda_common.go b/discover/cuda_common.go index 04829529..3c7cb669 100644 --- a/discover/cuda_common.go +++ b/discover/cuda_common.go @@ -3,6 +3,7 @@ package discover import ( + "fmt" "log/slog" "os" "regexp" @@ -55,10 +56,13 @@ func cudaVariant(gpuInfo CudaGPUInfo) string { } } } + return "sbsa" } // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) { + // The detected driver is older than Feb 2023 + slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) return "v11" } return "v12" diff --git a/discover/path.go b/discover/path.go index 8a20d8c2..68e63009 100644 --- a/discover/path.go +++ b/discover/path.go @@ -12,7 +12,7 @@ import ( // '../lib/ollama' on Linux and the executable's directory on macOS // note: distribution builds, additional GPU-specific libraries are // found in subdirectories of the returned path, such as -// 'cuda_v11', 'cuda_v12', 'rocm', etc. +// 'cuda_v12', 'rocm', etc. 
var LibOllamaPath string = func() string { exe, err := os.Executable() if err != nil { diff --git a/docs/gpu.md b/docs/gpu.md index b54c66ab..61ff6e45 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -1,6 +1,6 @@ # GPU ## Nvidia -Ollama supports Nvidia GPUs with compute capability 5.0+. +Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer. Check your compute compatibility to see if your card is supported: [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index ba5487fe..995b33ac 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto In the server log, you will see a message that looks something like this (varies from release to release): ``` -Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5] +Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5] ``` **Experimental LLM Library Override** diff --git a/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch b/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch new file mode 100644 index 00000000..205dc64a --- /dev/null +++ b/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch @@ -0,0 +1,32 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Sun, 22 Jun 2025 09:22:05 -0700 +Subject: [PATCH] temporary prevent rocm+cuda mixed loading + +--- + ggml/src/ggml-backend-reg.cpp | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp +index 4e67d243..8f49f084 100644 +--- a/ggml/src/ggml-backend-reg.cpp ++++ b/ggml/src/ggml-backend-reg.cpp +@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) { + + ggml_backend_load_best("blas", silent, dir_path); + ggml_backend_load_best("cann", silent, dir_path); +- ggml_backend_load_best("cuda", silent, dir_path); +- ggml_backend_load_best("hip", silent, dir_path); ++ ++ // Avoid mixed hip+cuda configurations ++ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES"); ++ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); ++ if (!hip_devices && !rocr_devices) { ++ ggml_backend_load_best("cuda", silent, dir_path); ++ } else { ++ ggml_backend_load_best("hip", silent, dir_path); ++ } ++ + ggml_backend_load_best("kompute", silent, dir_path); + ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("rpc", silent, dir_path); diff --git a/llm/server.go b/llm/server.go index 373f6fae..373eaf1f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -311,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--mmproj", projectors[0]) } - // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. + // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. 
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running // without any LD_LIBRARY_PATH flags for { diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index 4e67d243..8f49f084 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("blas", silent, dir_path); ggml_backend_load_best("cann", silent, dir_path); - ggml_backend_load_best("cuda", silent, dir_path); - ggml_backend_load_best("hip", silent, dir_path); + + // Avoid mixed hip+cuda configurations + const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES"); + const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); + if (!hip_devices && !rocr_devices) { + ggml_backend_load_best("cuda", silent, dir_path); + } else { + ggml_backend_load_best("hip", silent, dir_path); + } + ggml_backend_load_best("kompute", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index e4c0b3d9..eaac2c60 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -27,7 +27,6 @@ function checkEnv() { $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } # Locate CUDA versions - # Note: this assumes every version found will be built $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path @@ -94,19 +93,6 @@ function buildOllama() { $hashEnv = @{} Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } - if ("$script:CUDA_DIRS".Contains("v11")) { - $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }} - $env:CUDAToolkit_ROOT=$hashEnv[$v11] - write-host "Building CUDA v11 backend libraries" - # Note: cuda v11 requires msvc 2019 so force the older generator - # to avoid 2022 (or newer) from being used as the default - & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "CUDA" --strip - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - } if ("$script:CUDA_DIRS".Contains("v12")) { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }} $env:CUDAToolkit_ROOT=$hashEnv[$v12] diff --git a/scripts/env.sh b/scripts/env.sh index c5e6f530..65a970bd 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \ --build-arg=GOFLAGS \ --build-arg=OLLAMA_CUSTOM_CPU_DEFS \ --build-arg=OLLAMA_SKIP_CUDA_GENERATE \ - --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \ --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \ - --build-arg=CUDA_V11_ARCHITECTURES \ --build-arg=CUDA_V12_ARCHITECTURES \ --build-arg=OLLAMA_SKIP_ROCM_GENERATE \ --build-arg=OLLAMA_FAST_BUILD \ From 10a8e04a8dcdd06de4ccaa3c2fe19452d2714b15 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 23 Jun 2025 15:52:50 -0700 Subject: [PATCH 10/24] avoid context overflow (#11175) For smaller context models, make sure we do not exceed the training size. 
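
For reference, the guard this patch adds to NewLlamaServer (shown in the diff below) amounts to clamping the total requested context so each parallel sequence never exceeds the model's training context. A minimal standalone sketch of that logic follows; the helper name and parameters are illustrative only, not part of the patch:

    package main

    import (
        "fmt"
        "log/slog"
    )

    // clampNumCtx caps the requested total context so that each of the
    // numParallel sequences gets at most trainCtx tokens. Hypothetical helper
    // mirroring the check added to llm/server.go in this patch.
    func clampNumCtx(numCtx, numParallel int, trainCtx uint64) int {
        if trainCtx > 0 && numCtx/numParallel > int(trainCtx) {
            slog.Warn("requested context size too large for model",
                "num_ctx", numCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
            return int(trainCtx) * numParallel
        }
        return numCtx
    }

    func main() {
        // A model trained with a 2048-token context, requested with
        // num_ctx=8192 across 2 parallel sequences, is clamped to 2048*2.
        fmt.Println(clampNumCtx(8192, 2, 2048)) // prints 4096
    }
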
--- llm/server.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llm/server.go b/llm/server.go index 373eaf1f..7d921f14 100644 --- a/llm/server.go +++ b/llm/server.go @@ -139,6 +139,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a gpus = discover.GetCPUInfo() } + // Verify the requested context size is <= the model training size + trainCtx := f.KV().ContextLength() + if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 { + slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx) + opts.NumCtx = int(trainCtx) * numParallel + } + estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel) if len(gpus) > 1 || gpus[0].Library != "cpu" { switch { From c85c0ebf895016c36bab10be4dd92f594c400df3 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 13:26:55 -0700 Subject: [PATCH 11/24] CI: switch windows to vs 2022 (#11184) * CI: switch windows to vs 2022 * ci: fix regex match --- .github/workflows/release.yaml | 9 ++++++--- .github/workflows/test.yaml | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4e5a5d47..97c07ccc 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -155,6 +155,9 @@ jobs: echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append - if: matrix.preset == 'CPU' run: | echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append @@ -173,8 +176,8 @@ jobs: key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }} - name: Build target "${{ matrix.preset }}" run: | - Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' + Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' cmake --preset "${{ matrix.preset }}" cmake --build --parallel --preset "${{ matrix.preset }}" cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8 @@ -241,7 +244,7 @@ jobs: dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe windows-sign: - runs-on: windows-2022 + runs-on: windows environment: release needs: [windows-depends, windows-build] steps: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2e709339..00b2ad79 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -36,7 +36,7 @@ jobs: | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))" } - echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | 
tee -a $GITHUB_OUTPUT + echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT linux: needs: [changes] @@ -120,6 +120,9 @@ jobs: echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }} uses: actions/cache/save@v4 with: @@ -133,8 +136,8 @@ jobs: path: ${{ github.workspace }}\.ccache key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }} - run: | - Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' + Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} cmake --build --parallel --preset "${{ matrix.preset }}" env: From 405d2f628fe59ef1269861650d16e2943645c45b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 15:27:09 -0700 Subject: [PATCH 12/24] ci: rocm parallel builds on windows (#11187) The preset CMAKE_HIP_FLAGS isn't getting used on Windows. This passes the parallel flag in through the C/CXX flags, along with suppression for some log spew warnings to quiet down the build. 
--- .github/workflows/release.yaml | 4 +++- .github/workflows/test.yaml | 2 +- scripts/build_windows.ps1 | 9 +++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 97c07ccc..32dd5e82 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -108,11 +108,13 @@ jobs: preset: 'CUDA 12' install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe cuda-version: '12.8' + flags: '' - os: windows arch: amd64 preset: 'ROCm 6' install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe rocm-version: '6.2' + flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"' runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release env: @@ -178,7 +180,7 @@ jobs: run: | Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' - cmake --preset "${{ matrix.preset }}" + cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} cmake --build --parallel --preset "${{ matrix.preset }}" cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8 env: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 00b2ad79..4d8cf773 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -82,7 +82,7 @@ jobs: flags: '-DCMAKE_CUDA_ARCHITECTURES=80' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe - flags: '-DAMDGPU_TARGETS=gfx1010' + flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"' runs-on: windows steps: - run: | diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index eaac2c60..27f3eb9d 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -113,12 +113,17 @@ function buildOllama() { $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe" $env:HIP_PLATFORM="amd" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - & cmake --fresh --preset "ROCm 6" -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ --install-prefix $script:DIST_DIR + & cmake --fresh --preset "ROCm 6" -G Ninja ` + -DCMAKE_C_COMPILER=clang ` + -DCMAKE_CXX_COMPILER=clang++ ` + -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" ` + -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" ` + --install-prefix $script:DIST_DIR if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} $env:HIPCXX="" $env:HIP_PLATFORM="" $env:CMAKE_PREFIX_PATH="" - & cmake --build --preset "ROCm" --config Release --parallel $script:JOBS + & cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build --component "HIP" --strip if ($LASTEXITCODE -ne 0) 
{ exit($LASTEXITCODE)} From 03274a6b2f2f6bd5cb109d5aa00c9b9d183500cf Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 18:45:01 -0700 Subject: [PATCH 13/24] ci: recombine linux amd64 binaries (#11188) Glue the rocm and archive builds back together. --- .github/workflows/release.yaml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 32dd5e82..f56231aa 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -468,8 +468,18 @@ jobs: - uses: actions/download-artifact@v4 with: pattern: dist-linux-* - path: dist - merge-multiple: true + path: stage + merge-multiple: false + - name: Merge linux amd64 payload + working-directory: stage/dist-linux-amd64-archive + run: | + tar zxf ollama-linux-amd64.tgz + tar zxf ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz + rm -f ollama-linux-amd64.tgz ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz + tar -c -f- --owner 0 --group 0 . | pigz -9vc > ../ollama-linux-amd64.tgz + - name: Cleanup linux payloads + run: | + find stage -name ollama-linux\*.tgz -exec mv {} dist/ \; - run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt working-directory: dist - name: Create or update Release From 4b4a90f233ff807994f8de78b1f9b1687b6328a4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 18:59:22 -0700 Subject: [PATCH 14/24] ci: pick up arm sbsa cuda libs (#11192) --- .github/workflows/release.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f56231aa..5113b98a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -322,11 +322,12 @@ jobs: - run: | for COMPONENT in bin/* lib/ollama/*; do case "$COMPONENT" in - bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; - lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; - lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; + bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_sbsa/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; + lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; + lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; esac done working-directory: dist/${{ matrix.os }}-${{ matrix.arch }} From f08534137b127e1159f4b162881b8c2defc93158 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 20:27:43 -0700 Subject: [PATCH 15/24] ci: include dependencies --- .github/workflows/release.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5113b98a..9cfedb6b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -322,15 +322,21 @@ jobs: - run: | for COMPONENT in bin/* 
lib/ollama/*; do case "$COMPONENT" in - bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_sbsa/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; - lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; - lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; + bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_sbsa/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; + lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; + lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; esac done working-directory: dist/${{ matrix.os }}-${{ matrix.arch }} + - run: | + echo "Manifests" + for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do + echo $ARCHIVE + cat $ARCHIVE + done - run: | for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); From ad118d8b1306063d0ab2e967a20bd7914a135b28 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Jun 2025 21:00:15 -0700 Subject: [PATCH 16/24] ci: arm sbsa fixes (#11194) --- .github/workflows/release.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9cfedb6b..5178eb5f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -322,12 +322,12 @@ jobs: - run: | for COMPONENT in bin/* lib/ollama/*; do case "$COMPONENT" in - bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_sbsa/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; - lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; - lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; + bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; + lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; + lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; esac done working-directory: dist/${{ matrix.os }}-${{ matrix.arch }} From 73b642e6f341287163c784e1e99a18426ee2ccea Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 25 Jun 2025 21:47:09 
-0700 Subject: [PATCH 17/24] add new gemma model (#11204) * update patches * cherry pick metal mean kernel * cherry pick cuda mean kernel * gemma3n --- convert/convert.go | 2 + convert/convert_gemma3n.go | 168 + fs/config.go | 1 + fs/ggml/ggml.go | 5 + fs/ggml/gguf.go | 4 + go.mod | 2 +- llama/patches/0005-solar-pro.patch | 2 +- ...nsure-KV-cache-is-fully-defragmented.patch | 6 +- ...15-add-argsort-and-cuda-copy-for-i32.patch | 12 +- .../0019-metal-add-mean-kernel-14267.patch | 169 + .../0020-CUDA-add-mean-operation-14313.patch | 5089 +++++++++++++++++ ml/backend.go | 8 + ml/backend/ggml/ggml.go | 57 +- ml/backend/ggml/ggml/src/ggml-cuda/common.cuh | 20 + .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 5 + ml/backend/ggml/ggml/src/ggml-cuda/mean.cu | 19 + ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh | 3 + ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu | 23 +- .../ggml/ggml/src/ggml-cuda/sumrows.cuh | 1 - .../src/ggml-metal/ggml-metal-embed.metal | 48 +- .../ggml/ggml/src/ggml-metal/ggml-metal.m | 33 +- .../ggml/ggml/src/ggml-metal/ggml-metal.metal | 48 +- model/models/gemma3n/model.go | 52 + model/models/gemma3n/model_text.go | 360 ++ model/models/models.go | 1 + 25 files changed, 6084 insertions(+), 54 deletions(-) create mode 100644 convert/convert_gemma3n.go create mode 100644 llama/patches/0019-metal-add-mean-kernel-14267.patch create mode 100644 llama/patches/0020-CUDA-add-mean-operation-14313.patch create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mean.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh create mode 100644 model/models/gemma3n/model.go create mode 100644 model/models/gemma3n/model_text.go diff --git a/convert/convert.go b/convert/convert.go index 4a6df66c..63b3bf66 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -190,6 +190,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { conv = &gemma2Model{} case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration": conv = &gemma3Model{Architecture: p.Architectures[0]} + case "Gemma3nForConditionalGeneration": + conv = &gemma3nModel{} case "Phi3ForCausalLM": conv = &phi3Model{} case "Qwen2ForCausalLM": diff --git a/convert/convert_gemma3n.go b/convert/convert_gemma3n.go new file mode 100644 index 00000000..bf667e38 --- /dev/null +++ b/convert/convert_gemma3n.go @@ -0,0 +1,168 @@ +package convert + +import ( + "slices" + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + "gonum.org/v1/gonum/stat/distuv" +) + +type gemma3nModel struct { + ModelParameters + + TextModel struct { + ActivationSparsityPattern []float32 `json:"activation_sparsity_pattern"` + AltupActiveIdx uint32 `json:"altup_active_idx"` + AltupCoefClip float32 `json:"altup_coef_clip"` + AltupCorrectScale bool `json:"altup_correct_scale"` + AltupLRMultiplier float32 `json:"altup_lr_multiplier"` + AltupNumInputs uint32 `json:"altup_num_inputs"` + HeadDim uint32 `json:"head_dim"` + HiddenSize uint32 `json:"hidden_size"` + HiddenSizePerLayerInput uint32 `json:"hidden_size_per_layer_input"` + IntermediateSize uint32 `json:"intermediate_size"` + LaurelRank uint32 `json:"laurel_rank"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + NumKVSharedLayers uint32 `json:"num_kv_shared_layers"` + RMSNormEPS float32 `json:"rms_norm_eps"` + RopeLocalBaseFreq float32 `json:"rope_local_base_freq"` + RopeTheta float32 
`json:"rope_theta"` + SlidingWindow uint32 `json:"sliding_window"` + LayerTypes []string `json:"layer_types"` + } `json:"text_config"` + VisionModel struct{} `json:"vision_config"` +} + +func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV { + kv := m.ModelParameters.KV(t) + kv["general.architecture"] = "gemma3n" + kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) { + norm := distuv.Normal{Mu: 0, Sigma: 1} + for _, v := range m.TextModel.ActivationSparsityPattern { + if !yield(float32(norm.Quantile(float64(v)))) { + break + } + } + }) + kv["gemma3n.altup.active_idx"] = m.TextModel.AltupActiveIdx + kv["gemma3n.altup.correct_scale"] = m.TextModel.AltupCorrectScale + kv["gemma3n.altup.lr_multiplier"] = m.TextModel.AltupLRMultiplier + kv["gemma3n.altup.num_inputs"] = m.TextModel.AltupNumInputs + kv["gemma3n.attention.head_count_kv"] = m.TextModel.NumKeyValueHeads + kv["gemma3n.attention.head_count"] = m.TextModel.NumAttentionHeads + kv["gemma3n.attention.layer_norm_rms_epsilon"] = m.TextModel.RMSNormEPS + kv["gemma3n.attention.sliding_window"] = m.TextModel.SlidingWindow + kv["gemma3n.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) { + for _, t := range m.TextModel.LayerTypes { + if !yield(t == "sliding_attention") { + break + } + } + }) + kv["gemma3n.attention.shared_kv_layers"] = m.TextModel.NumKVSharedLayers + kv["gemma3n.block_count"] = m.TextModel.NumHiddenLayers + kv["gemma3n.context_length"] = m.TextModel.MaxPositionEmbeddings + kv["gemma3n.embedding_length_per_layer_input"] = m.TextModel.HiddenSizePerLayerInput + kv["gemma3n.embedding_length"] = m.TextModel.HiddenSize + kv["gemma3n.feed_forward_length"] = m.TextModel.IntermediateSize + kv["gemma3n.head_dim"] = m.TextModel.HeadDim + kv["gemma3n.laurel_rank"] = m.TextModel.LaurelRank + kv["gemma3n.num_kv_shared_layers"] = m.TextModel.NumKVSharedLayers + kv["gemma3n.rope.freq_base_local"] = m.TextModel.RopeLocalBaseFreq + kv["gemma3n.rope.freq_base"] = m.TextModel.RopeTheta + return kv +} + +func (m *gemma3nModel) Tensors(ts []Tensor) []*ggml.Tensor { + out, ts := mergeTensors(ts, + merge{"altup_proj.*.weight", "altup_proj.weight"}, + merge{"altup_unembd_proj.*.weight", "altup_unembd_proj.weight"}, + ) + + for _, t := range ts { + switch { + case strings.Contains(t.Name(), "audio_tower"), + strings.Contains(t.Name(), "embed_audio"), + strings.Contains(t.Name(), "vision_tower"), + strings.Contains(t.Name(), "embed_vision"): + // TODO: handle audio and vision towers + continue + case strings.Contains(t.Name(), "altup_predict_coef"), + strings.Contains(t.Name(), "altup_correct_coef"): + if m.TextModel.AltupCoefClip > 0 { + t.SetRepacker(func(name string, data []float32, shape []uint64) (_ []float32, err error) { + dims := make([]int, len(shape)) + for i := range shape { + dims[i] = int(shape[i]) + } + + var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + t, err = tensor.Clamp(t, -m.TextModel.AltupCoefClip, m.TextModel.AltupCoefClip) + if err != nil { + return nil, err + } + + if err := t.Reshape(t.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(t.(*tensor.Dense)) + }) + } + } + + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (m *gemma3nModel) Replacements() []string { + return []string{ + "model.language_model.embed_tokens_per_layer", "per_layer_token_embd", + "model.language_model.embed_tokens", "token_embd", + 
"model.language_model.per_layer_model_projection", "per_layer_model_proj", + "model.language_model.per_layer_projection_norm", "per_layer_proj_norm", "model.language_model.altup_projections", "altup_proj", + "model.language_model.altup_unembed_projections", "altup_unembd_proj", + "model.language_model.norm", "output_norm", + "model.language_model.layers", "blk", + + "input_layernorm", "attn_norm", + "self_attn.q_proj", "attn_q", + "self_attn.q_norm", "attn_q_norm", + "self_attn.k_proj", "attn_k", + "self_attn.k_norm", "attn_k_norm", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "mlp.gate_proj", "ffn_gate", + "mlp.up_proj", "ffn_up", + "mlp.down_proj", "ffn_down", + "post_feedforward_layernorm", "post_ffw_norm", + "per_layer_input_gate", "inp_gate", + "per_layer_projection", "proj", + "post_per_layer_input_norm", "post_norm", + "altup.", "altup_", + "modality_router", "router", + "prediction_coefs", "predict_coef", + "correction_coefs", "correct_coef", + "correct_output_scale", "correct_scale.weight", + "laurel.", "laurel_", + "linear_left", "l", + "linear_right", "r", + "post_laurel_norm", "post_norm", + } +} diff --git a/fs/config.go b/fs/config.go index 89a1b134..3d6ae90e 100644 --- a/fs/config.go +++ b/fs/config.go @@ -10,4 +10,5 @@ type Config interface { Strings(string, ...[]string) []string Ints(string, ...[]int32) []int32 Floats(string, ...[]float32) []float32 + Bools(string, ...[]bool) []bool } diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index f3fbdbaa..a0c2003f 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -166,6 +166,11 @@ func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 { return val.values } +func (kv KV) Bools(key string, defaultValue ...[]bool) []bool { + val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]}) + return val.values +} + func (kv KV) OllamaEngineRequired() bool { return slices.Contains([]string{ "gemma3", diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index 33b596cc..413eab5e 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -609,6 +609,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { err = writeGGUFArray(ws, ggufTypeString, v) case *array[string]: err = writeGGUFArray(ws, ggufTypeString, v.values) + case []bool: + err = writeGGUFArray(ws, ggufTypeBool, v) + case *array[bool]: + err = writeGGUFArray(ws, ggufTypeBool, v.values) default: return fmt.Errorf("improper type for '%s'", k) } diff --git a/go.mod b/go.mod index 6de5959b..ec3f61bb 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,7 @@ require ( github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c golang.org/x/image v0.22.0 golang.org/x/tools v0.30.0 + gonum.org/v1/gonum v0.15.0 ) require ( @@ -44,7 +45,6 @@ require ( github.com/xtgo/set v1.0.0 // indirect go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect - gonum.org/v1/gonum v0.15.0 // indirect gorgonia.org/vecf32 v0.9.0 // indirect gorgonia.org/vecf64 v0.9.0 // indirect ) diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index deb53c22..b4553149 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -150,7 +150,7 @@ index 4cce5166..7f6617fa 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 
3a4e72a3..831b68c0 100644 +index 3a4e72a3..db62973f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { diff --git a/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch index 52116ce3..82fe219c 100644 --- a/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch @@ -22,10 +22,10 @@ multiple batches of processing until everything is complete. 4 files changed, 59 insertions(+), 79 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index c22687e4..c5948e8f 100644 +index dca22d8b..1f3a3956 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -947,9 +947,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot if (!kv_self->find_slot(ubatch)) { @@ -41,7 +41,7 @@ index c22687e4..c5948e8f 100644 } ggml_backend_sched_reset(sched.get()); -@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter( +@@ -1965,9 +1968,12 @@ void llama_context::opt_epoch_iter( // TODO: not sure if this is needed if (!kv_self->find_slot(ubatch)) { diff --git a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch index b71295c7..174c45a5 100644 --- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch +++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch @@ -10,10 +10,10 @@ Subject: [PATCH] add argsort and cuda copy for i32 3 files changed, 192 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index becdae07..7a44b6cf 100644 +index 955fec59..654e2f28 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32( +@@ -6822,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32( } } @@ -59,7 +59,7 @@ index becdae07..7a44b6cf 100644 void ggml_compute_forward_argsort( const ggml_compute_params * params, ggml_tensor * dst) { -@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort( +@@ -6833,6 +6872,10 @@ void ggml_compute_forward_argsort( { ggml_compute_forward_argsort_f32(params, dst); } break; @@ -195,7 +195,7 @@ index 607ded85..53b02634 100644 + } } diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu -index 2d46176e..47383486 100644 +index d027271f..4abd01d7 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { @@ -257,7 +257,7 @@ index 2d46176e..47383486 100644 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q8_0 * dsti = (block_q8_0 *) cdsti; -@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg +@@ -633,6 +678,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, 
graph_cpynode_index); @@ -266,7 +266,7 @@ index 2d46176e..47383486 100644 } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); -@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { +@@ -688,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { return (void*) cpy_f32_f16; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { return (void*) cpy_f32_f16; diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch new file mode 100644 index 00000000..a52f0fdf --- /dev/null +++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch @@ -0,0 +1,169 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Georgi Gerganov +Date: Thu, 19 Jun 2025 08:05:21 +0300 +Subject: [PATCH] metal : add mean kernel (#14267) + +* metal : add mean kernel + +ggml-ci + +* cont : dedup implementation + +ggml-ci +--- + ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++--- + ggml/src/ggml-metal/ggml-metal.metal | 48 ++++++++++++++++++++++------ + 2 files changed, 67 insertions(+), 14 deletions(-) + +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index ee4f2dcb..f20f5615 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { + GGML_METAL_KERNEL_TYPE_COS, + GGML_METAL_KERNEL_TYPE_NEG, + GGML_METAL_KERNEL_TYPE_SUM_ROWS, ++ GGML_METAL_KERNEL_TYPE_MEAN, + GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, + GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, + GGML_METAL_KERNEL_TYPE_ARGMAX, +@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); ++ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); +@@ -1634,6 +1636,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex + case GGML_OP_LOG: + return false; // TODO: implement + case GGML_OP_SUM_ROWS: ++ case GGML_OP_MEAN: + case GGML_OP_SOFT_MAX: + case GGML_OP_GROUP_NORM: + return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); +@@ -2362,11 +2365,30 @@ static bool ggml_metal_encode_node( + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SUM_ROWS: ++ case GGML_OP_MEAN: + { + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + +- id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; ++ id pipeline = nil; ++ ++ switch (dst->op) { ++ case GGML_OP_SUM_ROWS: ++ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; ++ break; ++ case GGML_OP_MEAN: ++ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline; ++ break; ++ default: ++ GGML_ABORT("fatal error"); ++ } ++ ++ int nth = 32; // SIMD width ++ ++ while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { ++ nth *= 2; ++ } + ++ nth = MIN(nth, ne00); + + ggml_metal_kargs_sum_rows args = { + /*.ne00 =*/ ne00, +@@ 
-2396,11 +2418,12 @@ static bool ggml_metal_encode_node( + }; + + [encoder setComputePipelineState:pipeline]; +- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; +- [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; +- [encoder setBytes:&args length:sizeof(args) atIndex:2]; ++ [encoder setBytes:&args length:sizeof(args) atIndex:0]; ++ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; ++ [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; ++ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + +- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; ++ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { +diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal +index 9cfddf45..08e8d807 100644 +--- a/ggml/src/ggml-metal/ggml-metal.metal ++++ b/ggml/src/ggml-metal/ggml-metal.metal +@@ -956,31 +956,61 @@ kernel void kernel_neg( + dst[tpig] = -src0[tpig]; + } + ++template + kernel void kernel_sum_rows( ++ constant ggml_metal_kargs_sum_rows & args, + device const float * src0, + device float * dst, +- constant ggml_metal_kargs_sum_rows & args, +- uint3 tpig[[thread_position_in_grid]]) { +- int64_t i3 = tpig.z; +- int64_t i2 = tpig.y; +- int64_t i1 = tpig.x; ++ threadgroup float * shmem_f32 [[threadgroup(0)]], ++ uint3 tgpig[[threadgroup_position_in_grid]], ++ ushort3 tpitg[[thread_position_in_threadgroup]], ++ ushort sgitg[[simdgroup_index_in_threadgroup]], ++ ushort tiisg[[thread_index_in_simdgroup]], ++ ushort3 ntg[[threads_per_threadgroup]]) { ++ int64_t i3 = tgpig.z; ++ int64_t i2 = tgpig.y; ++ int64_t i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + ++ if (sgitg == 0) { ++ shmem_f32[tiisg] = 0.0f; ++ } ++ + device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); + device float * dst_row = (device float *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); + +- float row_sum = 0; ++ float sumf = 0; + +- for (int64_t i0 = 0; i0 < args.ne00; i0++) { +- row_sum += src_row[i0]; ++ for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { ++ sumf += src_row[i0]; + } + +- dst_row[0] = row_sum; ++ sumf = simd_sum(sumf); ++ ++ threadgroup_barrier(mem_flags::mem_threadgroup); ++ ++ if (tiisg == 0) { ++ shmem_f32[sgitg] = sumf; ++ } ++ ++ threadgroup_barrier(mem_flags::mem_threadgroup); ++ ++ sumf = shmem_f32[tiisg]; ++ sumf = simd_sum(sumf); ++ ++ if (tpitg.x == 0) { ++ dst_row[0] = norm ? 
sumf / args.ne00 : sumf; ++ } + } + ++typedef decltype(kernel_sum_rows) kernel_sum_rows_t; ++ ++template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows; ++template [[host_name("kernel_mean")]] kernel kernel_sum_rows_t kernel_sum_rows; ++ + template + kernel void kernel_soft_max( + device const char * src0, diff --git a/llama/patches/0020-CUDA-add-mean-operation-14313.patch b/llama/patches/0020-CUDA-add-mean-operation-14313.patch new file mode 100644 index 00000000..efcb1e8b --- /dev/null +++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch @@ -0,0 +1,5089 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Aman Gupta +Date: Sun, 22 Jun 2025 12:39:54 +0800 +Subject: [PATCH] CUDA: add mean operation (#14313) + +* CUDA: add mean operation + +* add back sum_rows_f32_cuda + +* Review: early exit if col!=0 +--- + ggml/src/ggml-cuda/common.cuh | 20 + + ggml/src/ggml-cuda/ggml-cuda.cu | 5 + + ggml/src/ggml-cuda/mean.cu | 19 + + ggml/src/ggml-cuda/mean.cuh | 3 + + ggml/src/ggml-cuda/sumrows.cu | 23 +- + ggml/src/ggml-cuda/sumrows.cuh | 1 - + tests/test-backend-ops.cpp | 2990 ++++++++++++++++--------------- + 7 files changed, 1554 insertions(+), 1507 deletions(-) + create mode 100644 ggml/src/ggml-cuda/mean.cu + create mode 100644 ggml/src/ggml-cuda/mean.cuh + +diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh +index 64fb4ff4..5b9a0fe3 100644 +--- a/ggml/src/ggml-cuda/common.cuh ++++ b/ggml/src/ggml-cuda/common.cuh +@@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { + #endif // FP16_AVAILABLE + } + ++// Row reduction kernel template - compute sum (norm=false) or mean (norm=true) ++template ++static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { ++ const int row = blockIdx.x; ++ const int col = threadIdx.x; ++ ++ float sum = 0.0f; ++ for (int i = col; i < ncols; i += blockDim.x) { ++ sum += x[row * ncols + i]; ++ } ++ ++ sum = warp_reduce_sum(sum); ++ ++ if (col != 0) { ++ return; ++ } ++ ++ dst[row] = norm ? 
sum / ncols : sum; ++} ++ + template + static __device__ __forceinline__ float warp_reduce_max(float x) { + #pragma unroll +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 4c829153..9e64e5ae 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -35,6 +35,7 @@ + #include "ggml-cuda/ssm-scan.cuh" + #include "ggml-cuda/sum.cuh" + #include "ggml-cuda/sumrows.cuh" ++#include "ggml-cuda/mean.cuh" + #include "ggml-cuda/tsembd.cuh" + #include "ggml-cuda/unary.cuh" + #include "ggml-cuda/upscale.cuh" +@@ -2322,6 +2323,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg + case GGML_OP_SUM_ROWS: + ggml_cuda_op_sum_rows(ctx, dst); + break; ++ case GGML_OP_MEAN: ++ ggml_cuda_op_mean(ctx, dst); ++ break; + case GGML_OP_SSM_CONV: + ggml_cuda_op_ssm_conv(ctx, dst); + break; +@@ -3211,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g + case GGML_OP_POOL_2D: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: ++ case GGML_OP_MEAN: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + return true; +diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu +new file mode 100644 +index 00000000..4b238a39 +--- /dev/null ++++ b/ggml/src/ggml-cuda/mean.cu +@@ -0,0 +1,19 @@ ++#include "mean.cuh" ++ ++void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ++ const ggml_tensor * src0 = dst->src[0]; ++ const float * src0_d = (const float *) src0->data; ++ float * dst_d = (float *) dst->data; ++ cudaStream_t stream = ctx.stream(); ++ ++ GGML_ASSERT(src0->type == GGML_TYPE_F32); ++ GGML_ASSERT(dst->type == GGML_TYPE_F32); ++ GGML_ASSERT(ggml_is_contiguous(src0)); ++ ++ const int64_t ncols = src0->ne[0]; ++ const int64_t nrows = ggml_nrows(src0); ++ ++ const dim3 block_dims(WARP_SIZE, 1, 1); ++ const dim3 block_nums(nrows, 1, 1); ++ reduce_rows_f32<<>>(src0_d, dst_d, ncols); ++} +diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh +new file mode 100644 +index 00000000..2b9b1043 +--- /dev/null ++++ b/ggml/src/ggml-cuda/mean.cuh +@@ -0,0 +1,3 @@ ++#include "common.cuh" ++ ++void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu +index 38dbf1b5..2eee08fa 100644 +--- a/ggml/src/ggml-cuda/sumrows.cu ++++ b/ggml/src/ggml-cuda/sumrows.cu +@@ -1,25 +1,9 @@ + #include "sumrows.cuh" + +-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { +- const int row = blockIdx.x; +- const int col = threadIdx.x; +- +- float sum = 0.0f; +- for (int i = col; i < ncols; i += blockDim.x) { +- sum += x[row * ncols + i]; +- } +- +- sum = warp_reduce_sum(sum); +- +- if (col == 0) { +- dst[row] = sum; +- } +-} +- + void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); +- k_sum_rows_f32<<>>(x, dst, ncols); ++ reduce_rows_f32<<>>(x, dst, ncols); + } + + void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + +- sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); ++ const dim3 block_dims(WARP_SIZE, 1, 1); ++ const dim3 block_nums(nrows, 1, 1); ++ ++ reduce_rows_f32<<>>(src0_d, dst_d, ncols); + } +diff 
--git a/ggml/src/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh +index 191db1c1..3431c599 100644 +--- a/ggml/src/ggml-cuda/sumrows.cuh ++++ b/ggml/src/ggml-cuda/sumrows.cuh +@@ -1,5 +1,4 @@ + #include "common.cuh" + + void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); +- + void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp +index 543db934..58bdc874 100644 +--- a/tests/test-backend-ops.cpp ++++ b/tests/test-backend-ops.cpp +@@ -9,16 +9,14 @@ + // Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case, + // then go to section 3 and add an instantiation of your struct. + +- + // ############################## + // ## Section 1: General Setup ## + // ############################## + +- +-#include + #include + #include + #include ++#include + + #include + #include +@@ -37,24 +35,26 @@ + #include + + static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { +- size_t nels = ggml_nelements(tensor); ++ size_t nels = ggml_nelements(tensor); + std::vector data(nels); + { + // parallel initialization +- static const size_t n_threads = std::thread::hardware_concurrency(); ++ static const size_t n_threads = std::thread::hardware_concurrency(); + // static RNG initialization (revisit if n_threads stops being constant) + static std::vector generators = []() { +- std::random_device rd; ++ std::random_device rd; + std::vector vec; + vec.reserve(n_threads); + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed +- for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } ++ for (size_t i = 0; i < n_threads; i++) { ++ vec.emplace_back(rd()); ++ } + return vec; + }(); + + auto init_thread = [&](size_t ith, size_t start, size_t end) { + std::uniform_real_distribution distribution(min, max); +- auto & gen = generators[ith]; ++ auto & gen = generators[ith]; + for (size_t i = start; i < end; i++) { + data[i] = distribution(gen); + } +@@ -63,8 +63,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m + std::vector> tasks; + tasks.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { +- size_t start = i*nels/n_threads; +- size_t end = (i+1)*nels/n_threads; ++ size_t start = i * nels / n_threads; ++ size_t end = (i + 1) * nels / n_threads; + tasks.push_back(std::async(std::launch::async, init_thread, i, start, end)); + } + for (auto & t : tasks) { +@@ -77,13 +77,13 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0); + +- // dummy importance matrix ++ // dummy importance matrix + std::vector imatrix(tensor->ne[0], 1.0f); +- const float * im = imatrix.data(); ++ const float * im = imatrix.data(); + if (!ggml_quantize_requires_imatrix(tensor->type)) { + // when the imatrix is optional, we want to test both quantization with and without imatrix + // use one of the random numbers to decide +- if (data[0] > 0.5f*(min + max)) { ++ if (data[0] > 0.5f * (min + max)) { + im = nullptr; + } + } +@@ -92,21 +92,21 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m + { + // parallel quantization by block + size_t blck_size = ggml_blck_size(tensor->type); +- size_t n_blocks 
= nels / blck_size; ++ size_t n_blocks = nels / blck_size; + + auto quantize_thread = [&](size_t start, size_t end) { +- ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), +- start * blck_size, end - start, blck_size, im); ++ ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), start * blck_size, end - start, blck_size, ++ im); + }; + +- const size_t min_blocks_per_thread = 1; +- const size_t n_threads = std::min(std::thread::hardware_concurrency()/2, +- std::max(1, n_blocks / min_blocks_per_thread)); ++ const size_t min_blocks_per_thread = 1; ++ const size_t n_threads = std::min(std::thread::hardware_concurrency() / 2, ++ std::max(1, n_blocks / min_blocks_per_thread)); + std::vector> tasks; + tasks.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { +- size_t start = i*n_blocks/n_threads; +- size_t end = (i+1)*n_blocks/n_threads; ++ size_t start = i * n_blocks / n_threads; ++ size_t end = (i + 1) * n_blocks / n_threads; + tasks.push_back(std::async(std::launch::async, quantize_thread, start, end)); + } + for (auto & t : tasks) { +@@ -119,9 +119,9 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else if (tensor->type == GGML_TYPE_I64) { + // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful. +- const size_t nbytes_half = ggml_nbytes(tensor)/2; +- ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half); +- ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half); ++ const size_t nbytes_half = ggml_nbytes(tensor) / 2; ++ ggml_backend_tensor_set(tensor, data.data(), 0 * nbytes_half, nbytes_half); ++ ggml_backend_tensor_set(tensor, data.data(), 1 * nbytes_half, nbytes_half); + } else { + GGML_ABORT("fatal error"); + } +@@ -134,31 +134,31 @@ static std::vector tensor_to_float(const ggml_tensor * t) { + std::vector buf(ggml_nbytes(t)); + ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); + +- const auto * tt = ggml_get_type_traits(t->type); +- size_t bs = ggml_blck_size(t->type); ++ const auto * tt = ggml_get_type_traits(t->type); ++ size_t bs = ggml_blck_size(t->type); + std::vector vq(ggml_blck_size(t->type)); +- bool quantized = ggml_is_quantized(t->type); ++ bool quantized = ggml_is_quantized(t->type); + + // access elements by index to avoid gaps in views + for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { + for (int64_t i2 = 0; i2 < t->ne[2]; i2++) { + for (int64_t i1 = 0; i1 < t->ne[1]; i1++) { + for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) { +- size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0]; ++ size_t i = i3 * t->nb[3] + i2 * t->nb[2] + i1 * t->nb[1] + i0 / bs * t->nb[0]; + if (t->type == GGML_TYPE_F16) { +- tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i])); ++ tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t *) &buf[i])); + } else if (t->type == GGML_TYPE_BF16) { +- tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i])); ++ tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t *) &buf[i])); + } else if (t->type == GGML_TYPE_F32) { + tv.push_back(*(float *) &buf[i]); + } else if (t->type == GGML_TYPE_I64) { +- tv.push_back((float)*(int64_t *) &buf[i]); ++ tv.push_back((float) *(int64_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I32) { +- tv.push_back((float)*(int32_t *) &buf[i]); ++ tv.push_back((float) *(int32_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I16) { +- tv.push_back((float)*(int16_t *) &buf[i]); ++ 
tv.push_back((float) *(int16_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I8) { +- tv.push_back((float)*(int8_t *) &buf[i]); ++ tv.push_back((float) *(int8_t *) &buf[i]); + } else if (quantized) { + tt->to_float(&buf[i], vq.data(), bs); + tv.insert(tv.end(), vq.begin(), vq.end()); +@@ -195,7 +195,8 @@ static double nmse(const float * a, const float * b, size_t n) { + // n: number of values to compare. + // expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where + // a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail. +-static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector & expected_vals) { ++static double mean_abs_asymm(const float * a, const float * b, const size_t n, ++ const std::vector & expected_vals) { + double sum = 0.0f; + + size_t nvalid = 0; +@@ -219,18 +220,16 @@ static double mean_abs_asymm(const float * a, const float * b, const size_t n, c + nvalid++; + } + +- return sum/nvalid; ++ return sum / nvalid; + } + + // utils for printing the variables of the test cases + +-template +-static std::string var_to_str(const T & x) { ++template static std::string var_to_str(const T & x) { + return std::to_string(x); + } + +-template +-static std::string var_to_str(const T (&x)[N]) { ++template static std::string var_to_str(const T (&x)[N]) { + std::string s = "["; + for (size_t i = 0; i < N; i++) { + if (i > 0) { +@@ -242,8 +241,7 @@ static std::string var_to_str(const T (&x)[N]) { + return s; + } + +-template +-static std::string var_to_str(const std::array & x) { ++template static std::string var_to_str(const std::array & x) { + std::string s = "["; + for (size_t i = 0; i < N; i++) { + if (i > 0) { +@@ -265,41 +263,50 @@ static std::string var_to_str(ggml_prec prec) { + + static std::string var_to_str(ggml_op_pool pool) { + switch (pool) { +- case GGML_OP_POOL_AVG: return "avg"; +- case GGML_OP_POOL_MAX: return "max"; +- default: return std::to_string(pool); ++ case GGML_OP_POOL_AVG: ++ return "avg"; ++ case GGML_OP_POOL_MAX: ++ return "max"; ++ default: ++ return std::to_string(pool); + } + } + + static std::string var_to_str(ggml_scale_mode mode) { + switch (mode) { +- case GGML_SCALE_MODE_NEAREST: return "nearest"; +- case GGML_SCALE_MODE_BILINEAR: return "bilinear"; +- default: return std::to_string(mode); ++ case GGML_SCALE_MODE_NEAREST: ++ return "nearest"; ++ case GGML_SCALE_MODE_BILINEAR: ++ return "bilinear"; ++ default: ++ return std::to_string(mode); + } + } + + #define VAR_TO_STR(x) (#x "=" + var_to_str(x)) + +-#define VARS_TO_STR1(a) VAR_TO_STR(a) +-#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b) +-#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c) +-#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d) +-#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e) +-#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f) +-#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g) +-#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h) +-#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i) +-#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) ++#define VARS_TO_STR1(a) VAR_TO_STR(a) 
++#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b) ++#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c) ++#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d) ++#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e) ++#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f) ++#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g) ++#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h) ++#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i) ++#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) + #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) +-#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l) ++#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) \ ++ VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l) + + #ifdef GGML_USE_SYCL + static bool inline _isinf(float f) { +- return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000; ++ return (*(uint32_t *) &f & 0x7fffffff) == 0x7f800000; + } + #else +-static bool inline _isinf(float f) { return std::isinf(f); } ++static bool inline _isinf(float f) { ++ return std::isinf(f); ++} + #endif + + // accept FLT_MAX as infinity +@@ -320,45 +327,29 @@ enum test_mode { + struct test_case { + virtual ~test_case() {} + +- virtual std::string op_desc(ggml_tensor * t) { +- return ggml_op_desc(t); +- } ++ virtual std::string op_desc(ggml_tensor * t) { return ggml_op_desc(t); } + +- virtual std::string vars() { +- return ""; +- } ++ virtual std::string vars() { return ""; } + + virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; + +- virtual double max_nmse_err() { +- return 1e-7; +- } ++ virtual double max_nmse_err() { return 1e-7; } + +- virtual double max_maa_err() { +- return 1e-4; +- } ++ virtual double max_maa_err() { return 1e-4; } + +- virtual float grad_eps() { +- return 1e-1f; +- } ++ virtual float grad_eps() { return 1e-1f; } + + // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher. + // If true, estimate gradient with 4 points, neglects 5th order derivative and higher. +- virtual bool grad_precise() { +- return false; +- } ++ virtual bool grad_precise() { return false; } + + // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests). +- virtual int64_t grad_nmax() { +- return 10000; +- } ++ virtual int64_t grad_nmax() { return 10000; } + + // No effect if empty. + // If not empty, skip all gradient checks where the numerical result does not match any of the values. + // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable. 
+- virtual std::vector grad_expect() { +- return {}; +- } ++ virtual std::vector grad_expect() { return {}; } + + virtual void initialize_tensors(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { +@@ -426,7 +417,8 @@ struct test_case { + return t; + } + +- ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { ++ ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, ++ int64_t ne3) { + ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3); + add_sentinel(ctx); + return t; +@@ -436,7 +428,7 @@ struct test_case { + mode = MODE_TEST; + + ggml_init_params params = { +- /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; +@@ -461,7 +453,7 @@ struct test_case { + + // check if the backends support the ops + bool supported = true; +- for (ggml_backend_t backend : {backend1, backend2}) { ++ for (ggml_backend_t backend : { backend1, backend2 }) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (!ggml_backend_supports_op(backend, t)) { + printf("not supported [%s] ", ggml_backend_name(backend)); +@@ -501,23 +493,18 @@ struct test_case { + + // compare + struct callback_userdata { +- bool ok; +- double max_err; ++ bool ok; ++ double max_err; + ggml_backend_t backend1; + ggml_backend_t backend2; + }; + +- callback_userdata ud { +- true, +- max_nmse_err(), +- backend1, +- backend2 +- }; ++ callback_userdata ud{ true, max_nmse_err(), backend1, backend2 }; + + auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool { +- callback_userdata * ud = (callback_userdata *) user_data; +- const char * bn1 = ggml_backend_name(ud->backend1); +- const char * bn2 = ggml_backend_name(ud->backend2); ++ callback_userdata * ud = (callback_userdata *) user_data; ++ const char * bn1 = ggml_backend_name(ud->backend1); ++ const char * bn2 = ggml_backend_name(ud->backend2); + + if (t1->op == GGML_OP_NONE) { + // sentinels must be unchanged +@@ -599,11 +586,11 @@ struct test_case { + static const size_t graph_nodes = 8192; + + ggml_init_params params = { +- /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false), ++ /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead_custom(graph_nodes, false), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; +- ggml_context_ptr ctx(ggml_init(params)); // smart ptr ++ ggml_context_ptr ctx(ggml_init(params)); // smart ptr + GGML_ASSERT(ctx); + + ggml_tensor * out = build_graph(ctx.get()); +@@ -624,14 +611,14 @@ struct test_case { + + // align while also leaving some margin for variations in parameters + int align = 8; +- int last = (len + align - 1) / align * align; ++ int last = (len + align - 1) / align * align; + if (last - len < 5) { + last += align; + } + printf("%*s", last - len, ""); + + // allocate +- ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr ++ ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr + + if (buf == NULL) { + printf("failed to allocate tensors\n"); +@@ -648,26 +635,27 @@ struct test_case { + // warmup run + ggml_status status = ggml_backend_graph_compute(backend, gf); + if 
(status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + + // determine number of runs +- int n_runs; ++ int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; + if (op_flops(out) > 0) { + // based on flops +- const uint64_t GFLOP = 1000 * 1000 * 1000; +- const uint64_t target_flops_cpu = 8ULL * GFLOP; ++ const uint64_t GFLOP = 1000 * 1000 * 1000; ++ const uint64_t target_flops_cpu = 8ULL * GFLOP; + const uint64_t target_flops_gpu = 100ULL * GFLOP; +- uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; ++ uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; + n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; + } else { + // based on memory size +- const size_t GB = 1ULL << 30; +- const size_t target_size_cpu = 8 * GB; ++ const size_t GB = 1ULL << 30; ++ const size_t target_size_cpu = 8 * GB; + const size_t target_size_gpu = 32 * GB; +- size_t target_size = is_cpu ? target_size_cpu : target_size_gpu; ++ size_t target_size = is_cpu ? target_size_cpu : target_size_gpu; + n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; + } + +@@ -677,8 +665,8 @@ struct test_case { + } + + // calculate memory +- size_t mem = n_runs * op_size(out); +- auto tensor_op_size = [](ggml_tensor * t) { ++ size_t mem = n_runs * op_size(out); ++ auto tensor_op_size = [](ggml_tensor * t) { + size_t size = ggml_nbytes(t); + // add source tensors + for (int i = 0; i < GGML_MAX_SRC; i++) { +@@ -697,13 +685,14 @@ struct test_case { + + // run + int64_t total_time_us = 0; +- int64_t total_mem = 0; +- int total_runs = 0; ++ int64_t total_mem = 0; ++ int total_runs = 0; + do { +- int64_t start_time = ggml_time_us(); +- ggml_status status = ggml_backend_graph_compute(backend, gf); ++ int64_t start_time = ggml_time_us(); ++ ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. 
status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + int64_t end_time = ggml_time_us(); +@@ -711,15 +700,13 @@ struct test_case { + total_time_us += end_time - start_time; + total_mem += mem; + total_runs += n_runs; +- } while (total_time_us < 1000*1000); // run for at least 1 second ++ } while (total_time_us < 1000 * 1000); // run for at least 1 second + +- printf(" %8d runs - %8.2f us/run - ", +- total_runs, +- (double)total_time_us / total_runs); ++ printf(" %8d runs - %8.2f us/run - ", total_runs, (double) total_time_us / total_runs); + + if (op_flops(out) > 0) { + double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6); +- auto format_flops = [](double flops) -> std::string { ++ auto format_flops = [](double flops) -> std::string { + char buf[256]; + if (flops >= 1e12) { + snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12); +@@ -732,14 +719,12 @@ struct test_case { + } + return buf; + }; +- printf("%s/run - \033[1;34m%sS\033[0m", +- format_flops(op_flops(out)).c_str(), +- format_flops(flops_per_sec).c_str()); ++ printf("%s/run - \033[1;34m%sS\033[0m", format_flops(op_flops(out)).c_str(), ++ format_flops(flops_per_sec).c_str()); + + } else { +- printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", +- op_size(out) / 1024, +- total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0); ++ printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", op_size(out) / 1024, ++ total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0); + } + printf("\n"); + +@@ -747,15 +732,16 @@ struct test_case { + } + + bool eval_grad(ggml_backend_t backend, const char * op_name) { +- mode = MODE_GRAD; ++ mode = MODE_GRAD; + const std::vector expect = grad_expect(); + + ggml_init_params params = { +- /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true), ++ /* .mem_size = */ ggml_tensor_overhead() * 128 + ++ 2 * ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; +- ggml_context_ptr ctx(ggml_init(params)); // smart ptr ++ ggml_context_ptr ctx(ggml_init(params)); // smart ptr + GGML_ASSERT(ctx); + + gf = ggml_new_graph_custom(ctx.get(), GGML_DEFAULT_GRAPH_SIZE, true); +@@ -777,7 +763,7 @@ struct test_case { + } + + // check if the backend supports the ops +- bool supported = true; ++ bool supported = true; + bool any_params = false; + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { + if (!ggml_backend_supports_op(backend, t)) { +@@ -814,7 +800,6 @@ struct test_case { + return true; + } + +- + if (!ggml_is_scalar(out)) { + out = ggml_sum(ctx.get(), out); + ggml_set_name(out, "sum_of_out"); +@@ -826,7 +811,8 @@ struct test_case { + ggml_build_backward_expand(ctx.get(), gb, nullptr); + if (expect.size() != 1 || expect[0] != 0.0f) { + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); +- for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { ++ for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; ++ t = ggml_get_next_tensor(ctx.get(), t)) { + GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE); + } + } +@@ -849,44 +835,47 @@ struct test_case { + } + + // allocate +- ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr ++ ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr + if 
(buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend)); + return false; + } + +- initialize_tensors(ctx.get()); // Randomizes all tensors (including gradients). +- ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise. ++ initialize_tensors(ctx.get()); // Randomizes all tensors (including gradients). ++ ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise. + + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + status = ggml_backend_graph_compute(backend, gb); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + + bool ok = true; +- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { ++ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; ++ t = ggml_get_next_tensor(ctx.get(), t)) { + if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) { + continue; + } + +- const char * bn = ggml_backend_name(backend); ++ const char * bn = ggml_backend_name(backend); + const int64_t ne = ggml_nelements(t); + +- std::vector ga; ++ std::vector ga; + struct ggml_tensor * grad = ggml_graph_get_grad(gb, t); + if (grad) { + ga = tensor_to_float(grad); + } else { +- ga.resize(ne); // default value is 0.0f ++ ga.resize(ne); // default value is 0.0f + } + +- for (int64_t i = 0; i < ne; ++i) { // gradient algebraic ++ for (int64_t i = 0; i < ne; ++i) { // gradient algebraic + // check for nans + if (!std::isfinite(ga[i])) { + printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]); +@@ -898,58 +887,63 @@ struct test_case { + break; + } + +- std::vector gn(ne); // gradient numeric ++ std::vector gn(ne); // gradient numeric + GGML_ASSERT(ga.size() == gn.size()); + +- std::vector x0 = tensor_to_float(t); // original t data ++ std::vector x0 = tensor_to_float(t); // original t data + GGML_ASSERT(ggml_is_scalar(out)); + GGML_ASSERT(out->type == GGML_TYPE_F32); + + const float eps = grad_eps(); + for (int64_t i = 0; i < ne; ++i) { +- const float xiu = x0[i] + 1.0f*eps; // x, index i, up +- const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half +- const float xidh = x0[i] - 0.5f*eps; // x, index i, down half +- const float xid = x0[i] - 1.0f*eps; // x, index i, down ++ const float xiu = x0[i] + 1.0f * eps; // x, index i, up ++ const float xiuh = x0[i] + 0.5f * eps; // x, index i, up half ++ const float xidh = x0[i] - 0.5f * eps; // x, index i, down half ++ const float xid = x0[i] - 1.0f * eps; // x, index i, down + +- float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh ++ float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh + +- ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float)); ++ ggml_backend_tensor_set(t, &xiu, i * sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. 
status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out)); + +- ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float)); ++ ggml_backend_tensor_set(t, &xid, i * sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out)); + + if (grad_precise()) { +- ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float)); ++ ggml_backend_tensor_set(t, &xiuh, i * sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out)); + +- ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float)); ++ ggml_backend_tensor_set(t, &xidh, i * sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { +- fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); ++ fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ++ ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out)); + +- gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps); ++ gn[i] = ++ (8.0 * (double) fuh + (double) fd - (8.0 * (double) fdh + (double) fu)) / (6.0 * (double) eps); + } else { +- gn[i] = (fu - fd) / (2.0f*eps); ++ gn[i] = (fu - fd) / (2.0f * eps); + } + + ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t)); +@@ -980,82 +974,77 @@ struct test_case { + } + }; + +- + // ################################### + // ## Section 2: GGML Op Defintions ## + // ################################### + +- + // The following is an example showing the bare minimum for creating a test for a GGML op. + + // GGML_OP_EXAMPLE + struct test_example : public test_case { + // Always define these 2 or variants thereof: +- const ggml_type type; // The type of the input tensors. +- const std::array ne; // The shape of the input tensors. ++ const ggml_type type; // The type of the input tensors. ++ const std::array ne; // The shape of the input tensors. ++ + // For some ops it's necessary to define multiple types or shapes for the inputs. + // Or they may need additional parameters. + + // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros. + // In most cases these are just the properties of the struct that you defined above. + // This is needed for info prints. +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + + // Define a constructor for the struct. 
+ // In most cases it will be sufficient to have the same arguments as the struct has properties + // and just use initializer lists. +- test_example(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_example(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + // Define how a simple GGML compute graph can be constructed for the new GGML op. + ggml_tensor * build_graph(ggml_context * ctx) override { + // Step 1: create input tensors that don't depend on any other tensors: + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +- ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging. ++ ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging. + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + // Step 2: use the op that you want to test in the GGML compute graph. +- ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition. ++ ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition. + ggml_set_name(out, "out"); + + // Step 3: return the output tensor. + return out; + } ++ + // In order to also check the gradients for your op, add calls like ggml_set_param(a) + // immediately after you create the tensors. + // This is optional and only makes sense if a backward pass has actually been implemented for the new op. + }; + +- + // GGML_OP_UNARY + struct test_unary : public test_case { +- const ggml_unary_op op; +- const ggml_type type; ++ const ggml_unary_op op; ++ const ggml_type type; + const std::array ne_a; +- int v; // view (1 : non-contiguous a) ++ int v; // view (1 : non-contiguous a) + +- std::string vars() override { +- return VARS_TO_STR3(type, ne_a, v); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne_a, v); } + +- test_unary(ggml_unary_op op, +- ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {128, 2, 2, 2}, +- int v = 0) +- : op(op), type(type), ne_a(ne_a), v(v) {} ++ test_unary(ggml_unary_op op, ggml_type type = GGML_TYPE_F32, std::array ne_a = { 128, 2, 2, 2 }, ++ int v = 0) : ++ op(op), ++ type(type), ++ ne_a(ne_a), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG || +- op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU; ++ op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU; + + ggml_tensor * a; + if (v & 1) { +- auto ne = ne_a; ne[0] *= 3; ++ auto ne = ne_a; ++ ne[0] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (grad_supported) { + ggml_set_param(a); +@@ -1085,40 +1074,40 @@ struct test_unary : public test_case { + } + } + +- float grad_eps() override { +- return 15.0f; +- } ++ float grad_eps() override { return 15.0f; } + + std::vector grad_expect() override { + if (op == GGML_UNARY_OP_ABS) { +- return {-1.0f, 1.0f}; ++ return { -1.0f, 1.0f }; + } + if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) { +- return {0.0f}; ++ return { 0.0f }; + } + if (op == GGML_UNARY_OP_RELU) { +- return {0.0f, 1.0f}; ++ return { 0.0f, 1.0f }; + } + return {}; + } +- + }; + + // GGML_OP_GET_ROWS + struct test_get_rows : public test_case { + const ggml_type type; +- const int n; // cols +- const int m; // rows +- const int r; // rows to get +- const int b; // batch size +- const bool v; // view 
(non-contiguous src1) +- +- std::string vars() override { +- return VARS_TO_STR6(type, n, m, r, b, v); +- } +- +- test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) +- : type(type), n(n), m(m), r(r), b(b), v(v) {} ++ const int n; // cols ++ const int m; // rows ++ const int r; // rows to get ++ const int b; // batch size ++ const bool v; // view (non-contiguous src1) ++ ++ std::string vars() override { return VARS_TO_STR6(type, n, m, r, b, v); } ++ ++ test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) : ++ type(type), ++ n(n), ++ m(m), ++ r(r), ++ b(b), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b); +@@ -1127,7 +1116,7 @@ struct test_get_rows : public test_case { + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + ggml_set_name(rows, "rows"); + if (v) { +- rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); ++ rows = ggml_view_2d(ctx, rows, r / 2, b, rows->nb[1], 0); + ggml_set_name(rows, "view_of_rows"); + } + +@@ -1146,10 +1135,12 @@ struct test_get_rows : public test_case { + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { +- if (ggml_is_view_op(t->op)) { continue; } ++ if (ggml_is_view_op(t->op)) { ++ continue; ++ } + // rows +- std::vector data(r*b); +- for (int i = 0; i < r*b; i++) { ++ std::vector data(r * b); ++ for (int i = 0; i < r * b; i++) { + data[i] = rand() % m; + } + ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int)); +@@ -1163,18 +1154,21 @@ struct test_get_rows : public test_case { + // GGML_OP_GET_ROWS_BACK + struct test_get_rows_back : public test_case { + const ggml_type type; +- const int n; // cols +- const int m; // rows +- const int r; // rows to get +- const int b; // batch size +- const bool v; // view (non-contiguous src1) +- +- std::string vars() override { +- return VARS_TO_STR6(type, n, m, r, b, v); +- } +- +- test_get_rows_back(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) +- : type(type), n(n), m(m), r(r), b(b), v(v) {} ++ const int n; // cols ++ const int m; // rows ++ const int r; // rows to get ++ const int b; // batch size ++ const bool v; // view (non-contiguous src1) ++ ++ std::string vars() override { return VARS_TO_STR6(type, n, m, r, b, v); } ++ ++ test_get_rows_back(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) : ++ type(type), ++ n(n), ++ m(m), ++ r(r), ++ b(b), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * in_forward = ggml_new_tensor_3d(ctx, type, n, m, b); +@@ -1183,7 +1177,7 @@ struct test_get_rows_back : public test_case { + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + ggml_set_name(rows, "rows"); + if (v) { +- rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); ++ rows = ggml_view_2d(ctx, rows, r / 2, b, rows->nb[1], 0); + ggml_set_name(rows, "view_of_rows"); + } + +@@ -1199,10 +1193,12 @@ struct test_get_rows_back : public test_case { + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { +- if (ggml_is_view_op(t->op)) { continue; } ++ if (ggml_is_view_op(t->op)) { ++ continue; 
++ } + // rows +- std::vector data(r*b); +- for (int i = 0; i < r*b; i++) { ++ std::vector data(r * b); ++ for (int i = 0; i < r * b; i++) { + data[i] = rand() % m; + } + ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int)); +@@ -1215,16 +1211,12 @@ struct test_get_rows_back : public test_case { + + // GGML_OP_ARGMAX + struct test_argmax : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_argmax(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 100, 1, 1}) +- : type(type), ne(ne) {} ++ test_argmax(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 100, 1, 1 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1237,7 +1229,7 @@ struct test_argmax : public test_case { + } + + void initialize_tensors(ggml_context * ctx) override { +- std::random_device rd; ++ std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { +@@ -1256,23 +1248,19 @@ struct test_argmax : public test_case { + } + } + +- double max_nmse_err() override { +- return 0.0; +- } ++ double max_nmse_err() override { return 0.0; } + }; + + // GGML_OP_COUNT_EQUAL + struct test_count_equal : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_count_equal(ggml_type type = GGML_TYPE_F32, +- std::array ne = {4, 500, 1, 1}) +- : type(type), ne(ne) {} ++ test_count_equal(ggml_type type = GGML_TYPE_F32, std::array ne = { 4, 500, 1, 1 }) : ++ type(type), ++ ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1293,32 +1281,28 @@ struct test_count_equal : public test_case { + return out; + } + +- double max_nmse_err() override { +- return 0.0; +- } ++ double max_nmse_err() override { return 0.0; } + }; + + // GGML_OP_REPEAT + struct test_repeat : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const std::array nr; ++ const std::array nr; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, nr); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, nr); } + +- size_t op_size(ggml_tensor * t) override { +- return ggml_nbytes(t) * 2; +- } ++ size_t op_size(ggml_tensor * t) override { return ggml_nbytes(t) * 2; } + +- test_repeat(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}, +- std::array nr = {2, 2, 2, 2}) +- : type(type), ne(ne), nr(nr) {} ++ test_repeat(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }, ++ std::array nr = { 2, 2, 2, 2 }) : ++ type(type), ++ ne(ne), ++ nr(nr) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { +- ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); ++ ggml_tensor * target = ++ ggml_new_tensor_4d(ctx, type, ne[0] * nr[0], ne[1] * nr[1], ne[2] * nr[2], ne[3] * nr[3]); + ggml_set_name(target, "target"); + + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1334,27 +1318,24 @@ struct test_repeat : public 
test_case { + + // GGML_OP_REPEAT_BACK + struct test_repeat_back : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const std::array nr; +- const bool v; // whether src is a noncontiguous view ++ const std::array nr; ++ const bool v; // whether src is a noncontiguous view + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, nr, v); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, nr, v); } + +- size_t op_size(ggml_tensor * t) override { +- return ggml_nbytes(t) * 2; +- } ++ size_t op_size(ggml_tensor * t) override { return ggml_nbytes(t) * 2; } + +- test_repeat_back(ggml_type type = GGML_TYPE_F32, +- std::array ne = {8, 6, 4, 2}, +- std::array nr = {2, 2, 2, 2}, +- bool v = false) +- : type(type), ne(ne), nr(nr), v(v) {} ++ test_repeat_back(ggml_type type = GGML_TYPE_F32, std::array ne = { 8, 6, 4, 2 }, ++ std::array nr = { 2, 2, 2, 2 }, bool v = false) : ++ type(type), ++ ne(ne), ++ nr(nr), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { +- ggml_tensor * src = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); ++ ggml_tensor * src = ggml_new_tensor_4d(ctx, type, ne[0] * nr[0], ne[1] * nr[1], ne[2] * nr[2], ne[3] * nr[3]); + ggml_set_name(src, "src"); + + if (v) { +@@ -1387,22 +1368,25 @@ struct test_repeat_back : public test_case { + + // GGML_OP_DUP + struct test_dup : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + const std::array permute; +- bool _use_permute; ++ bool _use_permute; + + std::string vars() override { + std::string v = VARS_TO_STR2(type, ne); +- if (_use_permute) v += "," + VAR_TO_STR(permute); ++ if (_use_permute) { ++ v += "," + VAR_TO_STR(permute); ++ } + return v; + } + +- test_dup(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 10, 20, 1}, +- std::array permute = {0, 0, 0, 0}) +- : type(type), ne(ne), permute(permute), +- _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} ++ test_dup(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 10, 20, 1 }, ++ std::array permute = { 0, 0, 0, 0 }) : ++ type(type), ++ ne(ne), ++ permute(permute), ++ _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1423,22 +1407,21 @@ struct test_dup : public test_case { + + // GGML_OP_SET + struct test_set : public test_case { +- const ggml_type type_src; +- const ggml_type type_dst; ++ const ggml_type type_src; ++ const ggml_type type_dst; + const std::array ne; +- const int dim; ++ const int dim; + +- std::string vars() override { +- return VARS_TO_STR4(type_src, type_dst, ne, dim); +- } ++ std::string vars() override { return VARS_TO_STR4(type_src, type_dst, ne, dim); } + +- size_t op_size(ggml_tensor * t) override { +- return ggml_nbytes(t) + ggml_nbytes(t->src[0]); +- } ++ size_t op_size(ggml_tensor * t) override { return ggml_nbytes(t) + ggml_nbytes(t->src[0]); } + + test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, +- std::array ne = {6, 5, 4, 3}, int dim = 1) +- : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {} ++ std::array ne = { 6, 5, 4, 3 }, int dim = 1) : ++ type_src(type_src), ++ type_dst(type_dst), ++ ne(ne), ++ dim(dim) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); +@@ -1449,17 +1432,17 @@ 
struct test_set : public test_case { + for (int i = 0; i < dim; ++i) { + ne_dst[i] *= 2; + } +- ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data()); ++ ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data()); + ggml_set_param(dst); + ggml_set_name(dst, "dst"); + + size_t offset = 0; + for (int i = 0; i < dim; ++i) { +- offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i]; ++ offset += ((ne_dst[i] - ne[i]) / 2) * dst->nb[i]; + } + ggml_tensor * out = ggml_set(ctx, dst, src, +- // The backward pass requires setting a contiguous region: +- src->nb[1], src->nb[2], src->nb[3], offset); ++ // The backward pass requires setting a contiguous region: ++ src->nb[1], src->nb[2], src->nb[3], offset); + ggml_set_name(out, "out"); + + return out; +@@ -1468,33 +1451,30 @@ struct test_set : public test_case { + + // GGML_OP_CPY + struct test_cpy : public test_case { +- const ggml_type type_src; +- const ggml_type type_dst; ++ const ggml_type type_src; ++ const ggml_type type_dst; + const std::array ne; + const std::array permute_src; + const std::array permute_dst; +- bool _src_use_permute; +- bool _dst_use_permute; ++ bool _src_use_permute; ++ bool _dst_use_permute; + +- std::string vars() override { +- return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst); +- } ++ std::string vars() override { return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst); } + +- double max_nmse_err() override { +- return 1e-6; +- } ++ double max_nmse_err() override { return 1e-6; } + +- size_t op_size(ggml_tensor * t) override { +- return ggml_nbytes(t) + ggml_nbytes(t->src[0]); +- } ++ size_t op_size(ggml_tensor * t) override { return ggml_nbytes(t) + ggml_nbytes(t->src[0]); } + + test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, +- std::array ne = {10, 10, 10, 1}, +- std::array permute_src = {0, 0, 0, 0}, +- std::array permute_dst = {0, 0, 0, 0}) +- : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), +- _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), +- _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {} ++ std::array ne = { 10, 10, 10, 1 }, std::array permute_src = { 0, 0, 0, 0 }, ++ std::array permute_dst = { 0, 0, 0, 0 }) : ++ type_src(type_src), ++ type_dst(type_dst), ++ ne(ne), ++ permute_src(permute_src), ++ permute_dst(permute_dst), ++ _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), ++ _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); +@@ -1523,16 +1503,12 @@ struct test_cpy : public test_case { + + // GGML_OP_CONT + struct test_cont : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_cont(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 10, 10, 1}) +- : type(type), ne(ne) {} ++ test_cont(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 10, 10, 1 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1555,26 +1531,24 @@ struct test_cont : public test_case { + // GGML_OP_DIV + struct test_bin_bcast : 
public test_case { + using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *); +- op_t op; +- const ggml_type type; ++ op_t op; ++ const ggml_type type; + const std::array ne; +- const std::array nr; ++ const std::array nr; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, nr); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, nr); } + +- size_t op_size(ggml_tensor * t) override { +- return ggml_nbytes(t) * 3; +- } ++ size_t op_size(ggml_tensor * t) override { return ggml_nbytes(t) * 3; } + +- test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 10, 1, 1}, +- std::array nr = {1, 2, 1, 1}) +- : op(op), type(type), ne(ne), nr(nr) {} ++ test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 10, 1, 1 }, ++ std::array nr = { 1, 2, 1, 1 }) : ++ op(op), ++ type(type), ++ ne(ne), ++ nr(nr) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { +- ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); ++ ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0] * nr[0], ne[1] * nr[1], ne[2] * nr[2], ne[3] * nr[3]); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1604,31 +1578,21 @@ struct test_bin_bcast : public test_case { + } + } + +- float grad_eps() override { +- return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1); +- } ++ float grad_eps() override { return 0.1f * (op == ggml_mul ? ne[0] * ne[1] * ne[2] * ne[3] : 1); } + +- bool grad_precise() override { +- return op == ggml_div; +- } ++ bool grad_precise() override { return op == ggml_div; } + +- double max_maa_err() override { +- return op == ggml_add ? 1e-4 : 1e-3; +- } ++ double max_maa_err() override { return op == ggml_add ? 
1e-4 : 1e-3; } + }; + + // GGML_OP_ADD1 + struct test_add1 : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_add1(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_add1(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1645,25 +1609,21 @@ struct test_add1 : public test_case { + return out; + } + +- float grad_eps() override { +- return 0.1f * ne[0]*ne[1]*ne[2]*ne[3]; +- } ++ float grad_eps() override { return 0.1f * ne[0] * ne[1] * ne[2] * ne[3]; } + }; + + // GGML_OP_SCALE + struct test_scale : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- float scale; ++ float scale; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, scale); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, scale); } + +- test_scale(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 10, 10, 10}, +- float scale = 2.0f) +- : type(type), ne(ne), scale(scale) {} ++ test_scale(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 10, 10, 10 }, float scale = 2.0f) : ++ type(type), ++ ne(ne), ++ scale(scale) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1679,18 +1639,16 @@ struct test_scale : public test_case { + + // GGML_OP_SILU_BACK + struct test_silu_back : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- float eps; ++ float eps; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, eps); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, eps); } + +- test_silu_back(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 5, 4, 3}, +- float eps = 1e-6f) +- : type(type), ne(ne), eps(eps) {} ++ test_silu_back(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 5, 4, 3 }, float eps = 1e-6f) : ++ type(type), ++ ne(ne), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1705,34 +1663,32 @@ struct test_silu_back : public test_case { + return out; + } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_NORM + struct test_norm : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const bool v; // whether a is a non-contiguous view +- const float eps; ++ const bool v; // whether a is a non-contiguous view ++ const float eps; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, v, eps); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, v, eps); } + +- test_norm(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 5, 4, 3}, +- bool v = false, +- float eps = 1e-6f) +- : type(type), ne(ne), v(v), eps(eps) {} ++ test_norm(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 5, 4, 3 }, bool v = false, ++ float eps = 1e-6f) : ++ type(type), ++ ne(ne), ++ v(v), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + if (v) { +- a = 
ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); ++ a = ggml_view_4d(ctx, a, a->ne[0] / 2, a->ne[1] / 2, a->ne[2] / 2, a->ne[3] / 2, a->nb[1], a->nb[2], ++ a->nb[3], 0); + ggml_set_name(a, "view of a"); + } + +@@ -1745,20 +1701,19 @@ struct test_norm : public test_case { + + // GGML_OP_RMS_NORM + struct test_rms_norm : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const bool v; // whether a is a non-contiguous view +- const float eps; ++ const bool v; // whether a is a non-contiguous view ++ const float eps; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, v, eps); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, v, eps); } + +- test_rms_norm(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 5, 4, 3}, +- bool v = false, +- float eps = 1e-6f) +- : type(type), ne(ne), v(v), eps(eps) {} ++ test_rms_norm(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 5, 4, 3 }, bool v = false, ++ float eps = 1e-6f) : ++ type(type), ++ ne(ne), ++ v(v), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1766,7 +1721,8 @@ struct test_rms_norm : public test_case { + ggml_set_name(a, "a"); + + if (v) { +- a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); ++ a = ggml_view_4d(ctx, a, a->ne[0] / 2, a->ne[1] / 2, a->ne[2] / 2, a->ne[3] / 2, a->nb[1], a->nb[2], ++ a->nb[3], 0); + ggml_set_name(a, "view of a"); + } + +@@ -1782,29 +1738,23 @@ struct test_rms_norm : public test_case { + } + } + +- float grad_eps() override { +- return 1.0f; +- } ++ float grad_eps() override { return 1.0f; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_RMS_NORM_BACK + struct test_rms_norm_back : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const float eps; ++ const float eps; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, eps); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, eps); } + +- test_rms_norm_back(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 5, 4, 3}, +- float eps = 1e-6f) +- : type(type), ne(ne), eps(eps) {} ++ test_rms_norm_back(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 5, 4, 3 }, float eps = 1e-6f) : ++ type(type), ++ ne(ne), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -1828,18 +1778,17 @@ struct test_rms_norm_back : public test_case { + + // GGML_OP_SSM_CONV + struct test_ssm_conv : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; + const std::array ne_b; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne_a, ne_b); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne_a, ne_b); } + +- test_ssm_conv(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {10, 10, 10, 1}, +- std::array ne_b = {3, 3, 1, 1}) +- : type(type), ne_a(ne_a), ne_b(ne_b) {} ++ test_ssm_conv(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 10, 10, 10, 1 }, ++ std::array ne_b = { 3, 3, 1, 1 }) : ++ type(type), ++ ne_a(ne_a), ++ ne_b(ne_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); +@@ -1858,21 +1807,27 @@ struct 
test_ssm_scan : public test_case { + const int64_t n_seq_tokens; + const int64_t n_seqs; + +- std::string vars() override { +- return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs); +- } ++ std::string vars() override { return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs); } + +- test_ssm_scan(ggml_type type = GGML_TYPE_F32, +- int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) +- : type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} ++ test_ssm_scan(ggml_type type = GGML_TYPE_F32, int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, ++ int64_t n_seqs = 32) : ++ type(type), ++ d_state(d_state), ++ d_inner(d_inner), ++ n_seq_tokens(n_seq_tokens), ++ n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { +- ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, d_inner, n_seqs, 1 }.data()); +- ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector{ d_inner, n_seq_tokens, n_seqs, 1 }.data()); +- ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector{ d_inner, n_seq_tokens, n_seqs, 1 }.data()); +- ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, d_inner, 1 , 1 }.data()); +- ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, n_seq_tokens, n_seqs, 1 }.data()); +- ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, n_seq_tokens, n_seqs, 1 }.data()); ++ ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, d_inner, n_seqs, 1 }.data()); ++ ggml_tensor * x = ++ ggml_new_tensor(ctx, type, 4, std::vector{ d_inner, n_seq_tokens, n_seqs, 1 }.data()); ++ ggml_tensor * dt = ++ ggml_new_tensor(ctx, type, 4, std::vector{ d_inner, n_seq_tokens, n_seqs, 1 }.data()); ++ ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector{ d_state, d_inner, 1, 1 }.data()); ++ ggml_tensor * B = ++ ggml_new_tensor(ctx, type, 4, std::vector{ d_state, n_seq_tokens, n_seqs, 1 }.data()); ++ ggml_tensor * C = ++ ggml_new_tensor(ctx, type, 4, std::vector{ d_state, n_seq_tokens, n_seqs, 1 }.data()); + ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C); + return out; + } +@@ -1887,22 +1842,26 @@ struct test_rwkv_wkv6 : public test_case { + const int64_t n_seq_tokens; + const int64_t n_seqs; + +- std::string vars() override { +- return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); +- } ++ std::string vars() override { return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); } + +- test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32, +- int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) +- : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} ++ test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32, int64_t head_count = 32, int64_t head_size = 64, ++ int64_t n_seq_tokens = 32, int64_t n_seqs = 32) : ++ type(type), ++ head_count(head_count), ++ head_size(head_size), ++ n_seq_tokens(n_seq_tokens), ++ n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; +- ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * tf = 
ggml_new_tensor(ctx, type, 2, std::vector{ head_size, head_count }.data()); +- ggml_tensor * td = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); ++ ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector{ head_size, head_count }.data()); ++ ggml_tensor * td = ++ ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * s = ++ ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); + ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s); + return out; + } +@@ -1917,21 +1876,24 @@ struct test_gla : public test_case { + const int64_t n_seq_tokens; + const int64_t n_seqs; + +- std::string vars() override { +- return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); +- } ++ std::string vars() override { return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); } + +- test_gla(ggml_type type = GGML_TYPE_F32, +- int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) +- : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} ++ test_gla(ggml_type type = GGML_TYPE_F32, int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, ++ int64_t n_seqs = 32) : ++ type(type), ++ head_count(head_count), ++ head_size(head_size), ++ n_seq_tokens(n_seq_tokens), ++ n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; +- ggml_tensor * q = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * g = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); ++ ggml_tensor * q = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * g = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * s = ++ ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); + ggml_tensor * out = ggml_gated_linear_attn(ctx, k, v, q, g, s, pow(head_size, -0.5)); + return out; + } +@@ -1946,26 +1908,29 @@ struct test_rwkv_wkv7 : public test_case { + const int64_t n_seq_tokens; + const int64_t n_seqs; + +- std::string vars() override { +- return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); +- } ++ std::string vars() override { return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); } + +- 
test_rwkv_wkv7(ggml_type type = GGML_TYPE_F32, +- int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) +- : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} ++ test_rwkv_wkv7(ggml_type type = GGML_TYPE_F32, int64_t head_count = 32, int64_t head_size = 64, ++ int64_t n_seq_tokens = 32, int64_t n_seqs = 32) : ++ type(type), ++ head_count(head_count), ++ head_size(head_size), ++ n_seq_tokens(n_seq_tokens), ++ n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; +- ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * w = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * a = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); +- ggml_tensor * b = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * w = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * a = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ++ ggml_tensor * b = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + // Outputs may become NaN with long seqlen without these normalization +- a = ggml_l2_norm(ctx, a, 1e-7F); +- b = ggml_l2_norm(ctx, b, 1e-7F); +- ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); ++ a = ggml_l2_norm(ctx, a, 1e-7F); ++ b = ggml_l2_norm(ctx, b, 1e-7F); ++ ggml_tensor * s = ++ ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); + ggml_tensor * out = ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s); + return out; + } +@@ -1973,40 +1938,39 @@ struct test_rwkv_wkv7 : public test_case { + + // GGML_OP_MUL_MAT + struct test_mul_mat : public test_case { +- const ggml_type type_a; +- const ggml_type type_b; +- const int64_t m; +- const int64_t n; +- const int64_t k; +- const std::array bs; // dims 3 and 4 +- const std::array nr; // repeat in dims 3 and 4 +- const std::array per; // permutation of dimensions +- const bool v; // whether a and b are non-contiguous views ++ const ggml_type type_a; ++ const ggml_type type_b; ++ const int64_t m; ++ const int64_t n; ++ const int64_t k; ++ const std::array bs; // dims 3 and 4 ++ const std::array nr; // repeat in dims 3 and 4 ++ const std::array per; // permutation of dimensions ++ const bool v; // whether a and b are non-contiguous views + +- std::string vars() override { +- return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v); +- } ++ std::string vars() override { return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v); } + +- double max_nmse_err() override { +- return 5e-4; +- } ++ double max_nmse_err() override { return 5e-4; } + +- 
int64_t grad_nmax() override { +- return 20000; +- } ++ int64_t grad_nmax() override { return 20000; } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1]; + } + +- test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, +- int64_t m = 32, int64_t n = 32, int64_t k = 32, +- std::array bs = {10, 10}, +- std::array nr = {2, 2}, +- std::array per = {0, 1, 2, 3}, +- bool v = false) +- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {} ++ test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int64_t m = 32, int64_t n = 32, ++ int64_t k = 32, std::array bs = { 10, 10 }, std::array nr = { 2, 2 }, ++ std::array per = { 0, 1, 2, 3 }, bool v = false) : ++ type_a(type_a), ++ type_b(type_b), ++ m(m), ++ n(n), ++ k(k), ++ bs(bs), ++ nr(nr), ++ per(per), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + // C^T = A * B^T: (k, m) * (k, n) => (m, n) +@@ -2016,13 +1980,13 @@ struct test_mul_mat : public test_case { + const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3); + if (npermuted > 0) { + GGML_ASSERT(npermuted == 2); +- GGML_ASSERT(!v); // not handled ++ GGML_ASSERT(!v); // not handled + GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0); + GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0); + + // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k. +- const int64_t ne_a[4] = {k, m, bs[0], bs[1]}; +- const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]}; ++ const int64_t ne_a[4] = { k, m, bs[0], bs[1] }; ++ const int64_t ne_b[4] = { k, n, bs[0] * nr[0], bs[1] * nr[1] }; + + a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]); + b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]); +@@ -2041,8 +2005,8 @@ struct test_mul_mat : public test_case { + ggml_set_name(b, "b_permuted"); + } else { + if (v) { +- a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]); +- b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]); ++ a = ggml_new_tensor_4d(ctx, type_a, k * 2, m, bs[0], bs[1]); ++ b = ggml_new_tensor_4d(ctx, type_b, k * 2, n, bs[0] * nr[0], bs[1] * nr[1]); + + if (!ggml_is_quantized(type_a)) { + if (bs[1] == 1 && nr[1] == 1) { +@@ -2051,11 +2015,11 @@ struct test_mul_mat : public test_case { + ggml_set_param(b); + } + +- a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); +- b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0); ++ a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); ++ b = ggml_view_4d(ctx, b, k, n, bs[0] * nr[0], bs[1] * nr[1], b->nb[1], b->nb[2], b->nb[3], 0); + } else { +- a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); +- b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); ++ a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); ++ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0] * nr[0], bs[1] * nr[1]); + + if (!ggml_is_quantized(type_a)) { + if (bs[1] == 1 && nr[1] == 1) { +@@ -2079,33 +2043,34 @@ struct test_mul_mat : public test_case { + struct test_mul_mat_id : public test_case { + const ggml_type type_a; + const ggml_type type_b; +- const int n_mats; +- const int n_used; +- const bool b; // broadcast b matrix +- const int64_t m; +- const int64_t n; +- const int64_t k; ++ const int n_mats; ++ 
const int n_used; ++ const bool b; // broadcast b matrix ++ const int64_t m; ++ const int64_t n; ++ const int64_t k; + +- std::string vars() override { +- return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k); +- } ++ std::string vars() override { return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k); } + +- double max_nmse_err() override { +- return 5e-4; +- } ++ double max_nmse_err() override { return 5e-4; } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + return 2 * m * k * n * n_used; + } + +- test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, +- int n_mats = 8, int n_used = 2, bool b = false, +- int64_t m = 32, int64_t n = 32, int64_t k = 32) +- : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b), +- m(m), n(n), k(k) { +- GGML_ASSERT(n_used <= n_mats); +- } ++ test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int n_mats = 8, int n_used = 2, ++ bool b = false, int64_t m = 32, int64_t n = 32, int64_t k = 32) : ++ type_a(type_a), ++ type_b(type_b), ++ n_mats(n_mats), ++ n_used(n_used), ++ b(b), ++ m(m), ++ n(n), ++ k(k) { ++ GGML_ASSERT(n_used <= n_mats); ++ } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // C^T = A * B^T: (k, m) * (k, n) => (m, n) +@@ -2129,11 +2094,13 @@ struct test_mul_mat_id : public test_case { + } + + void initialize_tensors(ggml_context * ctx) override { +- std::random_device rd; ++ std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { +- if (ggml_is_view_op(t->op)) { continue; } ++ if (ggml_is_view_op(t->op)) { ++ continue; ++ } + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); +@@ -2152,29 +2119,30 @@ struct test_mul_mat_id : public test_case { + + // GGML_OP_OUT_PROD + struct test_out_prod : public test_case { +- const ggml_type type_a; +- const ggml_type type_b; +- const int64_t m; +- const int64_t n; +- const int64_t k; +- const std::array bs; // dims 3 and 4 +- const std::array nr; // repeat in dims 3 and 4 +- const bool trans_b; ++ const ggml_type type_a; ++ const ggml_type type_b; ++ const int64_t m; ++ const int64_t n; ++ const int64_t k; ++ const std::array bs; // dims 3 and 4 ++ const std::array nr; // repeat in dims 3 and 4 ++ const bool trans_b; + +- std::string vars() override { +- return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, trans_b); +- } ++ std::string vars() override { return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, trans_b); } + +- double max_nmse_err() override { +- return 5e-4; +- } ++ double max_nmse_err() override { return 5e-4; } + +- test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, +- int64_t m = 32, int64_t n = 32, int64_t k = 32, +- std::array bs = {10, 10}, +- std::array nr = {2, 2}, +- bool trans_b = false) +- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), trans_b(trans_b) {} ++ test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int64_t m = 32, int64_t n = 32, ++ int64_t k = 32, std::array bs = { 10, 10 }, std::array nr = { 2, 2 }, ++ bool trans_b = false) : ++ type_a(type_a), ++ type_b(type_b), ++ m(m), ++ n(n), ++ k(k), ++ bs(bs), ++ nr(nr), ++ trans_b(trans_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]); +@@ -2182,10 +2150,10 
@@ struct test_out_prod : public test_case { + + ggml_tensor * b; + if (trans_b) { +- b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); ++ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0] * nr[0], bs[1] * nr[1]); + b = ggml_transpose(ctx, b); + } else { +- b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0]*nr[0], bs[1]*nr[1]); ++ b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0] * nr[0], bs[1] * nr[1]); + } + ggml_set_name(b, "b"); + +@@ -2198,16 +2166,12 @@ struct test_out_prod : public test_case { + + // GGML_OP_SQR + struct test_sqr : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_sqr(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_sqr(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2221,22 +2185,18 @@ struct test_sqr : public test_case { + } + + float grad_eps() override { +- return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum. ++ return 0.1f * 0.25f * ne[0] * ne[1] * ne[2] * ne[3]; // 10% of expected value of sum. + } + }; + + // GGML_OP_SQRT + struct test_sqrt : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_sqrt(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 3, 3, 2}) +- : type(type), ne(ne) {} ++ test_sqrt(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 3, 3, 2 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2256,27 +2216,19 @@ struct test_sqrt : public test_case { + } + } + +- float grad_eps() override { +- return 20.0f; +- } ++ float grad_eps() override { return 20.0f; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_LOG + struct test_log : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_log(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_log(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2296,23 +2248,17 @@ struct test_log : public test_case { + } + } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_SIN + struct test_sin : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_sin(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 2, 2, 2}) +- : type(type), ne(ne) {} ++ test_sin(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 2, 2, 2 }) : type(type), ne(ne) {} + + 
ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2327,35 +2273,25 @@ struct test_sin : public test_case { + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { +- init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. ++ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. + } + } + +- double max_maa_err() override { +- return 1e-3; +- } ++ double max_maa_err() override { return 1e-3; } + +- float grad_eps() override { +- return 0.2f; +- } ++ float grad_eps() override { return 0.2f; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_COS + struct test_cos : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_cos(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 2, 2, 2}) +- : type(type), ne(ne) {} ++ test_cos(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 2, 2, 2 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2370,38 +2306,32 @@ struct test_cos : public test_case { + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { +- init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. ++ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. 
+ } + } + +- double max_maa_err() override { +- return 1e-3; +- } ++ double max_maa_err() override { return 1e-3; } + +- float grad_eps() override { +- return 0.2f; +- } ++ float grad_eps() override { return 0.2f; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_CLAMP + struct test_clamp : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- float min; +- float max; ++ float min; ++ float max; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, min, max); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, min, max); } + +- test_clamp(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}, +- float min = -0.5f, float max = 0.5f) +- : type(type), ne(ne), min(min), max(max) {} ++ test_clamp(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }, float min = -0.5f, ++ float max = 0.5f) : ++ type(type), ++ ne(ne), ++ min(min), ++ max(max) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2413,29 +2343,23 @@ struct test_clamp : public test_case { + return out; + } + +- float grad_eps() override { +- return 1e-2f; +- } ++ float grad_eps() override { return 1e-2f; } + +- std::vector grad_expect() override { +- return {0.0f, 1.0f}; +- } ++ std::vector grad_expect() override { return { 0.0f, 1.0f }; } + }; + + // GGML_OP_DIAG_MASK_INF + struct test_diag_mask_inf : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const int n_past; ++ const int n_past; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, n_past); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, n_past); } + +- test_diag_mask_inf(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 10, 3, 2}, +- int n_past = 5) +- : type(type), ne(ne), n_past(n_past) {} ++ test_diag_mask_inf(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 10, 3, 2 }, int n_past = 5) : ++ type(type), ++ ne(ne), ++ n_past(n_past) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2451,30 +2375,27 @@ struct test_diag_mask_inf : public test_case { + + // GGML_OP_SOFT_MAX + struct test_soft_max : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const bool mask; +- const ggml_type m_prec; +- const float scale; +- const float max_bias; ++ const bool mask; ++ const ggml_type m_prec; ++ const float scale; ++ const float max_bias; + +- std::string vars() override { +- return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias); +- } ++ std::string vars() override { return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias); } + + // the 1024 test with bias occasionally fails: + // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL +- virtual double max_nmse_err() override { +- return 1e-6; +- } ++ virtual double max_nmse_err() override { return 1e-6; } + +- test_soft_max(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}, +- bool mask = false, +- ggml_type m_prec = GGML_TYPE_F32, +- float scale = 1.0f, +- float max_bias = 0.0f) +- : type(type), ne(ne), mask(mask), m_prec(m_prec), scale(scale), max_bias(max_bias) {} ++ test_soft_max(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }, bool mask = false, ++ ggml_type 
m_prec = GGML_TYPE_F32, float scale = 1.0f, float max_bias = 0.0f) : ++ type(type), ++ ne(ne), ++ mask(mask), ++ m_prec(m_prec), ++ scale(scale), ++ max_bias(max_bias) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2493,27 +2414,24 @@ struct test_soft_max : public test_case { + return out; + } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_SOFT_MAX_BACK + struct test_soft_max_back : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const float scale; +- const float max_bias; ++ const float scale; ++ const float max_bias; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, scale, max_bias); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, scale, max_bias); } + +- test_soft_max_back(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}, +- float scale = 1.0f, +- float max_bias = 0.0f) +- : type(type), ne(ne), scale(scale), max_bias(max_bias) {} ++ test_soft_max_back(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }, float scale = 1.0f, ++ float max_bias = 0.0f) : ++ type(type), ++ ne(ne), ++ scale(scale), ++ max_bias(max_bias) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2531,33 +2449,45 @@ struct test_soft_max_back : public test_case { + + // GGML_OP_ROPE + GGML_OP_ROPE_BACK + struct test_rope : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- int n_dims; +- int mode; +- int n_ctx; // used to generate positions +- float fs; // freq_scale +- float ef; // ext_factor +- float af; // attn_factor +- bool ff; +- int v; // view (1 : non-contiguous a) +- bool forward; ++ int n_dims; ++ int mode; ++ int n_ctx; // used to generate positions ++ float fs; // freq_scale ++ float ef; // ext_factor ++ float af; // attn_factor ++ bool ff; ++ int v; // view (1 : non-contiguous a) ++ bool forward; + + std::string vars() override { + // forward can be inferred from the op, does not need to be printed + return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v); + } + +- test_rope(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {10, 5, 3, 1}, +- int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, +- float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true) +- : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward) {} ++ test_rope(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 10, 5, 3, 1 }, int n_dims = 10, ++ int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, ++ int v = 0, bool forward = true) : ++ type(type), ++ ne_a(ne_a), ++ n_dims(n_dims), ++ mode(mode), ++ n_ctx(n_ctx), ++ fs(fs), ++ ef(ef), ++ af(af), ++ ff(ff), ++ v(v), ++ forward(forward) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a; + if (v & 1) { +- auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; ++ auto ne = ne_a; ++ ne[0] *= 2; ++ ne[1] *= 4; ++ ne[2] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (forward) { + ggml_set_param(a); +@@ -2574,7 +2504,7 @@ struct test_rope : public test_case { + ggml_set_name(a, "a"); + } + +- const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; ++ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + 
const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + ggml_tensor * pos; +@@ -2587,32 +2517,37 @@ struct test_rope : public test_case { + + ggml_tensor * freq = nullptr; + if (ff) { +- freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2); ++ freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims / 2); + ggml_set_name(freq, "freq"); + } + + ggml_tensor * out; + if (is_mrope) { + if (is_vision) { +- GGML_ASSERT(n_dims/4 > 0); +- int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate ++ GGML_ASSERT(n_dims / 4 > 0); ++ int rope_sections[4] = { n_dims / 4, n_dims / 4, 0, ++ 0 }; // Vision-RoPE only use first two dimension for image (x, y) coordinate + if (forward) { +- out = ggml_rope_multi (ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); ++ out = ggml_rope_multi(ctx, a, pos, freq, n_dims / 2, rope_sections, mode, 0, 10000.0f, fs, ef, af, ++ 1.0f, 1.0f); + } else { +- out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); ++ out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims / 2, rope_sections, mode, 0, 10000.0f, fs, ef, ++ af, 1.0f, 1.0f); + } + } else { +- GGML_ASSERT(n_dims/3 > 0); +- int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0}; ++ GGML_ASSERT(n_dims / 3 > 0); ++ int rope_sections[4] = { n_dims / 3, n_dims / 3, n_dims / 3, 0 }; + if (forward) { +- out = ggml_rope_multi (ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); ++ out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, ++ 1.0f); + } else { +- out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); ++ out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, ++ 1.0f, 1.0f); + } + } + } else { + if (forward) { +- out = ggml_rope_ext (ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); ++ out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } +@@ -2628,14 +2563,14 @@ struct test_rope : public test_case { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + // pos +- const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2]; ++ const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? 
ne_a[2] * 4 : ne_a[2]; + std::vector data(num_pos_ids); + for (int i = 0; i < num_pos_ids; i++) { + data[i] = rand() % n_ctx; + } + ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int)); + } else { +- if (t->ne[0] == n_dims/2) { ++ if (t->ne[0] == n_dims / 2) { + // frequency factors in the range [0.9f, 1.1f] + init_tensor_uniform(t, 0.9f, 1.1f); + } else { +@@ -2645,41 +2580,40 @@ struct test_rope : public test_case { + } + } + +- double max_maa_err() override { +- return 1e-3; +- } ++ double max_maa_err() override { return 1e-3; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_POOL2D + struct test_pool2d : public test_case { +- enum ggml_op_pool pool_type; +- const ggml_type type_input; ++ enum ggml_op_pool pool_type; ++ const ggml_type type_input; + const std::array ne_input; + // kernel size +- const int k0; +- const int k1; ++ const int k0; ++ const int k1; + // stride +- const int s0; +- const int s1; ++ const int s0; ++ const int s1; + // padding +- const int p0; +- const int p1; +- +- std::string vars() override { +- return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1); +- } +- +- test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG, +- ggml_type type_input = GGML_TYPE_F32, +- std::array ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1] +- int k0 = 3, int k1 = 3, +- int s0 = 1, int s1 = 1, +- int p0 = 1, int p1 = 1) +- : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {} ++ const int p0; ++ const int p1; ++ ++ std::string vars() override { return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1); } ++ ++ test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG, ggml_type type_input = GGML_TYPE_F32, ++ std::array ne_input = { 10, 10, 3, 1 }, // [input_width, input_height, input_channels, 1] ++ int k0 = 3, int k1 = 3, int s0 = 1, int s1 = 1, int p0 = 1, int p1 = 1) : ++ pool_type(pool_type), ++ type_input(type_input), ++ ne_input(ne_input), ++ k0(k0), ++ k1(k1), ++ s0(s0), ++ s1(s1), ++ p0(p0), ++ p1(p1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); +@@ -2698,18 +2632,21 @@ struct test_conv_transpose_1d : public test_case { + const std::array ne_input; + const std::array ne_kernel; + +- const int s0; // stride +- const int p0; // padding +- const int d0; // dilation ++ const int s0; // stride ++ const int p0; // padding ++ const int d0; // dilation + +- std::string vars() override { +- return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0); +- } ++ std::string vars() override { return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0); } + +- test_conv_transpose_1d(std::array ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1] +- std::array ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1] +- int s0 = 1, int p0 = 0, int d0 = 1) +- : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {} ++ test_conv_transpose_1d( ++ std::array ne_input = { 197, 32, 1, 1 }, // [input_width, input_height, input_channels, 1] ++ std::array ne_kernel = { 16, 32, 32, 1 }, // [kernel_width, kernel_height, input_channels, 1] ++ int s0 = 1, int p0 = 0, int d0 = 1) : ++ ne_input(ne_input), ++ ne_kernel(ne_kernel), ++ s0(s0), ++ p0(p0), ++ d0(d0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input 
= ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); +@@ -2727,35 +2664,44 @@ struct test_conv_transpose_1d : public test_case { + + // GGML_OP_IM2COL + struct test_im2col : public test_case { +- const ggml_type type_input; +- const ggml_type type_kernel; +- const ggml_type dst_type; ++ const ggml_type type_input; ++ const ggml_type type_kernel; ++ const ggml_type dst_type; + const std::array ne_input; + const std::array ne_kernel; + // stride +- const int s0; +- const int s1; ++ const int s0; ++ const int s1; + // padding +- const int p0; +- const int p1; ++ const int p0; ++ const int p1; + // dilation +- const int d0; +- const int d1; ++ const int d0; ++ const int d1; + // mode +- const bool is_2D; ++ const bool is_2D; + + std::string vars() override { + return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D); + } + +- test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32, +- std::array ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1] +- std::array ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1] +- int s0 = 1, int s1 = 1, +- int p0 = 1, int p1 = 1, +- int d0 = 1, int d1 = 1, +- bool is_2D = true) +- : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {} ++ test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ++ ggml_type dst_type = GGML_TYPE_F32, ++ std::array ne_input = { 10, 10, 3, 1 }, // [input_width, input_height, input_channels, 1] ++ std::array ne_kernel = { 3, 3, 3, 1 }, // [kernel_width, kernel_height, input_channels, 1] ++ int s0 = 1, int s1 = 1, int p0 = 1, int p1 = 1, int d0 = 1, int d1 = 1, bool is_2D = true) : ++ type_input(type_input), ++ type_kernel(type_kernel), ++ dst_type(dst_type), ++ ne_input(ne_input), ++ ne_kernel(ne_kernel), ++ s0(s0), ++ s1(s1), ++ p0(p0), ++ p1(p1), ++ d0(d0), ++ d1(d1), ++ is_2D(is_2D) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); +@@ -2776,19 +2722,22 @@ struct test_im2col : public test_case { + struct test_conv_2d_dw : public test_case { + const std::array ne_input; + const std::array ne_kernel; +- const int stride; +- const int padding; +- const int dilation; +- const bool cwhn; +- +- std::string vars() override { +- return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn); +- } +- +- test_conv_2d_dw(std::array ne_input = {64, 64, 16, 1}, +- std::array ne_kernel = {3, 3, 1, 16}, +- int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false) +- : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {} ++ const int stride; ++ const int padding; ++ const int dilation; ++ const bool cwhn; ++ ++ std::string vars() override { return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn); } ++ ++ test_conv_2d_dw(std::array ne_input = { 64, 64, 16, 1 }, ++ std::array ne_kernel = { 3, 3, 1, 16 }, int stride = 1, int padding = 0, ++ int dilation = 1, bool cwhn = false) : ++ ne_input(ne_input), ++ ne_kernel(ne_kernel), ++ stride(stride), ++ padding(padding), ++ dilation(dilation), ++ cwhn(cwhn) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, 
ne_input.data()); +@@ -2800,15 +2749,14 @@ struct test_conv_2d_dw : public test_case { + if (cwhn) { + // change memory layout to channel-most-contiguous (CWHN), + // then permute it back so NE matches the original input +- input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3)); +- input = ggml_permute(ctx, input, 2, 0, 1, 3); ++ input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3)); ++ input = ggml_permute(ctx, input, 2, 0, 1, 3); + kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0)); + kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); + } + +- ggml_tensor * out = ggml_conv_2d_dw_direct( +- ctx, kernel, input, +- stride, stride, padding, padding, dilation, dilation); ++ ggml_tensor * out = ++ ggml_conv_2d_dw_direct(ctx, kernel, input, stride, stride, padding, padding, dilation, dilation); + ggml_set_name(out, "out"); + return out; + } +@@ -2816,28 +2764,31 @@ struct test_conv_2d_dw : public test_case { + + // GGML_OP_CONCAT + struct test_concat : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- const int64_t ne_b_d; +- const int dim; +- const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b) ++ const int64_t ne_b_d; ++ const int dim; ++ const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b) + +- std::string vars() override { +- return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v); +- } ++ std::string vars() override { return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v); } + +- test_concat(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {10, 5, 5, 5}, +- int64_t ne_b_d = 5, +- int dim = 2, int v = 0) +- : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {} ++ test_concat(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 10, 5, 5, 5 }, int64_t ne_b_d = 5, ++ int dim = 2, int v = 0) : ++ type(type), ++ ne_a(ne_a), ++ ne_b_d(ne_b_d), ++ dim(dim), ++ v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + auto ne_b = ne_a; + ne_b[dim] = ne_b_d; + ggml_tensor * a; + if (v & 1) { +- auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; ++ auto ne = ne_a; ++ ne[0] *= 2; ++ ne[1] *= 4; ++ ne[2] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + +@@ -2849,7 +2800,10 @@ struct test_concat : public test_case { + } + ggml_tensor * b; + if (v & 2) { +- auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4; ++ auto ne = ne_b; ++ ne[0] *= 3; ++ ne[1] *= 2; ++ ne[2] *= 4; + b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + +@@ -2869,18 +2823,17 @@ struct test_concat : public test_case { + + // GGML_OP_ARGSORT + struct test_argsort : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- ggml_sort_order order; ++ ggml_sort_order order; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne, order); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne, order); } + +- test_argsort(ggml_type type = GGML_TYPE_F32, +- std::array ne = {16, 10, 10, 10}, +- ggml_sort_order order = GGML_SORT_ORDER_ASC) +- : type(type), ne(ne), order(order) {} ++ test_argsort(ggml_type type = GGML_TYPE_F32, std::array ne = { 16, 10, 10, 10 }, ++ ggml_sort_order order = GGML_SORT_ORDER_ASC) : ++ type(type), ++ ne(ne), ++ order(order) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2893,7 +2846,7 @@ struct test_argsort : public test_case { + } + + void initialize_tensors(ggml_context * ctx) override { +- std::random_device rd; ++ 
std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { +@@ -2903,7 +2856,7 @@ struct test_argsort : public test_case { + data[i] = rand(); + } + std::shuffle(data.begin(), data.end(), rng); +- ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int)); ++ ggml_backend_tensor_set(t, data.data(), 0, ne[0] * ne[1] * ne[2] * ne[3] * sizeof(int)); + } else if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { +@@ -2923,16 +2876,12 @@ struct test_argsort : public test_case { + + // GGML_OP_SUM + struct test_sum : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_sum(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_sum(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2945,23 +2894,17 @@ struct test_sum : public test_case { + return out; + } + +- float grad_eps() override { +- return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]); +- } ++ float grad_eps() override { return 0.1f * sqrtf(ne[0] * ne[1] * ne[2] * ne[3]); } + }; + + // GGML_OP_SUM_ROWS + struct test_sum_rows : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_sum_rows(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_sum_rows(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2977,16 +2920,12 @@ struct test_sum_rows : public test_case { + + // GGML_OP_MEAN + struct test_mean : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_mean(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_mean(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -2999,27 +2938,26 @@ struct test_mean : public test_case { + return out; + } + +- float grad_eps() override { +- return 0.1f * ne[0]*ne[1]*ne[2]*ne[3]; +- } ++ float grad_eps() override { return 0.1f * ne[0] * ne[1] * ne[2] * ne[3]; } + }; + + // GGML_OP_UPSCALE + struct test_upscale : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const int32_t scale_factor; +- const bool transpose; +- const ggml_scale_mode mode; ++ const int32_t scale_factor; ++ const bool transpose; ++ const ggml_scale_mode mode; + +- std::string vars() override { +- return VARS_TO_STR5(type, ne, scale_factor, mode, transpose); +- } ++ std::string vars() 
override { return VARS_TO_STR5(type, ne, scale_factor, mode, transpose); } + +- test_upscale(ggml_type type = GGML_TYPE_F32, +- std::array ne = {512, 512, 3, 1}, +- int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false) +- : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose), mode(mode) {} ++ test_upscale(ggml_type type = GGML_TYPE_F32, std::array ne = { 512, 512, 3, 1 }, ++ int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false) : ++ type(type), ++ ne(ne), ++ scale_factor(scale_factor), ++ transpose(transpose), ++ mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -3039,26 +2977,25 @@ struct test_upscale : public test_case { + + // GGML_OP_UPSCALE (ext) + struct test_upscale_ext : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + const std::array ne_tgt; +- const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST; ++ const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, ne_tgt, mode); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, ne_tgt, mode); } + +- test_upscale_ext(ggml_type type = GGML_TYPE_F32, +- std::array ne = {2, 5, 7, 11}, +- std::array ne_tgt = {5, 7, 11, 13}, +- ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST) +- : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {} ++ test_upscale_ext(ggml_type type = GGML_TYPE_F32, std::array ne = { 2, 5, 7, 11 }, ++ std::array ne_tgt = { 5, 7, 11, 13 }, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST) : ++ type(type), ++ ne(ne), ++ ne_tgt(ne_tgt), ++ mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + +- ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode); ++ ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1], ne_tgt[2], ne_tgt[3], mode); + ggml_set_name(out, "out"); + + return out; +@@ -3067,20 +3004,19 @@ struct test_upscale_ext : public test_case { + + // GGML_OP_GROUP_NORM + struct test_group_norm : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const int32_t num_groups; +- const float eps; ++ const int32_t num_groups; ++ const float eps; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne, num_groups, eps); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne, num_groups, eps); } + +- test_group_norm(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 64, 320, 1}, +- int32_t num_groups = 32, +- float eps = 1e-6f) +- : type(type), ne(ne), num_groups(num_groups), eps(eps) {} ++ test_group_norm(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 64, 320, 1 }, ++ int32_t num_groups = 32, float eps = 1e-6f) : ++ type(type), ++ ne(ne), ++ num_groups(num_groups), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -3095,18 +3031,16 @@ struct test_group_norm : public test_case { + + // GGML_OP_L2_NORM + struct test_l2_norm : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; +- const float eps; ++ const float eps; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + 
+- test_l2_norm(ggml_type type = GGML_TYPE_F32, +- std::array ne = {64, 64, 320, 1}, +- float eps = 1e-12f) +- : type(type), ne(ne), eps(eps) {} ++ test_l2_norm(ggml_type type = GGML_TYPE_F32, std::array ne = { 64, 64, 320, 1 }, float eps = 1e-12f) : ++ type(type), ++ ne(ne), ++ eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -3121,18 +3055,17 @@ struct test_l2_norm : public test_case { + + // GGML_OP_ACC + struct test_acc : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; + const std::array ne_b; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne_a, ne_b); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne_a, ne_b); } + +- test_acc(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {256, 17, 1, 1}, +- std::array ne_b = {256, 16, 1, 1}) +- : type(type), ne_a(ne_a), ne_b(ne_b) {} ++ test_acc(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 256, 17, 1, 1 }, ++ std::array ne_b = { 256, 16, 1, 1 }) : ++ type(type), ++ ne_a(ne_a), ++ ne_b(ne_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); +@@ -3152,19 +3085,19 @@ struct test_acc : public test_case { + + // GGML_OP_PAD + struct test_pad : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- const int pad_0; +- const int pad_1; ++ const int pad_0; ++ const int pad_1; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne_a, pad_0, pad_1); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne_a, pad_0, pad_1); } + +- test_pad(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {512, 512, 1, 1}, +- int pad_0 = 1, int pad_1 = 1) +- : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} ++ test_pad(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 512, 512, 1, 1 }, int pad_0 = 1, ++ int pad_1 = 1) : ++ type(type), ++ ne_a(ne_a), ++ pad_0(pad_0), ++ pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); +@@ -3179,19 +3112,19 @@ struct test_pad : public test_case { + + // GGML_OP_PAD_REFLECT_1D + struct test_pad_reflect_1d : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- const int pad_0; +- const int pad_1; ++ const int pad_0; ++ const int pad_1; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne_a, pad_0, pad_1); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne_a, pad_0, pad_1); } + +- test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {512, 34, 2, 1}, +- int pad_0 = 10, int pad_1 = 9) +- : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} ++ test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 512, 34, 2, 1 }, int pad_0 = 10, ++ int pad_1 = 9) : ++ type(type), ++ ne_a(ne_a), ++ pad_0(pad_0), ++ pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data()); +@@ -3207,17 +3140,17 @@ struct test_pad_reflect_1d : public test_case { + // GGML_OP_ARANGE + struct test_arange : public test_case { + const ggml_type type; +- const float start; +- const float stop; +- const float step; ++ const float start; ++ const float stop; ++ const float step; + +- std::string vars() override { +- return VARS_TO_STR4(type, start, stop, step); +- } ++ std::string 
vars() override { return VARS_TO_STR4(type, start, stop, step); } + +- test_arange(ggml_type type = GGML_TYPE_F32, +- float start = 0.f, float stop = 10.f, float step = 1.f) +- : type(type), start(start), stop(stop), step(step) {} ++ test_arange(ggml_type type = GGML_TYPE_F32, float start = 0.f, float stop = 10.f, float step = 1.f) : ++ type(type), ++ start(start), ++ stop(stop), ++ step(step) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * out = ggml_arange(ctx, start, stop, step); +@@ -3229,19 +3162,19 @@ struct test_arange : public test_case { + + // GGML_OP_TIMESTEP_EMBEDDING + struct test_timestep_embedding : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- const int dim; +- const int max_period; ++ const int dim; ++ const int max_period; + +- std::string vars() override { +- return VARS_TO_STR4(type, ne_a, dim, max_period); +- } ++ std::string vars() override { return VARS_TO_STR4(type, ne_a, dim, max_period); } + +- test_timestep_embedding(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {2, 1, 1, 1}, +- int dim = 320, int max_period=10000) +- : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {} ++ test_timestep_embedding(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 2, 1, 1, 1 }, int dim = 320, ++ int max_period = 10000) : ++ type(type), ++ ne_a(ne_a), ++ dim(dim), ++ max_period(max_period) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); +@@ -3256,18 +3189,17 @@ struct test_timestep_embedding : public test_case { + + // GGML_OP_LEAKY_RELU + struct test_leaky_relu : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne_a; +- const float negative_slope; ++ const float negative_slope; + +- std::string vars() override { +- return VARS_TO_STR3(type, ne_a, negative_slope); +- } ++ std::string vars() override { return VARS_TO_STR3(type, ne_a, negative_slope); } + +- test_leaky_relu(ggml_type type = GGML_TYPE_F32, +- std::array ne_a = {10, 5, 4, 3}, +- float negative_slope = 0.1f) +- : type(type), ne_a(ne_a), negative_slope(negative_slope) {} ++ test_leaky_relu(ggml_type type = GGML_TYPE_F32, std::array ne_a = { 10, 5, 4, 3 }, ++ float negative_slope = 0.1f) : ++ type(type), ++ ne_a(ne_a), ++ negative_slope(negative_slope) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); +@@ -3282,66 +3214,77 @@ struct test_leaky_relu : public test_case { + + // GGML_OP_FLASH_ATTN_EXT + struct test_flash_attn_ext : public test_case { +- const int64_t hsk; // K head size +- const int64_t hsv; // V head size +- const int64_t nh; // num heads +- const int64_t nr; // repeat in Q, tests for grouped-query attention +- const int64_t kv; // kv size +- const int64_t nb; // batch size ++ const int64_t hsk; // K head size ++ const int64_t hsv; // V head size ++ const int64_t nh; // num heads ++ const int64_t nr; // repeat in Q, tests for grouped-query attention ++ const int64_t kv; // kv size ++ const int64_t nb; // batch size + +- const bool mask; // use mask ++ const bool mask; // use mask + +- const float max_bias; // ALiBi +- const float logit_softcap; // Gemma 2 ++ const float max_bias; // ALiBi ++ const float logit_softcap; // Gemma 2 + +- const ggml_prec prec; +- const ggml_type type_KV; ++ const ggml_prec prec; ++ const ggml_type type_KV; + std::array permute; + + std::string vars() override { + return 
VARS_TO_STR12(hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, permute); + } + +- double max_nmse_err() override { +- return 5e-4; +- } ++ double max_nmse_err() override { return 5e-4; } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + // Just counting matmul costs: + // Q*K^T is nb x hsk x kv, P*V is nb x kv x hsv, per head +- return 2 * nh*nr * nb * (hsk + hsv) * kv; +- } +- +- test_flash_attn_ext(int64_t hsk = 128, int64_t hsv = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, int64_t nb = 8, +- bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_prec prec = GGML_PREC_F32, +- ggml_type type_KV = GGML_TYPE_F16, std::array permute = {0, 1, 2, 3}) +- : hsk(hsk), hsv(hsv), nh(nh), nr(nr), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), prec(prec), type_KV(type_KV), permute(permute) {} ++ return 2 * nh * nr * nb * (hsk + hsv) * kv; ++ } ++ ++ test_flash_attn_ext(int64_t hsk = 128, int64_t hsv = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, ++ int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ++ ggml_prec prec = GGML_PREC_F32, ggml_type type_KV = GGML_TYPE_F16, ++ std::array permute = { 0, 1, 2, 3 }) : ++ hsk(hsk), ++ hsv(hsv), ++ nh(nh), ++ nr(nr), ++ kv(kv), ++ nb(nb), ++ mask(mask), ++ max_bias(max_bias), ++ logit_softcap(logit_softcap), ++ prec(prec), ++ type_KV(type_KV), ++ permute(permute) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV)); + const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV)); + +- auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * { +- int64_t ne[4] = {ne0, ne1, ne2, ne3}; ++ const auto & create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, ++ int64_t ne3) -> ggml_tensor * { ++ int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + int64_t ne_perm[4]; + for (int i = 0; i < 4; ++i) { + ne_perm[permute[i]] = ne[i]; + } + ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]); +- if (permute != std::array{0, 1, 2, 3}) { ++ if (permute != std::array{ 0, 1, 2, 3 }) { + t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]); + } + return t; + }; + +- ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr, 1); ++ ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh * nr, 1); + ggml_set_name(q, "q"); + +- ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, 1); ++ ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, 1); + ggml_set_name(k, "k"); + +- ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, 1); ++ ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, 1); + ggml_set_name(v, "v"); + + ggml_tensor * m = nullptr; +@@ -3350,30 +3293,26 @@ struct test_flash_attn_ext : public test_case { + ggml_set_name(m, "m"); + } + +- ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hsk), max_bias, logit_softcap); ++ ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f / sqrtf(hsk), max_bias, logit_softcap); + ggml_flash_attn_ext_set_prec(out, prec); + ggml_set_name(out, "out"); + + return out; + } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_CROSS_ENTROPY_LOSS + struct test_cross_entropy_loss : public test_case { +- const ggml_type type; 
++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : ++ type(type), ++ ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data()); +@@ -3401,27 +3340,21 @@ struct test_cross_entropy_loss : public test_case { + } + } + +- float grad_eps() override { +- return 1.0f; +- } ++ float grad_eps() override { return 1.0f; } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + // GGML_OP_CROSS_ENTROPY_LOSS_BACK + struct test_cross_entropy_loss_back : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_cross_entropy_loss_back(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_cross_entropy_loss_back(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : ++ type(type), ++ ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * grad = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +@@ -3446,20 +3379,18 @@ struct test_cross_entropy_loss_back : public test_case { + + // GGML_OP_OPT_STEP_ADAMW + struct test_opt_step_adamw : public test_case { +- const ggml_type type; ++ const ggml_type type; + const std::array ne; + +- std::string vars() override { +- return VARS_TO_STR2(type, ne); +- } ++ std::string vars() override { return VARS_TO_STR2(type, ne); } + +- test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, +- std::array ne = {10, 5, 4, 3}) +- : type(type), ne(ne) {} ++ test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : ++ type(type), ++ ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); +- ggml_set_param(a); // Despite tensor a having gradients the output tensor will not. ++ ggml_set_param(a); // Despite tensor a having gradients the output tensor will not. + ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); +@@ -3482,13 +3413,11 @@ struct test_opt_step_adamw : public test_case { + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { +- init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values. ++ init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values. 
+ } + } + +- bool grad_precise() override { +- return true; +- } ++ bool grad_precise() override { return true; } + }; + + enum llm_norm_type { +@@ -3497,30 +3426,30 @@ enum llm_norm_type { + }; + + struct llama_hparams { +- uint32_t n_vocab; +- uint32_t n_embd; +- uint32_t n_head; +- uint32_t n_head_kv; ++ uint32_t n_vocab; ++ uint32_t n_embd; ++ uint32_t n_head; ++ uint32_t n_head_kv; + static constexpr uint32_t n_layer = 1; +- uint32_t n_rot; +- uint32_t n_embd_head; // dimension of values (d_v) +- uint32_t n_ff; ++ uint32_t n_rot; ++ uint32_t n_embd_head; // dimension of values (d_v) ++ uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + // cparams +- static constexpr uint32_t n_ctx = 512; // user-specified context size ++ static constexpr uint32_t n_ctx = 512; // user-specified context size + static constexpr uint32_t n_ctx_orig = n_ctx; + + // batch + int32_t n_tokens; + + // llm_build_context +- static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx +- static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache ++ static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx ++ static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache + +- uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads ++ uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads + return n_embd_head * n_head_kv; + } + }; +@@ -3529,21 +3458,19 @@ struct llama_hparams { + struct test_llm : public test_case { + llama_hparams hp; + +-protected: +- test_llm(llama_hparams hp) +- : hp(std::move(hp)) { +- } ++ protected: ++ test_llm(llama_hparams hp) : hp(std::move(hp)) {} + +-public: +- struct ggml_tensor * llm_build_norm( +- struct ggml_context * ctx, +- struct ggml_tensor * cur, +- struct ggml_tensor * mw, +- struct ggml_tensor * mb, +- llm_norm_type type) { ++ public: ++ struct ggml_tensor * llm_build_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * mw, ++ struct ggml_tensor * mb, llm_norm_type type) { + switch (type) { +- case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break; +- case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break; ++ case LLM_NORM: ++ cur = ggml_norm(ctx, cur, hp.f_norm_eps); ++ break; ++ case LLM_NORM_RMS: ++ cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); ++ break; + } + cur = ggml_mul(ctx, cur, mw); + if (mb) { +@@ -3552,42 +3479,30 @@ public: + return cur; + } + +- void llm_build_kv_store( +- struct ggml_context * ctx, +- struct ggml_tensor * k_l, +- struct ggml_tensor * v_l, +- struct ggml_tensor * k_cur, +- struct ggml_tensor * v_cur) { ++ void llm_build_kv_store(struct ggml_context * ctx, struct ggml_tensor * k_l, struct ggml_tensor * v_l, ++ struct ggml_tensor * k_cur, struct ggml_tensor * v_cur) { + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens)); + +- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(), +- (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head); ++ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens * hp.n_embd_gqa(), ++ (ggml_row_size(k_l->type, hp.n_embd_gqa())) *hp.kv_head); + +- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(), +- ( hp.n_ctx)*ggml_element_size(v_l), +- (hp.kv_head)*ggml_element_size(v_l)); ++ struct 
ggml_tensor * v_cache_view = ++ ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(), (hp.n_ctx) * ggml_element_size(v_l), ++ (hp.kv_head) * ggml_element_size(v_l)); + + // important: storing RoPE-ed version of K in the KV cache! +- ggml_cpy(ctx, k_cur, k_cache_view); ++ ggml_cpy(ctx, k_cur, k_cache_view); + ggml_cpy(ctx, v_cur_t, v_cache_view); + } + +- struct ggml_tensor * llm_build_kqv( +- struct ggml_context * ctx, +- struct ggml_tensor * k_l, +- struct ggml_tensor * v_l, +- struct ggml_tensor * q_cur, +- struct ggml_tensor * kq_mask, +- float kq_scale) { ++ struct ggml_tensor * llm_build_kqv(struct ggml_context * ctx, struct ggml_tensor * k_l, struct ggml_tensor * v_l, ++ struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, float kq_scale) { + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + + struct ggml_tensor * k = +- ggml_view_3d(ctx, k_l, +- hp.n_embd_head, hp.n_kv, hp.n_head_kv, +- ggml_row_size(k_l->type, hp.n_embd_gqa()), +- ggml_row_size(k_l->type, hp.n_embd_head), +- 0); ++ ggml_view_3d(ctx, k_l, hp.n_embd_head, hp.n_kv, hp.n_head_kv, ggml_row_size(k_l->type, hp.n_embd_gqa()), ++ ggml_row_size(k_l->type, hp.n_embd_head), 0); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + +@@ -3595,20 +3510,17 @@ public: + + // split cached v into n_head heads + struct ggml_tensor * v = +- ggml_view_3d(ctx, v_l, +- hp.n_kv, hp.n_embd_head, hp.n_head_kv, +- ggml_element_size(v_l)*hp.n_ctx, +- ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head, +- 0); ++ ggml_view_3d(ctx, v_l, hp.n_kv, hp.n_embd_head, hp.n_head_kv, ggml_element_size(v_l) * hp.n_ctx, ++ ggml_element_size(v_l) * hp.n_ctx * hp.n_embd_head, 0); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + +- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens); ++ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head * hp.n_head, hp.n_tokens); + + struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd); +- cur = ggml_mul_mat(ctx, wo, cur); ++ cur = ggml_mul_mat(ctx, wo, cur); + + return cur; + } +@@ -3631,12 +3543,12 @@ public: + + // Llama + struct test_llama : public test_llm { +- static constexpr float freq_base = 10000.0f; +- static constexpr float freq_scale = 1.0f; +- static constexpr float ext_factor = 0.0f; ++ static constexpr float freq_base = 10000.0f; ++ static constexpr float freq_scale = 1.0f; ++ static constexpr float ext_factor = 0.0f; + static constexpr float attn_factor = 1.0f; +- static constexpr float beta_fast = 32.0f; +- static constexpr float beta_slow = 1.0f; ++ static constexpr float beta_fast = 32.0f; ++ static constexpr float beta_slow = 1.0f; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); +@@ -3648,24 +3560,21 @@ struct test_llama : public test_llm { + return VARS_TO_STR1(n_tokens); + } + +- double max_nmse_err() override { +- return 2e-3; +- } ++ double max_nmse_err() override { return 2e-3; } + +- test_llama(int n_tokens = 1) +- : test_llm({ +- /*n_vocab =*/ 32000, +- /*n_embd =*/ 3200, +- /*n_head =*/ 32, +- /*n_head_kv =*/ 32, +- /*n_rot =*/ 100, +- /*n_embd_head =*/ 100, +- /*n_ff =*/ 8640, +- /*f_norm_eps =*/ 0.f, +- /*f_norm_rms_eps =*/ 1e-5f, +- /*n_tokens =*/ n_tokens, +- }) { +- } ++ test_llama(int n_tokens = 1) : ++ test_llm({ ++ /*n_vocab =*/32000, ++ /*n_embd =*/3200, ++ /*n_head =*/32, ++ /*n_head_kv =*/32, ++ /*n_rot =*/100, ++ /*n_embd_head =*/100, ++ /*n_ff =*/8640, ++ 
/*f_norm_eps =*/0.f, ++ /*f_norm_rms_eps =*/1e-5f, ++ /*n_tokens =*/n_tokens, ++ }) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + struct ggml_tensor * cur; +@@ -3687,7 +3596,7 @@ struct test_llama : public test_llm { + + // norm + ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); +- cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS); ++ cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS); + + // self-attention + { +@@ -3700,37 +3609,33 @@ struct test_llama : public test_llm { + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur); + +- Qcur = ggml_rope_ext( +- ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr, +- hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, +- ext_factor, attn_factor, beta_fast, beta_slow +- ); ++ Qcur = ggml_rope_ext(ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, ++ nullptr, hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, ++ attn_factor, beta_fast, beta_slow); + +- Kcur = ggml_rope_ext( +- ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr, +- hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, +- ext_factor, attn_factor, beta_fast, beta_slow +- ); ++ Kcur = ggml_rope_ext(ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), ++ inp_pos, nullptr, hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, ++ attn_factor, beta_fast, beta_slow); + + llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); + +- cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); ++ cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f / sqrtf(float(hp.n_embd_head))); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA); + + // feed-forward network + ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); +- cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS); ++ cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS); + +- ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); +- ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); +- ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); +- struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur); +- cur = ggml_mul_mat(ctx, ffn_gate, cur); +- cur = ggml_silu(ctx, cur); +- cur = ggml_mul(ctx, cur, tmp); +- cur = ggml_mul_mat(ctx, ffn_down, cur); ++ ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); ++ ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); ++ ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); ++ struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur); ++ cur = ggml_mul_mat(ctx, ffn_gate, cur); ++ cur = ggml_silu(ctx, cur); ++ cur = ggml_mul(ctx, cur, tmp); ++ cur = ggml_mul_mat(ctx, ffn_down, cur); + + cur = ggml_add(ctx, cur, ffn_inp); + +@@ -3741,11 +3646,11 @@ struct test_llama : public test_llm { + cur = inpL; + + ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); +- cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS); ++ cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS); + + // lm_head + ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab); +- cur = 
ggml_mul_mat(ctx, output, cur); ++ cur = ggml_mul_mat(ctx, output, cur); + + return cur; + } +@@ -3753,12 +3658,12 @@ struct test_llama : public test_llm { + + // Falcon + struct test_falcon : public test_llm { +- static constexpr float freq_base = 10000.0f; +- static constexpr float freq_scale = 1.0f; +- static constexpr float ext_factor = 0.0f; ++ static constexpr float freq_base = 10000.0f; ++ static constexpr float freq_scale = 1.0f; ++ static constexpr float ext_factor = 0.0f; + static constexpr float attn_factor = 1.0f; +- static constexpr float beta_fast = 32.0f; +- static constexpr float beta_slow = 1.0f; ++ static constexpr float beta_fast = 32.0f; ++ static constexpr float beta_slow = 1.0f; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); +@@ -3770,24 +3675,21 @@ struct test_falcon : public test_llm { + return VARS_TO_STR1(n_tokens); + } + +- double max_nmse_err() override { +- return 2e-3; +- } ++ double max_nmse_err() override { return 2e-3; } + +- test_falcon(int n_tokens = 1) +- : test_llm({ +- /*n_vocab =*/ 32000, +- /*n_embd =*/ 3200, +- /*n_head =*/ 50, +- /*n_head_kv =*/ 1, +- /*n_rot =*/ 64, +- /*n_embd_head =*/ 64, +- /*n_ff =*/ 8640, +- /*f_norm_eps =*/ 1e-5f, +- /*f_norm_rms_eps =*/ 0.f, +- /*n_tokens =*/ n_tokens, +- }) { +- } ++ test_falcon(int n_tokens = 1) : ++ test_llm({ ++ /*n_vocab =*/32000, ++ /*n_embd =*/3200, ++ /*n_head =*/50, ++ /*n_head_kv =*/1, ++ /*n_rot =*/64, ++ /*n_embd_head =*/64, ++ /*n_ff =*/8640, ++ /*f_norm_eps =*/1e-5f, ++ /*f_norm_rms_eps =*/0.f, ++ /*n_tokens =*/n_tokens, ++ }) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + struct ggml_tensor * cur; +@@ -3808,37 +3710,38 @@ struct test_falcon : public test_llm { + // norm + ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); +- ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM); ++ ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM); + + // self-attention + { + cur = attn_norm; + +- ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa()); ++ ggml_tensor * wqkv = ++ ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2 * hp.n_embd_gqa()); + + cur = ggml_mul_mat(ctx, wqkv, cur); + +- struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd))); +- struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd))); +- struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa()))); ++ struct ggml_tensor * Qcur = ggml_cont( ++ ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0 * sizeof(float) * (hp.n_embd))); ++ struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, ++ cur->nb[1], 1 * sizeof(float) * (hp.n_embd))); ++ struct ggml_tensor * Vcur = ++ ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], ++ 1 * sizeof(float) * (hp.n_embd + hp.n_embd_gqa()))); + +- Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens); ++ Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens); + Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens); + + // using mode = 2 
for neox mode +- Qcur = ggml_rope_ext( +- ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, +- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow +- ); ++ Qcur = ggml_rope_ext(ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); + +- Kcur = ggml_rope_ext( +- ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, +- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow +- ); ++ Kcur = ggml_rope_ext(ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); + + llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); + +- cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); ++ cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f / sqrtf(float(hp.n_embd_head))); + } + + struct ggml_tensor * ffn_inp = cur; +@@ -3847,10 +3750,10 @@ struct test_falcon : public test_llm { + { + ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); + ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); +- cur = attn_norm; +- cur = ggml_mul_mat(ctx, ffn_up, cur); +- cur = ggml_gelu(ctx, cur); +- cur = ggml_mul_mat(ctx, ffn_down, cur); ++ cur = attn_norm; ++ cur = ggml_mul_mat(ctx, ffn_up, cur); ++ cur = ggml_gelu(ctx, cur); ++ cur = ggml_mul_mat(ctx, ffn_down, cur); + } + + cur = ggml_add(ctx, cur, ffn_inp); +@@ -3865,65 +3768,80 @@ struct test_falcon : public test_llm { + + ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); +- cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM); ++ cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM); + + // lm_head + ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab); +- cur = ggml_mul_mat(ctx, output, cur); ++ cur = ggml_mul_mat(ctx, output, cur); + + return cur; + } + }; + +- + // ########################################### + // ## Section 3: GGML Op Test Instantiation ## + // ########################################### + static const ggml_type all_types[] = { +- GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, +- GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, +- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, ++ GGML_TYPE_F32, ++ GGML_TYPE_F16, ++ GGML_TYPE_BF16, ++ GGML_TYPE_Q4_0, ++ GGML_TYPE_Q4_1, ++ GGML_TYPE_Q5_0, ++ GGML_TYPE_Q5_1, + GGML_TYPE_Q8_0, +- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, +- GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, ++ GGML_TYPE_Q2_K, ++ GGML_TYPE_Q3_K, ++ GGML_TYPE_Q4_K, ++ GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends +- GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, +- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, +- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, ++ GGML_TYPE_IQ2_XXS, ++ GGML_TYPE_IQ2_XS, ++ GGML_TYPE_IQ2_S, ++ GGML_TYPE_IQ3_XXS, ++ GGML_TYPE_IQ1_S, ++ GGML_TYPE_IQ1_M, ++ GGML_TYPE_IQ4_NL, ++ GGML_TYPE_IQ3_S, ++ GGML_TYPE_IQ4_XS, + }; + +-static const ggml_type base_types[] = { +- GGML_TYPE_F32, GGML_TYPE_F16, +- GGML_TYPE_Q8_0, // for I8MM tests +- GGML_TYPE_Q4_0, +- GGML_TYPE_Q4_1, // for I8MM tests +- GGML_TYPE_Q4_K, +- GGML_TYPE_IQ2_XXS +-}; ++static const ggml_type base_types[] = { GGML_TYPE_F32, GGML_TYPE_F16, ++ GGML_TYPE_Q8_0, // for I8MM tests ++ GGML_TYPE_Q4_0, ++ GGML_TYPE_Q4_1, // for I8MM tests ++ GGML_TYPE_Q4_K, GGML_TYPE_IQ2_XXS }; + + 
static const ggml_type other_types[] = { + GGML_TYPE_Q4_1, +- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, ++ GGML_TYPE_Q5_0, ++ GGML_TYPE_Q5_1, + GGML_TYPE_Q8_0, +- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, ++ GGML_TYPE_Q2_K, ++ GGML_TYPE_Q3_K, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends +- GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, +- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, +- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, ++ GGML_TYPE_IQ2_XS, ++ GGML_TYPE_IQ2_S, ++ GGML_TYPE_IQ3_XXS, ++ GGML_TYPE_IQ1_S, ++ GGML_TYPE_IQ1_M, ++ GGML_TYPE_IQ4_NL, ++ GGML_TYPE_IQ3_S, ++ GGML_TYPE_IQ4_XS, + GGML_TYPE_BF16, + }; + + // Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low + static std::vector> make_test_cases_eval() { + std::vector> test_cases; +- std::default_random_engine rng(0); ++ std::default_random_engine rng(0); + + // unary ops +- for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { +- for (int v : {0, 1}) { ++ for (ggml_type type : { GGML_TYPE_F16, GGML_TYPE_F32 }) { ++ for (int v : { 0, 1 }) { + for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { + test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 128, 2, 2, 2 }, v)); + test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 5, 7, 11, 13 }, v)); +@@ -3933,37 +3851,38 @@ static std::vector> make_test_cases_eval() { + + test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false)); + for (ggml_type type : all_types) { +- for (int b : {1, 7}) { +- for (bool v : {false, true}) { ++ for (int b : { 1, 7 }) { ++ for (bool v : { false, true }) { + test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v)); + } + } + } +- for (int b : {1, 7}) { +- for (bool v : {false, true}) { ++ for (int b : { 1, 7 }) { ++ for (bool v : { false, true }) { + test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v)); + } + } + + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false)); + for (ggml_type type : all_types) { +- for (bool v : {false, true}) { ++ for (bool v : { false, true }) { + test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v)); + } + } +- for (bool v : {false, true}) { ++ for (bool v : { false, true }) { + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_I32, 256, 5, 4, 1, v)); + } + +- for (ggml_type type_input : {GGML_TYPE_F32}) { +- for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { +- for (int k0 : {1, 3}) { +- for (int k1 : {1, 3}) { +- for (int s0 : {1, 2}) { +- for (int s1 : {1, 2}) { +- for (int p0 : {0, 1}) { +- for (int p1 : {0, 1}) { +- test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1)); ++ for (ggml_type type_input : { GGML_TYPE_F32 }) { ++ for (ggml_op_pool pool_type : { GGML_OP_POOL_AVG, GGML_OP_POOL_MAX }) { ++ for (int k0 : { 1, 3 }) { ++ for (int k1 : { 1, 3 }) { ++ for (int s0 : { 1, 2 }) { ++ for (int s1 : { 1, 2 }) { ++ for (int p0 : { 0, 1 }) { ++ for (int p1 : { 0, 1 }) { ++ test_cases.emplace_back(new test_pool2d(pool_type, type_input, { 10, 10, 3, 1 }, k0, ++ k1, s0, s1, p0, p1)); + } + } + } +@@ -3974,15 +3893,17 @@ static std::vector> make_test_cases_eval() { + } + + // im2col 1D +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, 
{3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); +- for (int s0 : {1, 3}) { +- for (int p0 : {0, 3}) { +- for (int d0 : {1, 3}) { +- test_cases.emplace_back(new test_im2col( +- GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1}, +- s0, 0, p0, 0, d0, 0, false)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, { 3000, 128, 1, 1 }, ++ { 3, 128, 1280, 1 }, 1, 0, 1, 0, 1, 0, false)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, { 3000, 128, 1, 1 }, ++ { 3, 128, 1280, 1 }, 1, 0, 1, 0, 1, 0, false)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 3000, 128, 1, 1 }, ++ { 3, 128, 1280, 1 }, 1, 0, 1, 0, 1, 0, false)); ++ for (int s0 : { 1, 3 }) { ++ for (int p0 : { 0, 3 }) { ++ for (int d0 : { 1, 3 }) { ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, { 20, 2, 2, 1 }, ++ { 3, 2, 2, 1 }, s0, 0, p0, 0, d0, 0, false)); + } + } + } +@@ -3991,15 +3912,15 @@ static std::vector> make_test_cases_eval() { + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); +- for (int s0 : {1, 3}) { +- for (int s1 : {1, 3}) { +- for (int p0 : {0, 3}) { +- for (int p1 : {0, 3}) { +- for (int d0 : {1, 3}) { +- for (int d1 : {1, 3}) { +- test_cases.emplace_back(new test_im2col( +- GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2}, +- s0, s1, p0, p1, d0, d1, true)); ++ for (int s0 : { 1, 3 }) { ++ for (int s1 : { 1, 3 }) { ++ for (int p0 : { 0, 3 }) { ++ for (int p1 : { 0, 3 }) { ++ for (int d0 : { 1, 3 }) { ++ for (int d1 : { 1, 3 }) { ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, ++ { 20, 20, 2, 2 }, { 3, 3, 2, 2 }, s0, s1, p0, p1, ++ d0, d1, true)); + } + } + } +@@ -4008,14 +3929,22 @@ static std::vector> make_test_cases_eval() { + } + + // extra tests for im2col 2D +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true)); +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, 
GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 1, 32 }, ++ { 3, 3, 1, 32 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 2, 32 }, ++ { 3, 3, 2, 32 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 1, 1024 }, ++ { 3, 3, 1, 1024 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 2, 1024 }, ++ { 3, 3, 2, 1024 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 1, 2048 }, ++ { 3, 3, 1, 2048 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 2, 2048 }, ++ { 3, 3, 2, 2048 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 1, 2560 }, ++ { 3, 3, 1, 2560 }, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, { 12, 12, 2, 2560 }, ++ { 3, 3, 2, 2560 }, 1, 1, 1, 1, 1, 1, true)); + + // sycl backend will limit task global_range < MAX_INT + // test cases for 2D im2col with large input W and H (occurs in stable-diffusion) +@@ -4024,65 +3953,65 @@ static std::vector> make_test_cases_eval() { + // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + +- test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false)); +- test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true)); +- test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); +- test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 17, 34, 9, 1 }, { 3, 3, 1, 9 }, 1, 0, 1, false)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 17, 34, 9, 1 }, { 3, 3, 1, 9 }, 1, 0, 1, true)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 32, 8, 64, 1 }, { 3, 3, 1, 64 }, 2, 1, 1, false)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 32, 8, 64, 1 }, { 3, 3, 1, 64 }, 2, 1, 1, true)); + + test_cases.emplace_back(new test_conv_transpose_1d()); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); +- test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); +- +- test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1})); +- test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1})); +- +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 
1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438, 3, 1, 1})); +- +- for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 +- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1})); +- test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2})); +- } +- +- for (bool view : {false, true}) { +- test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 1}, view)); +- test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {2, 1, 1, 1}, view)); +- test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 2, 1, 1}, view)); +- test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 2, 1}, view)); +- test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 2}, view)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 2, 3, 2, 1 }, 3, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 2, 3, 2, 1 }, 2, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 2, 3, 2, 1 }, 1, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 3, 2, 2, 1 }, 2, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 3, 2, 2, 1 }, 1, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 3, 2, 1, 1 }, { 3, 1, 2, 1 }, 1, 0, 1)); ++ test_cases.emplace_back(new test_conv_transpose_1d({ 2, 1, 1, 1 }, { 3, 1, 1, 1 }, 1, 0, 1)); ++ ++ test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, { 4, 500, 1, 1 })); ++ test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, { 4, 5000, 1, 1 })); ++ ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 32, 1, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 100, 10, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 1024, 10, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 1024, 12, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 2000, 10, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 5438, 3, 1, 1 })); ++ ++ for (int ne3 : { 1, 3 }) { // CUDA backward pass only supports ne3 == 1 ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, { 10, 5, 4, ne3 }, { 1, 1, 1, 1 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, { 10, 5, 4, ne3 }, { 2, 1, 1, 1 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, { 10, 5, 4, ne3 }, { 1, 2, 1, 1 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, { 10, 5, 4, ne3 }, { 1, 1, 2, 1 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, { 10, 5, 4, ne3 }, { 1, 1, 1, 2 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, { 10, 5, 4, ne3 }, { 2, 1, 1, 1 })); ++ test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, { 10, 5, 4, ne3 }, { 1, 1, 1, 
2 })); ++ } ++ ++ for (bool view : { false, true }) { ++ test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, { 8, 6, 4, 2 }, { 1, 1, 1, 1 }, view)); ++ test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, { 8, 6, 4, 2 }, { 2, 1, 1, 1 }, view)); ++ test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, { 8, 6, 4, 2 }, { 1, 2, 1, 1 }, view)); ++ test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, { 8, 6, 4, 2 }, { 1, 1, 2, 1 }, view)); ++ test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, { 8, 6, 4, 2 }, { 1, 1, 1, 2 }, view)); + } + + test_cases.emplace_back(new test_dup(GGML_TYPE_F32)); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); + test_cases.emplace_back(new test_dup(GGML_TYPE_I32)); + test_cases.emplace_back(new test_dup(GGML_TYPE_I16)); +- test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows +- test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); +- test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous +- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); ++ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, { 10, 10, 5, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, { 10, 10, 5, 1 }, { 0, 2, 1, 3 })); // dup by rows ++ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, { 10, 10, 5, 1 }, { 1, 0, 2, 3 })); ++ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, { 10, 10, 5, 1 }, { 1, 0, 2, 3 })); // dup dst not-contiguous ++ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, { 10, 8, 3, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, { 10, 8, 3, 1 }, { 1, 2, 0, 3 })); + + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { +- test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); ++ test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, { 6, 5, 4, 3 }, dim)); + } + + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { +- test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); ++ test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, { 6, 5, 4, 3 }, dim)); + } + + // same-type copy +@@ -4090,75 +4019,76 @@ static std::vector> make_test_cases_eval() { + const auto nk = ggml_blck_size(type); + + for (int k = 1; k < 4; ++k) { +- test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); +- test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); ++ test_cases.emplace_back(new test_cpy(type, type, { k * nk, 2, 3, 4 })); ++ test_cases.emplace_back(new test_cpy(type, type, { k * nk, 2, 3, 4 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_cpy(type, type, { k * nk, 2, 3, 4 }, { 0, 3, 1, 2 }, { 0, 2, 1, 3 })); + } + } + +- for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { ++ for (ggml_type type_src : { GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32 }) { + for (ggml_type type_dst : all_types) { +- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); +- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows ++ test_cases.emplace_back(new 
test_cpy(type_src, type_dst, { 256, 4, 4, 4 })); ++ test_cases.emplace_back(new test_cpy(type_src, type_dst, { 256, 2, 3, 4 }, { 0, 2, 1, 3 })); // cpy by rows + } + } + for (ggml_type type_src : all_types) { +- for (ggml_type type_dst : {GGML_TYPE_F32}) { +- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); +- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows ++ for (ggml_type type_dst : { GGML_TYPE_F32 }) { ++ test_cases.emplace_back(new test_cpy(type_src, type_dst, { 256, 4, 4, 4 })); ++ test_cases.emplace_back(new test_cpy(type_src, type_dst, { 256, 2, 3, 4 }, { 0, 2, 1, 3 })); // cpy by rows + } + } +- for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { +- for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { +- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous ++ for (ggml_type type_src : { GGML_TYPE_F16, GGML_TYPE_F32 }) { ++ for (ggml_type type_dst : { GGML_TYPE_F16, GGML_TYPE_F32 }) { ++ test_cases.emplace_back( ++ new test_cpy(type_src, type_dst, { 256, 2, 3, 4 }, { 1, 0, 2, 3 })); // cpy not-contiguous + } + } + + test_cases.emplace_back(new test_cont()); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5})); +- test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7})); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, { 2, 1, 1, 1 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, { 2, 1, 3, 5 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, { 2, 3, 5, 7 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, { 2, 1, 1, 1 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, { 2, 1, 3, 5 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, { 2, 3, 5, 7 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, { 2, 1, 1, 1 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, { 2, 1, 3, 5 })); ++ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, { 2, 3, 5, 7 })); + + auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr) { +- for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) { ++ for (auto op : { ggml_add, ggml_sub, ggml_mul, ggml_div }) { + test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr)); + } + }; +- for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { +- add_test_bin_bcast(type, {1, 1, 8, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 1, 1}, {32, 1, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 320, 320}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 1, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 1, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 1, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 1}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 2}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 2}); +- 
add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2}); +- add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2}); ++ for (ggml_type type : { GGML_TYPE_F16, GGML_TYPE_F32 }) { ++ add_test_bin_bcast(type, { 1, 1, 8, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 1, 1 }, { 32, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 320, 320 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 1, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 2, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 2, 1, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 1, 2, 1 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 1, 1, 2 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 1, 2, 2 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 1, 2, 2, 2 }); ++ add_test_bin_bcast(type, { 10, 5, 4, 3 }, { 2, 2, 2, 2 }); + + // stable diffusion +- add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1}); +- add_test_bin_bcast(type, {1280, 16, 16, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 256, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 1280, 1}, {16, 16, 1, 1}); +- add_test_bin_bcast(type, {16, 16, 1280, 1}, {1, 1, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 1920, 1}, {16, 16, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 2560, 1}, {16, 16, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 1280, 1}, {32, 32, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 1920, 1}, {32, 32, 1, 1}); +- add_test_bin_bcast(type, {1, 1, 640, 1}, {32, 32, 1, 1}); +- add_test_bin_bcast(type, {5120, 1, 1, 1}, {1, 256, 1, 1}); +- add_test_bin_bcast(type, {640, 1, 1, 1}, {1, 1, 1, 1}); ++ add_test_bin_bcast(type, { 1280, 1, 1, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 1280, 1, 1, 1 }, { 1, 16, 16, 1 }); ++ add_test_bin_bcast(type, { 1280, 16, 16, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 1280, 1, 1, 1 }, { 1, 256, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 1280, 1 }, { 16, 16, 1, 1 }); ++ add_test_bin_bcast(type, { 16, 16, 1280, 1 }, { 1, 1, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 1920, 1 }, { 16, 16, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 2560, 1 }, { 16, 16, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 1280, 1 }, { 32, 32, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 1920, 1 }, { 32, 32, 1, 1 }); ++ add_test_bin_bcast(type, { 1, 1, 640, 1 }, { 32, 32, 1, 1 }); ++ add_test_bin_bcast(type, { 5120, 1, 1, 1 }, { 1, 256, 1, 1 }); ++ add_test_bin_bcast(type, { 640, 1, 1, 1 }, { 1, 1, 1, 1 }); + //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {1, 1, 1, 1}); + //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1}); + } +@@ -4167,20 +4097,20 @@ static std::vector> make_test_cases_eval() { + test_cases.emplace_back(new test_scale()); + test_cases.emplace_back(new test_silu_back()); + +- for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) { +- for (bool v : {false, true}) { +- test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps)); +- test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps)); ++ for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) { ++ for (bool v : { false, true }) { ++ test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { 64, 5, 4, 3 }, v, eps)); ++ test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { 64, 5, 4, 3 }, v, eps)); + } +- test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, 
eps)); +- test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); ++ test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { 64, 5, 4, 3 }, eps)); ++ test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { 64, 5, 4, 3 }, eps)); + } + +- test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); ++ test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { 64, 5, 4, 3 }, 1e-12f)); + +- test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1})); +- test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1})); +- test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1})); ++ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, { 4, 1536, 1, 1 }, { 4, 1536, 1, 1 })); ++ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, { 8, 1536, 1, 1 }, { 4, 1536, 1, 1 })); ++ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, { 4, 1536, 4, 1 }, { 4, 1536, 1, 1 })); + + test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4)); + +@@ -4201,59 +4131,60 @@ static std::vector> make_test_cases_eval() { + + for (ggml_type type_a : all_types) { + for (int i = 1; i < 10; ++i) { +- test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1 }, { 1, 1 })); + } + } + + #if 1 + for (ggml_type type_a : base_types) { +- for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { ++ for (ggml_type type_b : { GGML_TYPE_F32, GGML_TYPE_F16 }) { + // test cases without permutation +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 2})); +- +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 2})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 
1, 256, { 1, 1 }, { 1, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 1 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 1 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 2 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 2 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 2 }, { 1, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 3, 2 }, { 2, 2 })); ++ ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1 }, { 1, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 1 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 1 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 2 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 2 }, { 2, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 2 }, { 1, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 3, 2 }, { 2, 2 })); + + // test cases with permutation +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 2, 3 }, { 1, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 2, 3 }, { 1, 1 }, { 0, 1, 3, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 2, 3 }, { 1, 1 }, { 0, 3, 2, 1 })); + +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, { 2, 3 }, { 1, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, { 2, 3 }, { 1, 1 }, { 0, 1, 3, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, { 2, 3 }, { 1, 1 }, { 0, 3, 2, 1 })); + +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 2, 3 }, { 1, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 2, 3 }, { 1, 1 }, { 0, 1, 3, 2 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 2, 3 }, { 1, 1 }, { 0, 3, 2, 1 })); + + // test cases with large ne00/ne10 to cover stream-k fixup +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 
1024, {3, 2}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, { 3, 2 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, { 3, 2 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, { 3, 2 }, { 1, 1 })); + } + } + for (ggml_type type_a : other_types) { +- for (ggml_type type_b : {GGML_TYPE_F32}) { ++ for (ggml_type type_b : { GGML_TYPE_F32 }) { + if (ggml_blck_size(type_a) != 256) { +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1})); ++ test_cases.emplace_back( ++ new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), { 1, 1 }, { 1, 1 })); + } +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1 }, { 1, 1 })); + } + } + #else +@@ -4265,31 +4196,35 @@ static std::vector> make_test_cases_eval() { + std::uniform_int_distribution<> dist_k(1, 16); + for (int i = 0; i < 1000; i++) { + for (ggml_type type_a : all_types) { +- for (ggml_type type_b : {GGML_TYPE_F32}) { ++ for (ggml_type type_b : { GGML_TYPE_F32 }) { + int m = dist_m(rng); + int n = dist_n(rng); + int k = dist_k(rng) * ggml_blck_size(type_a); +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1})); ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1 }, { 1, 1 })); + } + } + } + #endif + +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); +- +- for (auto bs : {1,2,4,8}) { +- for (auto nr : {1,4}) { ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1 }, { 1, 1 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1 }, { 4, 1 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1 }, { 4, 1 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1 }, { 4, 1 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1 }, { 4, 1 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1 }, { 4, 1 })); ++ test_cases.emplace_back( ++ new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, { 1, 1 }, { 4, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back( ++ new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, { 1, 1 }, { 4, 1 }, { 0, 2, 1, 3 })); ++ ++ for (auto bs : { 1, 2, 
4, 8 }) { ++ for (auto nr : { 1, 4 }) { + for (uint32_t m = 0; m < 2; ++m) { + for (uint32_t k = 0; k < 2; ++k) { +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true)); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, ++ { bs, 1 }, { nr, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, ++ { bs, 1 }, { nr, 1 }, { 0, 1, 2, 3 }, true)); + } + } + } +@@ -4302,11 +4237,11 @@ static std::vector> make_test_cases_eval() { + // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1})); + + for (ggml_type type_a : base_types) { +- for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { +- for (int n_mats : {4, 8}) { +- for (int n_used : {1, 2, 4}) { +- for (bool b : {false, true}) { +- for (int n : {1, 32, 129}) { ++ for (ggml_type type_b : { GGML_TYPE_F32 /*, GGML_TYPE_F16 */ }) { ++ for (int n_mats : { 4, 8 }) { ++ for (int n_used : { 1, 2, 4 }) { ++ for (bool b : { false, true }) { ++ for (int n : { 1, 32, 129 }) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); +@@ -4318,11 +4253,11 @@ static std::vector> make_test_cases_eval() { + } + + for (ggml_type type_a : other_types) { +- for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { +- for (int n_mats : {4}) { +- for (int n_used : {2}) { +- for (bool b : {false}) { +- for (int n : {1, 32}) { ++ for (ggml_type type_b : { GGML_TYPE_F32 /*, GGML_TYPE_F16 */ }) { ++ for (int n_mats : { 4 }) { ++ for (int n_used : { 2 }) { ++ for (bool b : { false }) { ++ for (int n : { 1, 32 }) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); +@@ -4334,14 +4269,15 @@ static std::vector> make_test_cases_eval() { + } + + for (ggml_type type_a : base_types) { +- for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { +- for (int n : {1, 16}) { +- for (int k : {1, 16}) { +- for (int bs2 : {1, 3}) { +- for (int bs3 : {1, 3}) { +- for (int nr2 : {1, 2}) { +- for (int nr3 : {1, 2}) { +- test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, n, k, {bs2, bs3}, {nr2, nr3})); ++ for (ggml_type type_b : { GGML_TYPE_F32, GGML_TYPE_F16 }) { ++ for (int n : { 1, 16 }) { ++ for (int k : { 1, 16 }) { ++ for (int bs2 : { 1, 3 }) { ++ for (int bs3 : { 1, 3 }) { ++ for (int nr2 : { 1, 2 }) { ++ for (int nr3 : { 1, 2 }) { ++ test_cases.emplace_back( ++ new test_out_prod(type_a, type_b, 256, n, k, { bs2, bs3 }, { nr2, nr3 })); + } + } + } +@@ -4351,7 +4287,7 @@ static std::vector> make_test_cases_eval() { + } + } + +- for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { ++ for (ggml_type type : { GGML_TYPE_F16, GGML_TYPE_F32 }) { + test_cases.emplace_back(new test_sqr(type)); + test_cases.emplace_back(new test_sqrt(type)); + test_cases.emplace_back(new test_log(type)); +@@ -4360,9 +4296,9 @@ static std::vector> make_test_cases_eval() { + test_cases.emplace_back(new test_clamp(type)); + } + +- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); +- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5)); +- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5)); 
++ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, { 10, 10, 1, 1 }, 5)); ++ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, { 10, 10, 3, 1 }, 5)); ++ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, { 10, 10, 3, 2 }, 5)); + + #if 0 + std::uniform_int_distribution<> dist_ne1(1, 50); +@@ -4379,78 +4315,101 @@ static std::vector> make_test_cases_eval() { + exponent <<= 1; + } + #endif +- for (bool mask : {false, true}) { +- for (float max_bias : {0.0f, 8.0f}) { +- if (!mask && max_bias > 0.0f) continue; +- for (float scale : {1.0f, 0.1f}) { +- for (int64_t ne0 : {16, 1024}) { +- for (int64_t ne1 : {16, 1024}) { ++ for (bool mask : { false, true }) { ++ for (float max_bias : { 0.0f, 8.0f }) { ++ if (!mask && max_bias > 0.0f) { ++ continue; ++ } ++ for (float scale : { 1.0f, 0.1f }) { ++ for (int64_t ne0 : { 16, 1024 }) { ++ for (int64_t ne1 : { 16, 1024 }) { + if (mask) { +- for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) { +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, m_prec, scale, max_bias)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, m_prec, scale, max_bias)); ++ for (ggml_type m_prec : { GGML_TYPE_F32, GGML_TYPE_F16 }) { ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { ne0, ne1, 1, 1 }, mask, ++ m_prec, scale, max_bias)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { ne0 - 1, ne1 - 1, 1, 1 }, ++ mask, m_prec, scale, max_bias)); + } + } else { + /* The precision of mask here doesn't matter as boolean mask is false */ +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { ne0, ne1, 1, 1 }, mask, ++ GGML_TYPE_F32, scale, max_bias)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { ne0 - 1, ne1 - 1, 1, 1 }, mask, ++ GGML_TYPE_F32, scale, max_bias)); + } + } + } + } + } + } +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, GGML_TYPE_F32, 0.1f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 8.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 8.0f)); +- +- for (float max_bias : {0.0f, 8.0f}) { +- for (float scale : {1.0f, 0.1f}) { +- for (int64_t ne0 : {16, 1024}) { +- for (int64_t ne1 : {16, 1024}) { +- test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, max_bias)); +- test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 16, 2, 32, 1 }, true, GGML_TYPE_F32, 0.1f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 16, 2, 32, 1 }, true, GGML_TYPE_F16, 0.1f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 16, 
2, 32, 1 }, false, GGML_TYPE_F32, 0.1f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 32, 2, 32, 1 }, true, GGML_TYPE_F32, 0.1f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 32, 2, 32, 1 }, true, GGML_TYPE_F16, 0.1f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 32, 2, 32, 1 }, true, GGML_TYPE_F32, 0.1f, 8.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 32, 2, 32, 1 }, true, GGML_TYPE_F16, 0.1f, 8.0f)); ++ ++ for (float max_bias : { 0.0f, 8.0f }) { ++ for (float scale : { 1.0f, 0.1f }) { ++ for (int64_t ne0 : { 16, 1024 }) { ++ for (int64_t ne1 : { 16, 1024 }) { ++ test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, { ne0, ne1, 1, 1 }, scale, max_bias)); ++ test_cases.emplace_back( ++ new test_soft_max_back(GGML_TYPE_F32, { ne0 - 1, ne1 - 1, 1, 1 }, scale, max_bias)); + } + } + } + } + +- for (bool fw : {true, false}) { // fw == forward ++ for (bool fw : { true, false }) { // fw == forward + bool all = true; + + for (float v : { 0, 1 }) { + for (float fs : { 1.0f, 1.4245f }) { + for (float ef : { 0.0f, 0.7465f }) { + for (float af : { 1.0f, 1.4245f }) { +- for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { +- for (bool ff : {false, true}) { // freq_factors +- test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B ++ for (ggml_type type : { GGML_TYPE_F32, GGML_TYPE_F16 }) { ++ for (bool ff : { false, true }) { // freq_factors ++ test_cases.emplace_back(new test_rope(type, { 128, 32, 2, 1 }, 128, 0, 512, fs, ef, af, ++ ff, v, fw)); // llama 7B + + if (all) { +- test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 13B +- test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 30B +- test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 65B ++ test_cases.emplace_back(new test_rope(type, { 128, 40, 2, 1 }, 128, 0, 512, fs, ef, ++ af, ff, v, fw)); // llama 13B ++ test_cases.emplace_back(new test_rope(type, { 128, 52, 2, 1 }, 128, 0, 512, fs, ef, ++ af, ff, v, fw)); // llama 30B ++ test_cases.emplace_back(new test_rope(type, { 128, 64, 2, 1 }, 128, 0, 512, fs, ef, ++ af, ff, v, fw)); // llama 65B + } + + if (all) { +- test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B) +- test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B) +- test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) +- test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm) +- test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2) ++ test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1 }, 64, 2, 512, fs, ef, af, ++ ff, v, fw)); // neox (falcon 7B) ++ test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1 }, 64, 2, 512, fs, ef, ++ af, ff, v, fw)); // neox (falcon 7B) ++ test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1 }, 64, 2, 512, fs, ef, af, ++ ff, v, fw)); // neox (falcon 40B) ++ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1 }, 20, 2, 512, fs, ef, ++ af, ff, v, fw)); // neox (stablelm) ++ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1 }, 32, 2, 512, fs, ef, ++ 
af, ff, v, fw)); // neox (phi-2) + } + + if (all) { +- test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B) +- test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B) +- test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT) ++ test_cases.emplace_back(new test_rope(type, { 128, 12, 2, 1 }, 128, ++ GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, ++ fw)); // rope_multi,m-rope (qwen2vl 2B) ++ test_cases.emplace_back(new test_rope(type, { 128, 28, 2, 1 }, 128, ++ GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, ++ fw)); // rope_multi,m-rope (qwen2vl 7B) ++ test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1 }, 80, ++ GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, ++ fw)); // rope_multi,m-rope (qwen2vl ViT) + } + +- test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) ++ test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1 }, 64, 2, 512, fs, ef, af, ++ ff, v, fw)); // neox (falcon 40B) + } + } + +@@ -4462,29 +4421,34 @@ static std::vector> make_test_cases_eval() { + } + + for (int v : { 0, 1, 2, 3 }) { +- for (int dim : { 0, 1, 2, 3, }) { +- test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v)); +- test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v)); ++ for (int dim : { ++ 0, ++ 1, ++ 2, ++ 3, ++ }) { ++ test_cases.emplace_back(new test_concat(GGML_TYPE_F32, { 11, 12, 13, 14 }, 7, dim, v)); ++ test_cases.emplace_back(new test_concat(GGML_TYPE_I32, { 11, 12, 13, 14 }, 7, dim, v)); + } + } + +- for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { +- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); +- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); +- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen ++ for (ggml_sort_order order : { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC }) { ++ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, { 8, 1, 1, 1 }, order)); ++ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, { 16, 10, 10, 10 }, order)); ++ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, { 60, 10, 10, 10 }, order)); // qwen + } + +- for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) { +- test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode)); +- test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true)); +- test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode)); ++ for (ggml_scale_mode mode : { GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR }) { ++ test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 2 }, 2, mode)); ++ test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 2 }, 2, mode, true)); ++ test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, { 2, 5, 7, 11 }, { 5, 7, 11, 13 }, mode)); + } + + test_cases.emplace_back(new test_sum()); + test_cases.emplace_back(new test_sum_rows()); + test_cases.emplace_back(new test_mean()); +- test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1})); +- test_cases.emplace_back(new 
test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); ++ test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, { 64, 64, 320, 1 })); ++ test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, { 9, 9, 1280, 1 })); + test_cases.emplace_back(new test_acc()); + test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_reflect_1d()); +@@ -4494,30 +4458,60 @@ static std::vector> make_test_cases_eval() { + + for (int hsk : { 64, 80, 128, 192, 256, 576 }) { + for (int hsv : { 64, 80, 128, 192, 256, 512 }) { +- if (hsk != 192 && hsk != 576 && hsk != hsv) continue; +- if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; +- if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA ++ if (hsk != 192 && hsk != 576 && hsk != hsv) { ++ continue; ++ } ++ if (hsk == 192 && (hsv != 128 && hsv != 192)) { ++ continue; ++ } ++ if (hsk == 576 && hsv != 512) { ++ continue; // DeepSeek MLA ++ } + +- for (bool mask : { true, false } ) { ++ for (bool mask : { true, false }) { + for (float max_bias : { 0.0f, 8.0f }) { +- if (!mask && max_bias > 0.0f) continue; +- for (float logit_softcap : {0.0f, 10.0f}) { +- if (hsk != 128 && logit_softcap != 0.0f) continue; +- for (int nh : { 4, }) { ++ if (!mask && max_bias > 0.0f) { ++ continue; ++ } ++ for (float logit_softcap : { 0.0f, 10.0f }) { ++ if (hsk != 128 && logit_softcap != 0.0f) { ++ continue; ++ } ++ for (int nh : { ++ 4, ++ }) { + for (int nr : { 1, 4, 16 }) { +- if (nr == 16 && hsk != 128) continue; +- for (int kv : { 512, 1024, }) { +- if (nr != 1 && kv != 512) continue; +- for (int nb : { 1, 3, 32, 35, }) { +- for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) { +- if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue; +- for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { +- test_cases.emplace_back(new test_flash_attn_ext( +- hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV)); ++ if (nr == 16 && hsk != 128) { ++ continue; ++ } ++ for (int kv : { ++ 512, ++ 1024, ++ }) { ++ if (nr != 1 && kv != 512) { ++ continue; ++ } ++ for (int nb : { ++ 1, ++ 3, ++ 32, ++ 35, ++ }) { ++ for (ggml_prec prec : { GGML_PREC_F32, GGML_PREC_DEFAULT }) { ++ if (hsk != 128 && prec == GGML_PREC_DEFAULT) { ++ continue; ++ } ++ for (ggml_type type_KV : ++ { GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0 }) { ++ test_cases.emplace_back( ++ new test_flash_attn_ext(hsk, hsv, nh, nr, kv, nb, mask, max_bias, ++ logit_softcap, prec, type_KV)); + // run fewer test cases permuted +- if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) { ++ if (mask == true && max_bias == 0.0f && logit_softcap == 0 && ++ kv == 512) { + test_cases.emplace_back(new test_flash_attn_ext( +- hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3})); ++ hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, ++ type_KV, { 0, 2, 1, 3 })); + } + } + } +@@ -4531,12 +4525,12 @@ static std::vector> make_test_cases_eval() { + } + } + +- test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, { 10, 5, 4, 3})); +- test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, {30000, 1, 1, 1})); +- test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, { 10, 5, 4, 3})); +- test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1})); ++ test_cases.emplace_back(new test_cross_entropy_loss(GGML_TYPE_F32, { 10, 5, 4, 3 })); ++ test_cases.emplace_back(new test_cross_entropy_loss(GGML_TYPE_F32, { 
30000, 1, 1, 1 })); ++ test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, { 10, 5, 4, 3 })); ++ test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, { 30000, 1, 1, 1 })); + +- test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); ++ test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, { 10, 5, 4, 3 })); + + // these tests are disabled to save execution time, but they can be handy for debugging + #if 0 +@@ -4553,58 +4547,77 @@ static std::vector> make_test_cases_eval() { + static std::vector> make_test_cases_perf() { + std::vector> test_cases; + +- test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); +- test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); ++ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, { 4096, 1, 1, 1 }, { 1, 1, 1, 1 })); ++ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, { 4096, 1, 1, 1 }, { 1, 512, 1, 1 })); + +- test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); +- test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); ++ test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, { 512, 3072, 1, 1 })); ++ test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, { 8192, 512, 2, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, { 3072, 512, 2, 1 }, { 0, 2, 1, 3 })); + +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); +- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 4096, 4096, 5, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 77, 4096, 5, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 1024, 1024, 10, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 77, 1024, 10, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 256, 256, 20, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 64, 64, 20, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); ++ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, { 77, 64, 20, 1 }, false, GGML_TYPE_F32, 1.0f, 0.0f)); + +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); +- test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); ++ 
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 32, 10, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 1024, 10, 1, 1 })); ++ test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 32000, 512, 1, 1 })); + +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); +- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true)); ++ test_cases.emplace_back( ++ new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, { 8, 1 }, { 4, 1 }, { 0, 2, 1, 3 })); ++ test_cases.emplace_back( ++ new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, { 8, 1 }, { 4, 1 }, { 0, 1, 2, 3 }, true)); + +- for (int bs : {1, 2, 3, 4, 5, 8, 512}) { ++ for (int bs : { 1, 2, 3, 4, 5, 8, 512 }) { + for (ggml_type type_a : all_types) { +- for (ggml_type type_b : {GGML_TYPE_F32}) { +- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1})); ++ for (ggml_type type_b : { GGML_TYPE_F32 }) { ++ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, { 1, 1 }, { 1, 1 })); + } + } + } + +- for (int K : {3, 5}) { +- for (int IC : {256, 2560}) { +- for (int IW_IH : {32, 64, 256}) { ++ for (int K : { 3, 5 }) { ++ for (int IC : { 256, 2560 }) { ++ for (int IW_IH : { 32, 64, 256 }) { + if (IC == 2560 && IW_IH == 256) { + // too big + continue; + } +- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, ++ { IW_IH, IW_IH, IC, 1 }, { K, K, IC, 1 }, 1, 1, 1, 1, 1, 1, ++ true)); + } + } + } + +- for (int kv : { 4096, 8192, 16384, }) { +- for (int hs : { 64, 128, }) { +- for (int nr : { 1, 4, }) { +- test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); ++ for (int kv : { ++ 4096, ++ 8192, ++ 16384, ++ }) { ++ for (int hs : { ++ 64, ++ 128, ++ }) { ++ for (int nr : { ++ 1, ++ 4, ++ }) { ++ test_cases.emplace_back( ++ new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + } + } + } + +- test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false)); +- test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 512, 512, 256, 1 }, { 3, 3, 1, 256 }, 1, 1, 1, false)); ++ test_cases.emplace_back(new test_conv_2d_dw({ 512, 512, 256, 1 }, { 3, 3, 1, 256 }, 1, 1, 1, true)); ++ ++ test_cases.emplace_back(new test_conv_transpose_2d({ 256, 256, 256, 1 }, { 3, 3, 16, 256 }, 1)); ++ ++ test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 256, 256, 3, 1 })); + + return test_cases; + } +@@ -4685,10 +4698,10 @@ static void usage(char ** argv) { + } + + int main(int argc, char ** argv) { +- test_mode mode = MODE_TEST; ++ test_mode mode = MODE_TEST; + const char * op_name_filter = nullptr; + const char * backend_filter = nullptr; +- const char * params_filter = nullptr; ++ const char * params_filter = nullptr; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "test") == 0) { +@@ -4752,14 +4765,15 @@ int main(int argc, char ** argv) { + GGML_ASSERT(backend != NULL); + + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); +- auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) 
ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); ++ auto ggml_backend_set_n_threads_fn = ++ (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + // TODO: better value for n_threads + ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); + } + + printf(" Device description: %s\n", ggml_backend_dev_description(dev)); +- size_t free, total; // NOLINT ++ size_t free, total; // NOLINT + ggml_backend_dev_memory(dev, &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + printf("\n"); diff --git a/ml/backend.go b/ml/backend.go index 2df6c892..61066c1a 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -253,6 +253,7 @@ type Tensor interface { Neg(ctx Context) Tensor Add(ctx Context, t2 Tensor) Tensor + Sub(ctx Context, t2 Tensor) Tensor Mul(ctx Context, t2 Tensor) Tensor Div(ctx Context, t2 Tensor) Tensor @@ -276,6 +277,7 @@ type Tensor interface { Tanh(ctx Context) Tensor GELU(ctx Context) Tensor SILU(ctx Context) Tensor + RELU(ctx Context) Tensor Sigmoid(ctx Context) Tensor Reshape(ctx Context, shape ...int) Tensor @@ -297,6 +299,12 @@ type Tensor interface { TopK(ctx Context, k int) Tensor Argsort(ctx Context) Tensor + Mean(ctx Context) Tensor + Variance(ctx Context) Tensor + Stddev(ctx Context) Tensor + Sqr(ctx Context) Tensor + Sqrt(ctx Context) Tensor + Clamp(ctx Context, min, max float32) Tensor } // ScaledDotProductAttention implements a fused attention diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 8aadad86..707b739c 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -297,7 +297,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks) } - case contains(t.Name, "cls", "output", "output_norm"): + case contains(t.Name, "cls", "output", "output_norm", + "altup_proj", "altup_unembd_proj", + "per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"): createTensor(tensor{source: t}, output.bts, blocks) case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."): // TODO: assign vision tensors to the gpu if possible @@ -893,6 +895,13 @@ func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor { } } +func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), + } +} + func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor { if dim < 0 || dim >= C.GGML_MAX_DIMS { panic("invalid dimension") @@ -1200,6 +1209,13 @@ func (t *Tensor) SILU(ctx ml.Context) ml.Tensor { } } +func (t *Tensor) RELU(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t), + } +} + func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor { return &Tensor{ b: t.b, @@ -1275,3 +1291,42 @@ func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor { t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC), } } + +func (t *Tensor) Mean(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_mean(ctx.(*Context).ctx, t.t), + } +} + +func (t *Tensor) Variance(ctx ml.Context) ml.Tensor { + return t.Add(ctx, t.Mean(ctx).Scale(ctx, -1)). + Sqr(ctx). + SumRows(ctx). 
+ Scale(ctx, 1/float64(t.Dim(0))) +} + +func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor { + return t.Variance(ctx).Sqrt(ctx) +} + +func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_sqr(ctx.(*Context).ctx, t.t), + } +} + +func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_sqrt(ctx.(*Context).ctx, t.t), + } +} + +func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)), + } +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 64fb4ff4..5b9a0fe3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #endif // FP16_AVAILABLE } +// Row reduction kernel template - compute sum (norm=false) or mean (norm=true) +template +static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col != 0) { + return; + } + + dst[row] = norm ? sum / ncols : sum; +} + template static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 4c829153..9e64e5ae 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -35,6 +35,7 @@ #include "ggml-cuda/ssm-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" +#include "ggml-cuda/mean.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" @@ -2322,6 +2323,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; + case GGML_OP_MEAN: + ggml_cuda_op_mean(ctx, dst); + break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); break; @@ -3211,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_POOL_2D: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGSORT: case GGML_OP_ACC: return true; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mean.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cu new file mode 100644 index 00000000..4b238a39 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cu @@ -0,0 +1,19 @@ +#include "mean.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + reduce_rows_f32<<>>(src0_d, dst_d, ncols); +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh new file mode 100644 index 00000000..2b9b1043 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh @@ 
-0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu index 38dbf1b5..2eee08fa 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu @@ -1,25 +1,9 @@ #include "sumrows.cuh" -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); + reduce_rows_f32<<>>(x, dst, ncols); } void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ncols = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + + reduce_rows_f32<<>>(src0_d, dst_d, ncols); } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh index 191db1c1..3431c599 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh @@ -1,5 +1,4 @@ #include "common.cuh" void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); - void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 3656c238..8f9a25e6 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -3434,31 +3434,61 @@ kernel void kernel_neg( dst[tpig] = -src0[tpig]; } +template kernel void kernel_sum_rows( + constant ggml_metal_kargs_sum_rows & args, device const float * src0, device float * dst, - constant ggml_metal_kargs_sum_rows & args, - uint3 tpig[[thread_position_in_grid]]) { - int64_t i3 = tpig.z; - int64_t i2 = tpig.y; - int64_t i1 = tpig.x; + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + int64_t i3 = tgpig.z; + int64_t i2 = tgpig.y; + int64_t i1 = tgpig.x; if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { return; } + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); device float * dst_row = (device float *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); - float row_sum = 0; + float sumf = 0; - for (int64_t i0 = 0; i0 < args.ne00; i0++) { - row_sum += src_row[i0]; + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + sumf += src_row[i0]; } - dst_row[0] = row_sum; + sumf = simd_sum(sumf); + + 
threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + if (tpitg.x == 0) { + dst_row[0] = norm ? sumf / args.ne00 : sumf; + } } +typedef decltype(kernel_sum_rows) kernel_sum_rows_t; + +template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows; +template [[host_name("kernel_mean")]] kernel kernel_sum_rows_t kernel_sum_rows; + template kernel void kernel_soft_max( device const char * src0, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index ee4f2dcb..f20f5615 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, + GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_ARGMAX, @@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); @@ -1634,6 +1636,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_LOG: return false; // TODO: implement case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_SOFT_MAX: case GGML_OP_GROUP_NORM: return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); @@ -2362,11 +2365,30 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: { GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + id pipeline = nil; + switch (dst->op) { + case GGML_OP_SUM_ROWS: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + break; + case GGML_OP_MEAN: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline; + break; + default: + GGML_ABORT("fatal error"); + } + + int nth = 32; // SIMD width + + while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + + nth = MIN(nth, ne00); ggml_metal_kargs_sum_rows args = { /*.ne00 =*/ ne00, @@ -2396,11 +2418,12 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&args length:sizeof(args) atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) 
threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 9cfddf45..08e8d807 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -956,31 +956,61 @@ kernel void kernel_neg( dst[tpig] = -src0[tpig]; } +template kernel void kernel_sum_rows( + constant ggml_metal_kargs_sum_rows & args, device const float * src0, device float * dst, - constant ggml_metal_kargs_sum_rows & args, - uint3 tpig[[thread_position_in_grid]]) { - int64_t i3 = tpig.z; - int64_t i2 = tpig.y; - int64_t i1 = tpig.x; + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + int64_t i3 = tgpig.z; + int64_t i2 = tgpig.y; + int64_t i1 = tgpig.x; if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { return; } + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); device float * dst_row = (device float *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); - float row_sum = 0; + float sumf = 0; - for (int64_t i0 = 0; i0 < args.ne00; i0++) { - row_sum += src_row[i0]; + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + sumf += src_row[i0]; } - dst_row[0] = row_sum; + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + if (tpitg.x == 0) { + dst_row[0] = norm ? sumf / args.ne00 : sumf; + } } +typedef decltype(kernel_sum_rows) kernel_sum_rows_t; + +template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows; +template [[host_name("kernel_mean")]] kernel kernel_sum_rows_t kernel_sum_rows; + template kernel void kernel_soft_max( device const char * src0, diff --git a/model/models/gemma3n/model.go b/model/models/gemma3n/model.go new file mode 100644 index 00000000..d210ab75 --- /dev/null +++ b/model/models/gemma3n/model.go @@ -0,0 +1,52 @@ +package gemma3n + +import ( + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" +) + +type Model struct { + model.Base + model.SentencePieceModel + + *TextModel +} + +// Forward implements model.Model. 
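+// The gemma3n graph in this package is text-only, so this simply delegates to
+// the embedded TextModel along with the wrapper KV cache configured in New.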
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + return m.TextModel.Forward(ctx, batch, m.Cache) +} + +func New(c fs.Config) (model.Model, error) { + m := Model{ + TextModel: newTextModel(c), + SentencePieceModel: model.NewSentencePieceModel( + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Scores: c.Floats("tokenizer.ggml.scores"), + Types: c.Ints("tokenizer.ggml.token_type"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), + }, + ), + } + + // TODO: setup hybrid (local sliding window + global) cache + m.Cache = kvcache.NewWrapperCache( + kvcache.NewCausalCache(m.Shift), + kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift), + ) + return &m, nil +} + +func init() { + model.Register("gemma3n", New) +} diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go new file mode 100644 index 00000000..715b8a0e --- /dev/null +++ b/model/models/gemma3n/model_text.go @@ -0,0 +1,360 @@ +package gemma3n + +import ( + "cmp" + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" + "github.com/ollama/ollama/model/input" +) + +type TextModel struct { + TokenEmbedding *TextScaledWordEmbedding `gguf:"token_embd"` + + *PerLayerProjector + + AltupEmbd *nn.Linear `gguf:"altup_proj"` + AltupUnembd *nn.Linear `gguf:"altup_unembd_proj"` + + TextLayers []TextLayer `gguf:"blk"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + TextOptions +} + +func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) { + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + // Create a tensor of a single float32 value of 1.0 to use for altup correction + one := ctx.Input().FromFloatSlice([]float32{1.0}, 1) + + inputs := m.TokenEmbedding.Forward(ctx, batch.Inputs, math.Sqrt(float64(m.hiddenSize))) + inputsPerLayer := m.PerLayerProjector.Forward(ctx, batch, inputs, &m.TextOptions) + + targetMagnitude := inputs.Sqr(ctx).Mean(ctx).Sqrt(ctx) + targetMagnitude = targetMagnitude.Repeat(ctx, 2, m.altupInputs-1) + + hiddenState := inputs.Repeat(ctx, 2, m.altupInputs-1) + altupProj := m.AltupEmbd.Forward(ctx, hiddenState) + altupProj = altupProj.Mul(ctx, targetMagnitude.Div(ctx, altupProj.Sqr(ctx).Mean(ctx).Sqrt(ctx))) + + hiddenStates := inputs.Concat(ctx, altupProj, 2) + + firstSharedKeyValue := m.hiddenLayers - m.sharedKeyValueLayers + for i, layer := range m.TextLayers { + if i < firstSharedKeyValue { + cache.SetLayer(i) + } else if m.isLocal(i) { + cache.SetLayer(firstSharedKeyValue - 2) + } else { + cache.SetLayer(firstSharedKeyValue - 1) + } + + var layerType int + ropeBase := m.ropeBase + if m.isLocal(i) { + layerType = 1 + ropeBase = m.ropeBaseLocal + } + + cache.(*kvcache.WrapperCache).SetLayerType(layerType) + + // inputPerLayer = inputsPerLayer[:, i, :] + inputPerLayer := inputsPerLayer.View(ctx, i*inputsPerLayer.Stride(1), inputsPerLayer.Dim(0), inputsPerLayer.Stride(2), inputsPerLayer.Dim(2)) + hiddenStates = layer.Forward(ctx, hiddenStates, inputPerLayer, positions, one, cache, i >= 
firstSharedKeyValue, ropeBase, float64(m.activationSparsityScale[i]), &m.TextOptions) + } + + // hiddenStates = hiddenStates[:, :, 0] + hiddenStates0 := hiddenStates.View(ctx, 0, hiddenStates.Dim(0), hiddenStates.Stride(1), hiddenStates.Dim(1)) + targetMagnitude = hiddenStates0.Sqr(ctx).Mean(ctx).Sqrt(ctx) + targetMagnitude = targetMagnitude.Repeat(ctx, 2, m.altupInputs-1) + + // hiddenState = hiddenStates[:, :, 1:] + hiddenState = hiddenStates.View(ctx, hiddenStates.Stride(2), hiddenStates.Dim(0), hiddenStates.Stride(1), hiddenStates.Dim(1), hiddenStates.Stride(2), m.altupInputs-1) + altupUnembdProj := m.AltupUnembd.Forward(ctx, hiddenState) + altupUnembdProj = altupUnembdProj.Mul(ctx, targetMagnitude.Div(ctx, altupUnembdProj.Sqr(ctx).Mean(ctx).Sqrt(ctx))) + + hiddenStates = hiddenStates0.Concat(ctx, altupUnembdProj, 2) + + hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx).Mean(ctx) + hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + hiddenStates = hiddenStates.Rows(ctx, ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))) + + hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps) + return m.Output.Forward(ctx, hiddenStates), nil +} + +func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { + ropeBase := m.ropeBase + if m.isLocal(layer) { + ropeBase = m.ropeBaseLocal + } + + return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil +} + +type TextScaledWordEmbedding struct { + *nn.Embedding +} + +func (e TextScaledWordEmbedding) Forward(ctx ml.Context, inputIDs ml.Tensor, scale float64) ml.Tensor { + return e.Embedding.Forward(ctx, inputIDs).Scale(ctx, scale) +} + +type PerLayerProjector struct { + TokenEmbedding *TextScaledWordEmbedding `gguf:"per_layer_token_embd"` + Projector *nn.Linear `gguf:"per_layer_model_proj"` + Norm *nn.RMSNorm `gguf:"per_layer_proj_norm"` +} + +func (p PerLayerProjector) Forward(ctx ml.Context, batch input.Batch, inputs ml.Tensor, opts *TextOptions) ml.Tensor { + inputsPerLayer := p.TokenEmbedding.Forward(ctx, batch.Inputs, math.Sqrt(float64(opts.hiddenSizePerLayerInput))) + inputsPerLayer = inputsPerLayer.Reshape(ctx, opts.hiddenSizePerLayerInput, opts.hiddenLayers, batch.Inputs.Dim(0), batch.Inputs.Dim(1)) + + perLayerProjection := p.Projector.Forward(ctx, inputs) + perLayerProjection = perLayerProjection.Scale(ctx, math.Sqrt(float64(opts.hiddenSize))) + perLayerProjection = perLayerProjection.Reshape(ctx, opts.hiddenSizePerLayerInput, opts.hiddenLayers, inputs.Dim(1)) + perLayerProjection = p.Norm.Forward(ctx, perLayerProjection, opts.eps) + + if inputsPerLayer != nil { + perLayerProjection = perLayerProjection.Add(ctx, inputsPerLayer) + perLayerProjection = perLayerProjection.Scale(ctx, 1/math.Sqrt(2)) + } + + return perLayerProjection +} + +type TextLayer struct { + *AltUp + *Laurel + + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + Attention *TextAttention + PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"` + + MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP *TextMLP + PostMLPNorm *nn.RMSNorm `gguf:"post_ffw_norm"` + + PerLayerInputGate *nn.Linear `gguf:"inp_gate"` + PerLayerProjection *nn.Linear `gguf:"proj"` + PostPerLayerNorm *nn.RMSNorm `gguf:"post_norm"` +} + +func (d TextLayer) Forward(ctx ml.Context, hiddenStates, perLayerInput, positions, one ml.Tensor, cache kvcache.Cache, sharedKV bool, ropeBase float32, activationSparsityScale float64, opts *TextOptions) ml.Tensor { + predictions := d.Predict(ctx, 
hiddenStates, opts) + active := opts.altupActive(ctx, predictions) + + attn := d.AttentionNorm.Forward(ctx, active, opts.eps) + laurel := d.Laurel.Forward(ctx, attn, opts) + + attn = d.Attention.Forward(ctx, attn, positions, cache, sharedKV, ropeBase, opts) + attn = d.PostAttentionNorm.Forward(ctx, attn, opts.eps) + attn = active.Add(ctx, attn) + attn = attn.Add(ctx, laurel).Scale(ctx, 1/math.Sqrt(2)) + + mlp := d.MLPNorm.Forward(ctx, attn, opts.eps) + mlp = d.MLP.Forward(ctx, mlp, activationSparsityScale) + mlp = d.PostMLPNorm.Forward(ctx, mlp, opts.eps) + mlp = attn.Add(ctx, mlp) + + predictions = d.Correct(ctx, predictions, mlp, one, opts) + active = opts.altupActive(ctx, predictions) + if opts.altupCorrectScale { + active = d.ScaleCorrectedOutput(ctx, active) + } + + active = d.PerLayerInputGate.Forward(ctx, active) + active = active.GELU(ctx) + active = active.Mul(ctx, perLayerInput) + + active = d.PerLayerProjection.Forward(ctx, active) + active = d.PostPerLayerNorm.Forward(ctx, active, opts.eps) + + // inactive := predictions[:, :, 1:] + inactive := predictions.View(ctx, predictions.Stride(2), predictions.Dim(0), predictions.Stride(1), predictions.Dim(1), predictions.Stride(2), predictions.Dim(2)-1) + active = inactive.Add(ctx, active) + + predictions0 := predictions.View(ctx, 0, predictions.Dim(0), predictions.Stride(1), predictions.Dim(1)) + return predictions0.Concat(ctx, active, 2) +} + +type AltUp struct { + CorrectionScale ml.Tensor `gguf:"altup_correct_scale.weight"` + PredictionCoefficient *nn.Linear `gguf:"altup_predict_coef"` + CorrectionCoefficient *nn.Linear `gguf:"altup_correct_coef"` + Router *nn.Linear `gguf:"altup_router"` + RouterNorm *nn.RMSNorm `gguf:"altup_router_norm"` +} + +func (a AltUp) computeRouterModalities(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor { + routerInputs := a.RouterNorm.Forward(ctx, hiddenStates, opts.eps).Scale(ctx, 1.0/float64(opts.hiddenSize)) + return a.Router.Forward(ctx, routerInputs).Tanh(ctx) +} + +func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor { + modalities := a.computeRouterModalities(ctx, opts.altupActive(ctx, hiddenStates), opts) + + coefficients := a.PredictionCoefficient.Forward(ctx, modalities) + coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2)) + + hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) + predictions := coefficients.Mulmat(ctx, hiddenStates) + predictions = predictions.Add(ctx, hiddenStates) + return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) +} + +func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor { + innovation := activated.Sub(ctx, opts.altupActive(ctx, predictions)) + innovation = innovation.Repeat(ctx, 2, opts.altupInputs) + + modalities := a.computeRouterModalities(ctx, activated, opts) + coefficients := a.CorrectionCoefficient.Forward(ctx, modalities) + coefficients = coefficients.Add(ctx, one) + + coefficients = coefficients.Reshape(ctx, 1, coefficients.Dim(0), coefficients.Dim(1)) + coefficients = coefficients.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + + corrected := innovation.Mul(ctx, coefficients) + corrected = corrected.Add(ctx, predictions) + return corrected +} + +func (a AltUp) ScaleCorrectedOutput(ctx ml.Context, predictions ml.Tensor) ml.Tensor { + return predictions.Mul(ctx, a.CorrectionScale) +} + +type Laurel struct { + LinearLeft *nn.Linear `gguf:"laurel_l"` + LinearRight 
*nn.Linear `gguf:"laurel_r"` + PostLaurelNorm *nn.RMSNorm `gguf:"laurel_post_norm"` +} + +func (l Laurel) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor { + residual := hiddenStates + hiddenStates = l.LinearLeft.Forward(ctx, hiddenStates) + hiddenStates = l.LinearRight.Forward(ctx, hiddenStates) + hiddenStates = l.PostLaurelNorm.Forward(ctx, hiddenStates, opts.eps) + return hiddenStates.Add(ctx, residual) +} + +type TextAttention struct { + Query *nn.Linear `gguf:"attn_q"` + QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` + Key *nn.Linear `gguf:"attn_k"` + KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_output"` +} + +func (attn TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, sharedKV bool, ropeBase float32, opts *TextOptions) ml.Tensor { + batchSize := hiddenStates.Dim(1) + + query := attn.Query.Forward(ctx, hiddenStates) + query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize) + query = attn.QueryNorm.Forward(ctx, query, opts.eps) + query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + + var key, value ml.Tensor + if !sharedKV { + key = attn.Key.Forward(ctx, hiddenStates) + key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) + key = attn.KeyNorm.Forward(ctx, key, opts.eps) + key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + + value = attn.Value.Forward(ctx, hiddenStates) + value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) + value = value.RMSNorm(ctx, nil, opts.eps) + } + + attention := nn.Attention(ctx, query, key, value, 1., cache) + attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize) + return attn.Output.Forward(ctx, attention) +} + +type TextMLP struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, activationSparsityScale float64) ml.Tensor { + upStates := mlp.Up.Forward(ctx, hiddenStates) + hiddenStates = mlp.Gate.Forward(ctx, hiddenStates) + if activationSparsityScale > 0 { + mean := hiddenStates.Mean(ctx) + std := hiddenStates.Stddev(ctx).Scale(ctx, activationSparsityScale) + cutoff := mean.Add(ctx, std) + hiddenStates = hiddenStates.Sub(ctx, cutoff).RELU(ctx) + } + + hiddenStates = hiddenStates.GELU(ctx).Mul(ctx, upStates) + hiddenStates = mlp.Down.Forward(ctx, hiddenStates) + return hiddenStates +} + +type TextOptions struct { + hiddenLayers int + hiddenSize int + hiddenSizePerLayerInput int + numHeads, numKVHeads int + keyLength, valueLength int + sharedKeyValueLayers int + + altupActiveIndex int + altupInputs int + altupCorrectScale bool + + eps float32 + ropeBase float32 + ropeBaseLocal float32 + ropeScale float32 + + slidingWindowPattern []bool + activationSparsityScale []float32 +} + +func (o *TextOptions) altupActive(ctx ml.Context, t ml.Tensor) ml.Tensor { + // t[:, :, o.altupActiveIndex] + return t.View(ctx, o.altupActiveIndex*t.Stride(2), t.Dim(0), t.Stride(1), t.Dim(1)) +} + +func (o *TextOptions) headDim() int { + return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) +} + +func (o *TextOptions) isLocal(i int) bool { + return o.slidingWindowPattern[i] +} + +func newTextModel(c fs.Config) *TextModel { + return &TextModel{ + TextLayers: make([]TextLayer, c.Uint("block_count")), + TextOptions: TextOptions{ + hiddenLayers: 
int(c.Uint("block_count")), + hiddenSize: int(c.Uint("embedding_length")), + hiddenSizePerLayerInput: int(c.Uint("embedding_length_per_layer_input")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + keyLength: int(c.Uint("attention.key_length")), + valueLength: int(c.Uint("attention.value_length")), + sharedKeyValueLayers: int(c.Uint("attention.shared_kv_layers")), + + altupActiveIndex: int(c.Uint("altup.active_idx")), + altupInputs: int(c.Uint("altup.num_inputs")), + + eps: c.Float("attention.layer_norm_rms_epsilon", 1e-06), + ropeBase: c.Float("rope.freq_base", 1_000_000), + ropeBaseLocal: c.Float("rope.freq_base_local", 10_000), + ropeScale: c.Float("rope.freq_scale", 1.0), + + slidingWindowPattern: c.Bools("attention.sliding_window_pattern"), + activationSparsityScale: c.Floats("activation_sparsity_scale"), + }, + } +} diff --git a/model/models/models.go b/model/models/models.go index 5471ce89..8752878e 100644 --- a/model/models/models.go +++ b/model/models/models.go @@ -3,6 +3,7 @@ package models import ( _ "github.com/ollama/ollama/model/models/gemma2" _ "github.com/ollama/ollama/model/models/gemma3" + _ "github.com/ollama/ollama/model/models/gemma3n" _ "github.com/ollama/ollama/model/models/llama" _ "github.com/ollama/ollama/model/models/llama4" _ "github.com/ollama/ollama/model/models/mistral3" From 3944602f512b840953be00b6d028a0ab08b9b5ab Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 26 Jun 2025 00:11:23 -0700 Subject: [PATCH 18/24] fs/ggml: add missing architecture to OllamaEngineRequired() (#11206) --- fs/ggml/ggml.go | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index a0c2003f..c11b3385 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -174,6 +174,7 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool { func (kv KV) OllamaEngineRequired() bool { return slices.Contains([]string{ "gemma3", + "gemma3n", "mistral3", "llama4", "mllama", From ba04902670cd5945ded682c1c9de2220475b9c38 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 26 Jun 2025 00:19:44 -0700 Subject: [PATCH 19/24] fs/ggml: add multiplier in graph estimates (#11208) --- fs/ggml/ggml.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index c11b3385..32f459a3 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -555,7 +555,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri // vocab graph 4*batch*(embedding+vocab)+embedding*vocab*105/128, ) - case "gemma", "gemma2", "gemma3": + case "gemma", "gemma2", "gemma3", "gemma3n": fullOffload = max( 4*batch*(embedding+vocab), 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads), @@ -568,6 +568,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri embedding*embeddingHeadsK*heads*9/16, ) + if f.KV().Architecture() == "gemma3n" { + fullOffload *= 4 + partialOffload *= 4 + } + // Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama // engine. Gemma3 always uses the Ollama engine. 
if f.KV().Architecture() == "gemma3" { From 11ffc36157029a73dab45ed11e2d2281dc2b58f9 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 26 Jun 2025 10:32:48 -0700 Subject: [PATCH 20/24] ci: multi-stage release process (#11001) --- .github/workflows/release.yaml | 149 +++------------------------------ 1 file changed, 11 insertions(+), 138 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5178eb5f..1f0cc273 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -54,48 +54,6 @@ jobs: name: build-${{ matrix.os }}-${{ matrix.arch }} path: dist/* - darwin-sign: - runs-on: macos-13 - environment: release - needs: darwin-build - steps: - - uses: actions/checkout@v4 - - run: | - echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12 - security create-keychain -p password build.keychain - security default-keychain -s build.keychain - security unlock-keychain -p password build.keychain - security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign - security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain - security set-keychain-settings -lut 3600 build.keychain - env: - MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }} - MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }} - - uses: actions/download-artifact@v4 - with: - name: build-darwin-amd64 - path: dist/darwin-amd64 - - uses: actions/download-artifact@v4 - with: - name: build-darwin-arm64 - path: dist/darwin-arm64 - - run: | - export VERSION=${GITHUB_REF_NAME#v} - ./scripts/build_darwin.sh sign macapp - env: - APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }} - APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }} - APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }} - APPLE_ID: ${{ vars.APPLE_ID }} - SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk - DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer - - uses: actions/upload-artifact@v4 - with: - name: dist-darwin - path: | - dist/Ollama-darwin.zip - dist/ollama-darwin.tgz - windows-depends: strategy: matrix: @@ -230,61 +188,11 @@ jobs: go-version-file: go.mod - run: | go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ . 
- - if: matrix.arch == 'arm64' - run: | - Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vc_redist.arm64.exe" -OutFile "dist\windows-arm64\vc_redist.arm64.exe" - - run: | - $env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1' - & .\scripts\build_windows.ps1 buildApp - env: - VCToolsRedistDir: stub - uses: actions/upload-artifact@v4 with: name: build-${{ matrix.os }}-${{ matrix.arch }} path: | dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe - dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe - - windows-sign: - runs-on: windows - environment: release - needs: [windows-depends, windows-build] - steps: - - uses: actions/checkout@v4 - - uses: google-github-actions/auth@v2 - with: - project_id: ollama - credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }} - - run: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe" - Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip" - Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\" - & "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet - - echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt - - uses: actions/download-artifact@v4 - with: - pattern: build-windows-* - path: dist\ - merge-multiple: true - - uses: actions/download-artifact@v4 - with: - pattern: depends-windows-amd64-* - path: dist\windows-amd64\ - merge-multiple: true - - run: | - & .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip - env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - - uses: actions/upload-artifact@v4 - with: - name: dist-windows - path: | - dist\OllamaSetup.exe - dist\ollama-windows-*.zip linux-build: strategy: @@ -441,58 +349,16 @@ jobs: trigger: runs-on: ubuntu-latest environment: release - needs: [darwin-build, windows-build, windows-depends] - steps: - - name: Trigger downstream release process - run: | - curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \ - -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}" - - # Aggregate all the assets and ship a release - release: - needs: [darwin-sign, windows-sign, linux-build] - runs-on: linux - environment: release + needs: [darwin-build, windows-build, windows-depends, linux-build] permissions: contents: write env: GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - name: dist-darwin - path: dist - - uses: actions/download-artifact@v4 - with: - name: dist-windows - path: dist - - uses: actions/download-artifact@v4 - with: - pattern: dist-linux-* - path: stage - merge-multiple: false - - name: Merge linux amd64 payload - working-directory: stage/dist-linux-amd64-archive - run: | - tar zxf ollama-linux-amd64.tgz - tar zxf ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz - rm -f ollama-linux-amd64.tgz ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz - tar -c -f- --owner 0 --group 0 . 
| pigz -9vc > ../ollama-linux-amd64.tgz - - name: Cleanup linux payloads - run: | - find stage -name ollama-linux\*.tgz -exec mv {} dist/ \; - - run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt - working-directory: dist - - name: Create or update Release + - name: Create or update Release for tag run: | RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" - echo "Looking for existing release for ${RELEASE_VERSION}" OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName") if [ -n "$OLD_TAG" ]; then @@ -506,5 +372,12 @@ jobs: --generate-notes \ --prerelease fi - echo "Uploading artifacts for tag ${GITHUB_REF_NAME}" - gh release upload ${GITHUB_REF_NAME} dist/* --clobber + - name: Trigger downstream release process + run: | + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \ + -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"publish\": \"1\"}}" From d0b32def60b413407ddf4b4b063ba105a1ef2f92 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 26 Jun 2025 21:49:35 -0700 Subject: [PATCH 21/24] skip quantizing per_layer_token_embd (#11207) this tensor isn't compatible with cuda when quantized to q4_K so skip it --- server/quantization.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/quantization.go b/server/quantization.go index e57e8a4d..10175a35 100644 --- a/server/quantization.go +++ b/server/quantization.go @@ -231,6 +231,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil // do not quantize relative position bias (T5) quantize = quantize && !strings.Contains(name, "attn_rel_b.weight") + quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight") + newType := fsggml.TensorType(t.Kind) if quantize { // get more optimal quantization type based on the tensor shape, layer, etc. From 45f216a9c7e65bd30ab0e2b1b9fdb7cb2ad9436d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 27 Jun 2025 11:11:49 -0700 Subject: [PATCH 22/24] ggml: Temporarily disable reporting UUIDs This is causing segfaults, so disable it. Currently UUIDs are only used for debugging purposes, although they planned to be used in additional ways in the future. 
Bug #11211
---
 ml/backend/ggml/ggml.go | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 707b739c..43104092 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -138,7 +138,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-	requiredMemory.CPU.UUID = C.GoString(props.uuid)
+
+	// Bug #11211: Reporting of UUIDs is temporarily disabled due to causing segfaults
+	// This only affects debug information until the new memory management code is in place
+	// requiredMemory.CPU.UUID = C.GoString(props.uuid)
 
 	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
 	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
@@ -155,7 +158,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
-		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
+		// requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
 		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

From 4129af9205763a113719c7ef102d5c6ff0f1e2e8 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 27 Jun 2025 11:45:33 -0700
Subject: [PATCH 23/24] chore: cleanup comments + unused vars (#11225)

---
 convert/convert_gemma3n.go    | 3 ---
 model/models/gemma3n/model.go | 1 -
 2 files changed, 4 deletions(-)

diff --git a/convert/convert_gemma3n.go b/convert/convert_gemma3n.go
index bf667e38..135ebaa5 100644
--- a/convert/convert_gemma3n.go
+++ b/convert/convert_gemma3n.go
@@ -24,7 +24,6 @@ type gemma3nModel struct {
 		HiddenSize              uint32 `json:"hidden_size"`
 		HiddenSizePerLayerInput uint32 `json:"hidden_size_per_layer_input"`
 		IntermediateSize        uint32 `json:"intermediate_size"`
-		LaurelRank              uint32 `json:"laurel_rank"`
 		MaxPositionEmbeddings   uint32 `json:"max_position_embeddings"`
 		NumAttentionHeads       uint32 `json:"num_attention_heads"`
 		NumHiddenLayers         uint32 `json:"num_hidden_layers"`
@@ -72,8 +71,6 @@ func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
 	kv["gemma3n.embedding_length"] = m.TextModel.HiddenSize
 	kv["gemma3n.feed_forward_length"] = m.TextModel.IntermediateSize
 	kv["gemma3n.head_dim"] = m.TextModel.HeadDim
-	kv["gemma3n.laurel_rank"] = m.TextModel.LaurelRank
-	kv["gemma3n.num_kv_shared_layers"] = m.TextModel.NumKVSharedLayers
 	kv["gemma3n.rope.freq_base_local"] = m.TextModel.RopeLocalBaseFreq
 	kv["gemma3n.rope.freq_base"] = m.TextModel.RopeTheta
 	return kv
diff --git a/model/models/gemma3n/model.go b/model/models/gemma3n/model.go
index d210ab75..6e83a972 100644
--- a/model/models/gemma3n/model.go
+++ b/model/models/gemma3n/model.go
@@ -39,7 +39,6 @@ func New(c fs.Config) (model.Model, error) {
 		),
 	}
 
-	// TODO: setup hybrid (local sliding window + global) cache
 	m.Cache = kvcache.NewWrapperCache(
 		kvcache.NewCausalCache(m.Shift),
 		kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),

From 3b8b692218bf0da28859c89b71f4f5731f29002c Mon Sep 17 00:00:00 2001
From: Attogram Project
Date: Sun, 29 Jun 2025 23:59:54 +0200
Subject: [PATCH 24/24] readme: add ollama-bash-toolshed to community integrations (#11224)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 366fe94b..8f9528f3 100644
--- a/README.md
+++ b/README.md
@@ -455,6 +455,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
 - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
+- [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.
 
 ### Apple Vision Pro
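
For readers following the gemma3n text-model patch above: the activation-sparsity gating in TextMLP.Forward (subtract mean + activationSparsityScale*stddev from the gate projection, clamp at zero with ReLU, then apply GELU and multiply by the up projection) is easier to see on a plain slice than through the tensor ops. Below is a minimal Go sketch, not part of any patch; the helper names, the population standard deviation, and the tanh GELU approximation are illustrative assumptions rather than the exact kernels ggml uses.

// sparsity_sketch.go — standalone illustration; not part of the Ollama codebase.
package main

import (
	"fmt"
	"math"
)

// sparseGate zeroes every gate activation below mean + scale*stddev and shifts the
// rest down by that cutoff, mirroring hiddenStates.Sub(ctx, cutoff).RELU(ctx) in
// the patched TextMLP.Forward. The population stddev here is an assumption.
func sparseGate(gate []float32, scale float64) []float32 {
	var mean float64
	for _, v := range gate {
		mean += float64(v)
	}
	mean /= float64(len(gate))

	var variance float64
	for _, v := range gate {
		d := float64(v) - mean
		variance += d * d
	}
	variance /= float64(len(gate))

	cutoff := mean + scale*math.Sqrt(variance)
	out := make([]float32, len(gate))
	for i, v := range gate {
		if shifted := float64(v) - cutoff; shifted > 0 {
			out[i] = float32(shifted)
		}
	}
	return out
}

// gelu is the common tanh approximation of GELU.
func gelu(x float32) float32 {
	v := float64(x)
	return float32(0.5 * v * (1 + math.Tanh(math.Sqrt(2/math.Pi)*(v+0.044715*v*v*v))))
}

func main() {
	gate := []float32{-0.5, 0.1, 0.4, 1.2, 2.0} // gate projection (illustrative values)
	up := []float32{1, 1, 1, 1, 1}              // up projection (illustrative values)

	gated := sparseGate(gate, 1.0) // 1.0 plays the role of activationSparsityScale
	for i := range gated {
		gated[i] = gelu(gated[i]) * up[i] // GELU(gate) * up, as in TextMLP.Forward
	}
	fmt.Println(gated)
}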