Merge branch 'ollama:main' into main

2025-12-23 15:08:27 +00:00 · 2024-06-07 17:25:53 +08:00
parent 71ae05239e ce0dc33cb8
commit a6390a8992
86 changed files with 1232 additions and 344 deletions
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -81,6 +81,11 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }

+func (kv KV) ChatTemplate() string { // ChatTemplate returns the string stored under the "tokenizer.chat_template" key.
+	s, _ := kv["tokenizer.chat_template"].(string) // ok is discarded on purpose: a missing key or non-string value leaves s as ""
+	return s
+}
+
 type Tensors []*Tensor

 func (ts Tensors) Layers() map[string]Layer {
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -592,8 +592,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 			return err
 		}

-		dims := 0
-		for cnt := 0; cnt < len(tensor.Shape); cnt++ {
+		var dims int
+		for cnt := range len(tensor.Shape) {
 			if tensor.Shape[cnt] > 0 {
 				dims++
 			}
@@ -603,8 +603,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 			return err
 		}

-		for i := 0; i < dims; i++ {
-			if err := binary.Write(ws, llm.ByteOrder, uint64(tensor.Shape[dims-1-i])); err != nil {
+		for i := range dims {
+			if err := binary.Write(ws, llm.ByteOrder, tensor.Shape[dims-1-i]); err != nil {
 				return err
 			}
 		}
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -5,9 +5,9 @@ import (
 	"log/slog"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
-	"github.com/ollama/ollama/envconfig"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -103,7 +103,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}

 	var layerCount int
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+	for i := range int(ggml.KV().BlockCount()) {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			memoryLayer := blk.size()

--- a/llm/patches/06-qwen2.diff
+++ b/llm/patches/06-qwen2.diff
@@ -0,0 +1,13 @@
+diff --git a/llama.cpp b/llama.cpp
+index 40d2ec2c..f34eb79a 100644
+--- a/llama.cpp
+++ b/llama.cpp
+@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
+         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+         cb(kq, "kq", il);
+ 
+-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
++        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -10,9 +10,9 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"slices"
 	"strings"

-	"golang.org/x/exp/slices"
 	"golang.org/x/sync/errgroup"

 	"github.com/ollama/ollama/gpu"
--- a/llm/server.go
+++ b/llm/server.go
@@ -85,7 +85,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemMemory uint64
 	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner

 		cpuRunner = serverForCpu()
@@ -104,21 +103,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		var layers int
 		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

-		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
+		switch {
+		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		} else if gpus[0].Library != "metal" && layers == 0 {
+		case gpus[0].Library != "metal" && layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			gpuCount = 0
-		} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
+		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = layers
 		}
 	}

 	// Loop through potential servers
-	finalErr := fmt.Errorf("no suitable llama servers found")
+	finalErr := errors.New("no suitable llama servers found")

 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
@@ -232,7 +232,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

-	for i := 0; i < len(servers); i++ {
+	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
 			// Shouldn't happen
@@ -284,7 +284,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 		server := filepath.Join(dir, "ollama_llama_server")
 		if runtime.GOOS == "windows" {
-			server = server + ".exe"
+			server += ".exe"
 		}

 		// Detect tmp cleaners wiping out the file
@@ -315,7 +315,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status

-		visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
+		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
 		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

 		// Update or add the path and visible devices variable with our adjusted version
@@ -459,7 +459,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		if errors.Is(err, context.DeadlineExceeded) {
-			return ServerStatusNotResponding, fmt.Errorf("server not responding")
+			return ServerStatusNotResponding, errors.New("server not responding")
 		}
 		return ServerStatusError, fmt.Errorf("health resp: %w", err)
 	}