Merge branch 'ollama:main' into main

likelovewant
2024-05-24 15:19:08 +08:00
committed by GitHub
41 changed files with 1030 additions and 614 deletions

View File

@@ -334,6 +334,7 @@ struct server_metrics {
struct llama_server_context
{
llama_model *model = nullptr;
float modelProgress = 0.0;
llama_context *ctx = nullptr;
clip_ctx *clp_ctx = nullptr;
@@ -737,7 +738,7 @@ struct llama_server_context
sampler_names.emplace_back(sampler_name);
}
}
- slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+ slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
}
else
{
@@ -1095,7 +1096,7 @@ struct llama_server_context
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
{
- samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+ samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
}
return json {
@@ -2104,6 +2105,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n");
@@ -2501,7 +2503,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.use_mmap = false;
}
- else if (arg == "--numa") {
+ else if (arg == "--numa")
+ {
if (++i >= argc) {
invalid_param = true;
break;
@@ -2521,6 +2524,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.cont_batching = true;
}
else if (arg == "-fa" || arg == "--flash-attn")
{
params.flash_attn = true;
}
else if (arg == "-np" || arg == "--parallel")
{
if (++i >= argc)
@@ -2529,7 +2536,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict")
+ }
+ else if (arg == "-n" || arg == "--n-predict")
{
if (++i >= argc)
{
@@ -2537,7 +2545,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file")
+ }
+ else if (arg == "-spf" || arg == "--system-prompt-file")
{
if (++i >= argc)
{
@@ -2771,6 +2780,12 @@ inline void signal_handler(int signal) {
shutdown_handler(signal);
}
static bool update_load_progress(float progress, void *data)
{
((llama_server_context*)data)->modelProgress = progress;
return true;
}
#if defined(_WIN32)
char* wchar_to_char(const wchar_t* wstr) {
if (wstr == nullptr) return nullptr;
@@ -2876,7 +2891,9 @@ int main(int argc, char **argv) {
break;
}
case SERVER_STATE_LOADING_MODEL:
- res.set_content(R"({"status": "loading model"})", "application/json");
+ char buf[128];
+ snprintf(&buf[0], 128, R"({"status": "loading model", "progress": %0.2f})", llama.modelProgress);
+ res.set_content(buf, "application/json");
res.status = 503; // HTTP Service Unavailable
break;
case SERVER_STATE_ERROR:
@@ -3071,6 +3088,9 @@ int main(int argc, char **argv) {
});
// load the model
params.progress_callback = update_load_progress;
params.progress_callback_user_data = (void*)&llama;
if (!llama.load_model(params))
{
state.store(SERVER_STATE_ERROR);
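Illustration (not part of this commit's diff): while the model is loading, the /health endpoint now answers with HTTP 503 and the JSON built by the snprintf call above. A minimal standalone Go sketch that decodes such a payload; the healthResponse struct name is invented for the example.

package main

import (
	"encoding/json"
	"fmt"
)

// healthResponse mirrors the JSON the loading state writes:
// {"status": "loading model", "progress": 0.42}
type healthResponse struct {
	Status   string  `json:"status"`
	Progress float32 `json:"progress"`
}

func main() {
	body := []byte(`{"status": "loading model", "progress": 0.42}`)

	var hr healthResponse
	if err := json.Unmarshal(body, &hr); err != nil {
		panic(err)
	}
	fmt.Printf("status=%q progress=%.2f\n", hr.Status, hr.Progress)
}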

View File

@@ -27,8 +27,16 @@ const (
fileTypeIQ2_XXS
fileTypeIQ2_XS
fileTypeQ2_K_S
fileTypeQ3_K_XS
fileTypeIQ3_XS
fileTypeIQ3_XXS
fileTypeIQ1_S
fileTypeIQ4_NL
fileTypeIQ3_S
fileTypeIQ2_S
fileTypeIQ4_XS
fileTypeIQ2_M
fileTypeIQ1_M
fileTypeBF16
fileTypeUnknown
)
@@ -75,10 +83,26 @@ func ParseFileType(s string) (fileType, error) {
return fileTypeIQ2_XS, nil
case "Q2_K_S":
return fileTypeQ2_K_S, nil
case "Q3_K_XS":
return fileTypeQ3_K_XS, nil
case "IQ3_XS":
return fileTypeIQ3_XS, nil
case "IQ3_XXS":
return fileTypeIQ3_XXS, nil
case "IQ1_S":
return fileTypeIQ1_S, nil
case "IQ4_NL":
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ4_XS":
return fileTypeIQ4_XS, nil
case "IQ2_M":
return fileTypeIQ2_M, nil
case "IQ1_M":
return fileTypeIQ1_M, nil
case "BF16":
return fileTypeBF16, nil
default:
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
}
@@ -126,10 +150,26 @@ func (t fileType) String() string {
return "IQ2_XS"
case fileTypeQ2_K_S:
return "Q2_K_S"
case fileTypeQ3_K_XS:
return "Q3_K_XS"
case fileTypeIQ3_XS:
return "IQ3_XS"
case fileTypeIQ3_XXS:
return "IQ3_XXS"
case fileTypeIQ1_S:
return "IQ1_S"
case fileTypeIQ4_NL:
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:
return "IQ4_XS"
case fileTypeIQ2_M:
return "IQ2_M"
case fileTypeIQ1_M:
return "IQ1_M"
case fileTypeBF16:
return "BF16"
default:
return "unknown"
}
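Illustration (not part of this commit's diff): the ParseFileType/String pair above is a plain iota enum with a switch in each direction; the real constants are unexported inside the llm package. A reduced, standalone Go sketch covering two of the quantization names handled above.

package main

import "fmt"

type fileType uint32

const (
	fileTypeIQ4_XS fileType = iota
	fileTypeBF16
	fileTypeUnknown
)

func ParseFileType(s string) (fileType, error) {
	switch s {
	case "IQ4_XS":
		return fileTypeIQ4_XS, nil
	case "BF16":
		return fileTypeBF16, nil
	default:
		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
	}
}

func (t fileType) String() string {
	switch t {
	case fileTypeIQ4_XS:
		return "IQ4_XS"
	case fileTypeBF16:
		return "BF16"
	default:
		return "unknown"
	}
}

func main() {
	ft, err := ParseFileType("IQ4_XS")
	if err != nil {
		panic(err)
	}
	fmt.Println(ft) // round-trips back to "IQ4_XS" via String()
}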

View File

@@ -119,7 +119,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
t.Offset = uint64(offset)
- if _, err := rs.Seek(int64(t.size()), io.SeekCurrent); err != nil {
+ if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
return err
}

View File

@@ -106,7 +106,7 @@ type Layer map[string]*Tensor
func (l Layer) size() (size uint64) {
for _, t := range l {
- size += t.size()
+ size += t.Size()
}
return size
@@ -124,12 +124,12 @@ type Tensor struct {
}
func (t Tensor) blockSize() uint64 {
- switch {
- case t.Kind < 2:
+ switch t.Kind {
+ case 0, 1, 24, 25, 26, 27, 28, 31: // F32, F16, I8, I16, I32, I64, F64, BF16
return 1
- case t.Kind < 10:
+ case 2, 3, 8, 9, 20: // Q4_0, Q4_1, Q8_0, Q8_1, IQ4_NL
return 32
- default:
+ default: // All others
return 256
}
}
@@ -171,7 +171,29 @@ func (t Tensor) typeSize() uint64 {
case 17: // IQ2_XS
return 2 + 2*blockSize/8 + blockSize/32
case 18: // IQ3_XXS
- return 2 + 3*blockSize/8
+ return 2 + blockSize/4 + blockSize/8
case 19: // IQ1_S
return 2 + blockSize/8 + blockSize/16
case 20: // IQ4_NL
return 2 + blockSize/2
case 21: // IQ3_S
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
case 22: // IQ2_S
return 2 + blockSize/4 + blockSize/16
case 23: // IQ4_XS
return 2 + 2 + blockSize/2 + blockSize/64
case 24: // I8
return 1
case 25: // I16
return 2
case 26: // I32
return 4
case 27: // I64
return 8
case 28: // F64
return 8
case 29: // IQ1_M
return blockSize/8 + blockSize/16 + blockSize/32
default:
return 0
}
@@ -185,7 +207,7 @@ func (t Tensor) parameters() uint64 {
return count
}
- func (t Tensor) size() uint64 {
+ func (t Tensor) Size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
}
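Illustration (not part of this commit's diff): Size() is parameters * typeSize / blockSize. Taking IQ4_XS from the switches above, blockSize is 256 and typeSize is 2 + 2 + 256/2 + 256/64 = 136 bytes per block. A standalone Go sketch with a hypothetical 4096x4096 tensor:

package main

import "fmt"

func main() {
	// IQ4_XS, per the switches above.
	const blockSize = 256
	const typeSize = 2 + 2 + 256/2 + 256/64 // 136 bytes per block of 256 weights

	// Hypothetical 4096x4096 weight tensor.
	parameters := uint64(4096 * 4096)

	// Same formula as Tensor.Size().
	size := parameters * typeSize / blockSize
	fmt.Printf("IQ4_XS tensor, %d params: %d bytes (%.2f MiB)\n", parameters, size, float64(size)/(1<<20))
}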
@@ -288,7 +310,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
- 3*ffnGateExpsWeight.size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
+ 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {

View File

@@ -62,16 +62,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
return model, nil
}
- const (
- _ uint32 = iota
- GGUFTokenNormal
- GGUFTokenUnknown
- GGUFTokenControl
- GGUFTokenUserDefined
- GGUFTokenUnused
- GGUFTokenByte
- )
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
@@ -251,11 +241,11 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
for _, tensor := range llm.tensors {
- if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
+ if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return err
}
- padding := llm.padding(int64(tensor.size()), int64(alignment))
+ padding := llm.padding(int64(tensor.Size()), int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
@@ -480,9 +470,11 @@ var ggufKVOrder = map[string][]string{
"gemma.attention.key_length",
"gemma.attention.value_length",
"general.file_type",
"tokenizer.ggml.pre",
"tokenizer.ggml.model",
"tokenizer.ggml.tokens",
"tokenizer.ggml.scores",
"tokenizer.ggml.merges",
"tokenizer.ggml.token_type",
"tokenizer.ggml.bos_token_id",
"tokenizer.ggml.eos_token_id",

View File

@@ -0,0 +1,31 @@
diff --git a/common/common.cpp b/common/common.cpp
index ba1ecf0e..cead57cc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
+ mparams.progress_callback = params.progress_callback;
+ mparams.progress_callback_user_data = params.progress_callback_user_data;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index d80344f2..71e84834 100644
--- a/common/common.h
+++ b/common/common.h
@@ -174,6 +174,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
};
void gpt_params_handle_model_default(gpt_params & params);

View File

@@ -1,8 +1,17 @@
From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 23 May 2024 11:18:45 -0700
Subject: [PATCH] throw exception on load errors
---
llama.cpp | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
index 15c66077..8ba90b6a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -11,10 +20,10 @@ index 4225f955..7b762f86 100644
}
return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
};
@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file(
}
model->rpc_servers.push_back(servers);
}
- int status = llama_model_load(path_model, *model, params);
- GGML_ASSERT(status <= 0);
- if (status < 0) {
@@ -22,6 +31,7 @@ index 4225f955..7b762f86 100644
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
- } else if (status == -2) {
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+
+ try {
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
@@ -42,3 +52,6 @@ index 4225f955..7b762f86 100644
}
return model;
--
2.45.1

View File

@@ -0,0 +1,35 @@
From d02a06f3f45a09255ace8684a66590e06ce44605 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 23 May 2024 11:33:20 -0700
Subject: [PATCH] default pretokenizer on unrecognized type
---
llama.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 15c66077..af1aede3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4504,9 +4504,6 @@ static void llm_load_vocab(
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
LLAMA_LOG_WARN("%s: \n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "default") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
@@ -4553,7 +4550,7 @@ static void llm_load_vocab(
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--
2.45.1

View File

@@ -55,6 +55,7 @@ type llmServer struct {
totalLayers uint64
gpuCount int
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32
sem *semaphore.Weighted
}
@@ -200,6 +201,23 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
flashAttnEnabled := envconfig.FlashAttention
// partial offloading does not support flash attention
if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
flashAttnEnabled = false
}
// only cuda (compute capability 7+) and metal support flash attention
for _, g := range gpus {
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
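Illustration (not part of this commit's diff): the gating above enables --flash-attn only when the model is fully offloaded and every GPU is either Metal or a CUDA device whose reported major version is at least 7. A standalone Go sketch of that predicate; gpuInfo and flashAttnSupported are invented stand-ins for the gpu package types.

package main

import "fmt"

// gpuInfo is a pared-down stand-in for gpu.GpuInfo with only the fields the check reads.
type gpuInfo struct {
	Library     string
	DriverMajor int
}

// flashAttnSupported mirrors the gating above.
func flashAttnSupported(requested bool, numGPU, blockCount uint64, gpus []gpuInfo) bool {
	enabled := requested
	// partial offloading does not support flash attention
	if numGPU < blockCount+1 {
		enabled = false
	}
	// only metal, or cuda with a new enough major version, keep it enabled
	for _, g := range gpus {
		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
			enabled = false
		}
	}
	return enabled
}

func main() {
	cuda := []gpuInfo{{Library: "cuda", DriverMajor: 8}}
	fmt.Println(flashAttnSupported(true, 33, 32, cuda)) // true: fully offloaded
	fmt.Println(flashAttnSupported(true, 16, 32, cuda)) // false: partial offload
}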
@@ -408,10 +426,11 @@ func (s ServerStatus) ToString() string {
}
type ServerStatusResp struct {
- Status string `json:"status"`
- SlotsIdle int `json:"slots_idle"`
- SlotsProcessing int `json:"slots_processing"`
- Error string `json:"error"`
+ Status string `json:"status"`
+ SlotsIdle int `json:"slots_idle"`
+ SlotsProcessing int `json:"slots_processing"`
+ Error string `json:"error"`
+ Progress float32 `json:"progress"`
}
func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
@@ -459,6 +478,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
case "no slot available":
return ServerStatusNoSlotsAvailable, nil
case "loading model":
s.loadProgress = status.Progress
return ServerStatusLoadingModel, nil
default:
return ServerStatusError, fmt.Errorf("server error: %+v", status)
@@ -499,7 +519,8 @@ func (s *llmServer) Ping(ctx context.Context) error {
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
start := time.Now()
- expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
+ stallDuration := 60 * time.Second
+ stallTimer := time.Now().Add(stallDuration) // give up if we stall for
slog.Info("waiting for llama runner to start responding")
var lastStatus ServerStatus = -1
@@ -517,13 +538,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
default:
}
- if time.Now().After(expiresAt) {
+ if time.Now().After(stallTimer) {
// timeout
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
return fmt.Errorf("timed out waiting for llama runner to start - progress %0.2f - %s", s.loadProgress, msg)
}
if s.cmd.ProcessState != nil {
msg := ""
@@ -534,6 +555,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
defer cancel()
priorProgress := s.loadProgress
status, _ := s.getServerStatus(ctx)
if lastStatus != status && status != ServerStatusReady {
// Only log on status changes
@@ -546,6 +568,11 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
return nil
default:
lastStatus = status
// Reset the timer as long as we're making forward progress on the load
if priorProgress != s.loadProgress {
slog.Debug(fmt.Sprintf("model load progress %0.2f", s.loadProgress))
stallTimer = time.Now().Add(stallDuration)
}
time.Sleep(time.Millisecond * 250)
continue
}
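Illustration (not part of this commit's diff): the change above replaces the fixed 10-minute deadline with a stall timer that is pushed forward whenever the reported load progress moves, so a slow but progressing load is not killed. A standalone Go sketch of that pattern with a simulated progress source and a shortened deadline:

package main

import (
	"fmt"
	"time"
)

func main() {
	const stallDuration = 2 * time.Second // shortened for the demo
	stallTimer := time.Now().Add(stallDuration)

	var priorProgress, progress float32
	for {
		if time.Now().After(stallTimer) {
			fmt.Printf("timed out - progress %0.2f\n", progress)
			return
		}

		// Simulated progress source; the real code reads s.loadProgress,
		// updated from the "loading model" health responses.
		if progress < 1.0 {
			progress += 0.25
		}

		// Reset the deadline whenever forward progress is observed.
		if priorProgress != progress {
			fmt.Printf("model load progress %0.2f\n", progress)
			stallTimer = time.Now().Add(stallDuration)
		}
		priorProgress = progress

		if progress >= 1.0 {
			fmt.Println("load complete")
			return
		}
		time.Sleep(250 * time.Millisecond)
	}
}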