mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-22 23:03:55 +00:00
Merge branch 'ollama:main' into main
This commit is contained in:
32
llm/ext_server/server.cpp
vendored
32
llm/ext_server/server.cpp
vendored
@@ -334,6 +334,7 @@ struct server_metrics {
|
||||
struct llama_server_context
|
||||
{
|
||||
llama_model *model = nullptr;
|
||||
float modelProgress = 0.0;
|
||||
llama_context *ctx = nullptr;
|
||||
|
||||
clip_ctx *clp_ctx = nullptr;
|
||||
@@ -737,7 +738,7 @@ struct llama_server_context
|
||||
sampler_names.emplace_back(sampler_name);
|
||||
}
|
||||
}
|
||||
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
||||
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1095,7 +1096,7 @@ struct llama_server_context
|
||||
std::vector<std::string> samplers_sequence;
|
||||
for (const auto &sampler_type : slot.sparams.samplers_sequence)
|
||||
{
|
||||
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
||||
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -2104,6 +2105,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" -ctk TYPE, --cache-type-k TYPE\n");
|
||||
@@ -2501,7 +2503,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
{
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--numa") {
|
||||
else if (arg == "--numa")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -2521,6 +2524,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
{
|
||||
params.cont_batching = true;
|
||||
}
|
||||
else if (arg == "-fa" || arg == "--flash-attn")
|
||||
{
|
||||
params.flash_attn = true;
|
||||
}
|
||||
else if (arg == "-np" || arg == "--parallel")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2529,7 +2536,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
params.n_parallel = std::stoi(argv[i]);
|
||||
} else if (arg == "-n" || arg == "--n-predict")
|
||||
}
|
||||
else if (arg == "-n" || arg == "--n-predict")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
@@ -2537,7 +2545,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
params.n_predict = std::stoi(argv[i]);
|
||||
} else if (arg == "-spf" || arg == "--system-prompt-file")
|
||||
}
|
||||
else if (arg == "-spf" || arg == "--system-prompt-file")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
@@ -2771,6 +2780,12 @@ inline void signal_handler(int signal) {
|
||||
shutdown_handler(signal);
|
||||
}
|
||||
|
||||
static bool update_load_progress(float progress, void *data)
|
||||
{
|
||||
((llama_server_context*)data)->modelProgress = progress;
|
||||
return true;
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
char* wchar_to_char(const wchar_t* wstr) {
|
||||
if (wstr == nullptr) return nullptr;
|
||||
@@ -2876,7 +2891,9 @@ int main(int argc, char **argv) {
|
||||
break;
|
||||
}
|
||||
case SERVER_STATE_LOADING_MODEL:
|
||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||
char buf[128];
|
||||
snprintf(&buf[0], 128, R"({"status": "loading model", "progress": %0.2f})", llama.modelProgress);
|
||||
res.set_content(buf, "application/json");
|
||||
res.status = 503; // HTTP Service Unavailable
|
||||
break;
|
||||
case SERVER_STATE_ERROR:
|
||||
@@ -3071,6 +3088,9 @@ int main(int argc, char **argv) {
|
||||
});
|
||||
|
||||
// load the model
|
||||
params.progress_callback = update_load_progress;
|
||||
params.progress_callback_user_data = (void*)&llama;
|
||||
|
||||
if (!llama.load_model(params))
|
||||
{
|
||||
state.store(SERVER_STATE_ERROR);
|
||||
|
||||
@@ -27,8 +27,16 @@ const (
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeQ3_K_XS
|
||||
fileTypeIQ3_XS
|
||||
fileTypeIQ3_XXS
|
||||
fileTypeIQ1_S
|
||||
fileTypeIQ4_NL
|
||||
fileTypeIQ3_S
|
||||
fileTypeIQ2_S
|
||||
fileTypeIQ4_XS
|
||||
fileTypeIQ2_M
|
||||
fileTypeIQ1_M
|
||||
fileTypeBF16
|
||||
|
||||
fileTypeUnknown
|
||||
)
|
||||
@@ -75,10 +83,26 @@ func ParseFileType(s string) (fileType, error) {
|
||||
return fileTypeIQ2_XS, nil
|
||||
case "Q2_K_S":
|
||||
return fileTypeQ2_K_S, nil
|
||||
case "Q3_K_XS":
|
||||
return fileTypeQ3_K_XS, nil
|
||||
case "IQ3_XS":
|
||||
return fileTypeIQ3_XS, nil
|
||||
case "IQ3_XXS":
|
||||
return fileTypeIQ3_XXS, nil
|
||||
case "IQ1_S":
|
||||
return fileTypeIQ1_S, nil
|
||||
case "IQ4_NL":
|
||||
return fileTypeIQ4_NL, nil
|
||||
case "IQ3_S":
|
||||
return fileTypeIQ3_S, nil
|
||||
case "IQ2_S":
|
||||
return fileTypeIQ2_S, nil
|
||||
case "IQ4_XS":
|
||||
return fileTypeIQ4_XS, nil
|
||||
case "IQ2_M":
|
||||
return fileTypeIQ2_M, nil
|
||||
case "IQ1_M":
|
||||
return fileTypeIQ1_M, nil
|
||||
case "BF16":
|
||||
return fileTypeBF16, nil
|
||||
default:
|
||||
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
|
||||
}
|
||||
@@ -126,10 +150,26 @@ func (t fileType) String() string {
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeQ3_K_XS:
|
||||
return "Q3_K_XS"
|
||||
case fileTypeIQ3_XS:
|
||||
return "IQ3_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
case fileTypeIQ1_S:
|
||||
return "IQ1_S"
|
||||
case fileTypeIQ4_NL:
|
||||
return "IQ4_NL"
|
||||
case fileTypeIQ3_S:
|
||||
return "IQ3_S"
|
||||
case fileTypeIQ2_S:
|
||||
return "IQ2_S"
|
||||
case fileTypeIQ4_XS:
|
||||
return "IQ4_XS"
|
||||
case fileTypeIQ2_M:
|
||||
return "IQ2_M"
|
||||
case fileTypeIQ1_M:
|
||||
return "IQ1_M"
|
||||
case fileTypeBF16:
|
||||
return "BF16"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||
|
||||
t.Offset = uint64(offset)
|
||||
|
||||
if _, err := rs.Seek(int64(t.size()), io.SeekCurrent); err != nil {
|
||||
if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
38
llm/ggml.go
38
llm/ggml.go
@@ -106,7 +106,7 @@ type Layer map[string]*Tensor
|
||||
|
||||
func (l Layer) size() (size uint64) {
|
||||
for _, t := range l {
|
||||
size += t.size()
|
||||
size += t.Size()
|
||||
}
|
||||
|
||||
return size
|
||||
@@ -124,12 +124,12 @@ type Tensor struct {
|
||||
}
|
||||
|
||||
func (t Tensor) blockSize() uint64 {
|
||||
switch {
|
||||
case t.Kind < 2:
|
||||
switch t.Kind {
|
||||
case 0, 1, 24, 25, 26, 27, 28, 31: // F32, F16, I8, I16, I32, I64, F64, BF16
|
||||
return 1
|
||||
case t.Kind < 10:
|
||||
case 2, 3, 8, 9, 20: // Q4_0, Q4_1, Q8_0, Q8_1, IQ4_NL
|
||||
return 32
|
||||
default:
|
||||
default: // All others
|
||||
return 256
|
||||
}
|
||||
}
|
||||
@@ -171,7 +171,29 @@ func (t Tensor) typeSize() uint64 {
|
||||
case 17: // IQ2_XS
|
||||
return 2 + 2*blockSize/8 + blockSize/32
|
||||
case 18: // IQ3_XXS
|
||||
return 2 + 3*blockSize/8
|
||||
return 2 + blockSize/4 + blockSize/8
|
||||
case 19: // IQ1_S
|
||||
return 2 + blockSize/8 + blockSize/16
|
||||
case 20: // IQ4_NL
|
||||
return 2 + blockSize/2
|
||||
case 21: // IQ3_S
|
||||
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
|
||||
case 22: // IQ2_S
|
||||
return 2 + blockSize/4 + blockSize/16
|
||||
case 23: // IQ4_XS
|
||||
return 2 + 2 + blockSize/2 + blockSize/64
|
||||
case 24: // I8
|
||||
return 1
|
||||
case 25: // I16
|
||||
return 2
|
||||
case 26: // I32
|
||||
return 4
|
||||
case 27: // I64
|
||||
return 8
|
||||
case 28: // F64
|
||||
return 8
|
||||
case 29: // IQ1_M
|
||||
return blockSize/8 + blockSize/16 + blockSize/32
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
@@ -185,7 +207,7 @@ func (t Tensor) parameters() uint64 {
|
||||
return count
|
||||
}
|
||||
|
||||
func (t Tensor) size() uint64 {
|
||||
func (t Tensor) Size() uint64 {
|
||||
return t.parameters() * t.typeSize() / t.blockSize()
|
||||
}
|
||||
|
||||
@@ -288,7 +310,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
|
||||
)
|
||||
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
|
||||
|
||||
16
llm/gguf.go
16
llm/gguf.go
@@ -62,16 +62,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
|
||||
return model, nil
|
||||
}
|
||||
|
||||
const (
|
||||
_ uint32 = iota
|
||||
GGUFTokenNormal
|
||||
GGUFTokenUnknown
|
||||
GGUFTokenControl
|
||||
GGUFTokenUserDefined
|
||||
GGUFTokenUnused
|
||||
GGUFTokenByte
|
||||
)
|
||||
|
||||
const (
|
||||
ggufTypeUint8 uint32 = iota
|
||||
ggufTypeInt8
|
||||
@@ -251,11 +241,11 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
}
|
||||
|
||||
for _, tensor := range llm.tensors {
|
||||
if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
|
||||
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
padding := llm.padding(int64(tensor.size()), int64(alignment))
|
||||
padding := llm.padding(int64(tensor.Size()), int64(alignment))
|
||||
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -480,9 +470,11 @@ var ggufKVOrder = map[string][]string{
|
||||
"gemma.attention.key_length",
|
||||
"gemma.attention.value_length",
|
||||
"general.file_type",
|
||||
"tokenizer.ggml.pre",
|
||||
"tokenizer.ggml.model",
|
||||
"tokenizer.ggml.tokens",
|
||||
"tokenizer.ggml.scores",
|
||||
"tokenizer.ggml.merges",
|
||||
"tokenizer.ggml.token_type",
|
||||
"tokenizer.ggml.bos_token_id",
|
||||
"tokenizer.ggml.eos_token_id",
|
||||
|
||||
Submodule llm/llama.cpp updated: 614d3b914e...74f33adf5f
31
llm/patches/01-load-progress.diff
Normal file
31
llm/patches/01-load-progress.diff
Normal file
@@ -0,0 +1,31 @@
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index ba1ecf0e..cead57cc 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
+ mparams.progress_callback = params.progress_callback;
|
||||
+ mparams.progress_callback_user_data = params.progress_callback_user_data;
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
diff --git a/common/common.h b/common/common.h
|
||||
index d80344f2..71e84834 100644
|
||||
--- a/common/common.h
|
||||
+++ b/common/common.h
|
||||
@@ -174,6 +174,13 @@ struct gpt_params {
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
+
|
||||
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
+ // If the provided progress_callback returns true, model loading continues.
|
||||
+ // If it returns false, model loading is immediately aborted.
|
||||
+ llama_progress_callback progress_callback = NULL;
|
||||
+ // context pointer passed to the progress callback
|
||||
+ void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
@@ -1,8 +1,17 @@
|
||||
From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Thu, 23 May 2024 11:18:45 -0700
|
||||
Subject: [PATCH] throw exception on load errors
|
||||
|
||||
---
|
||||
llama.cpp | 25 ++++++++++++++++---------
|
||||
1 file changed, 16 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 4225f955..7b762f86 100644
|
||||
index 15c66077..8ba90b6a 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
||||
@@ -11,10 +20,10 @@ index 4225f955..7b762f86 100644
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
|
||||
};
|
||||
@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file(
|
||||
}
|
||||
model->rpc_servers.push_back(servers);
|
||||
}
|
||||
|
||||
- int status = llama_model_load(path_model, *model, params);
|
||||
- GGML_ASSERT(status <= 0);
|
||||
- if (status < 0) {
|
||||
@@ -22,6 +31,7 @@ index 4225f955..7b762f86 100644
|
||||
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
||||
- } else if (status == -2) {
|
||||
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
||||
+
|
||||
+ try {
|
||||
+ int status = llama_model_load(path_model, *model, params);
|
||||
+ GGML_ASSERT(status <= 0);
|
||||
@@ -42,3 +52,6 @@ index 4225f955..7b762f86 100644
|
||||
}
|
||||
|
||||
return model;
|
||||
--
|
||||
2.45.1
|
||||
|
||||
|
||||
35
llm/patches/05-default-pretokenizer.diff
Normal file
35
llm/patches/05-default-pretokenizer.diff
Normal file
@@ -0,0 +1,35 @@
|
||||
From d02a06f3f45a09255ace8684a66590e06ce44605 Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Thu, 23 May 2024 11:33:20 -0700
|
||||
Subject: [PATCH] default pretokenizer on unrecognized type
|
||||
|
||||
---
|
||||
llama.cpp | 5 +----
|
||||
1 file changed, 1 insertion(+), 4 deletions(-)
|
||||
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 15c66077..af1aede3 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -4504,9 +4504,6 @@ static void llm_load_vocab(
|
||||
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
- } else if (
|
||||
- tokenizer_pre == "default") {
|
||||
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
tokenizer_pre == "llama-v3" ||
|
||||
@@ -4553,7 +4550,7 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
} else {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
--
|
||||
2.45.1
|
||||
|
||||
@@ -55,6 +55,7 @@ type llmServer struct {
|
||||
totalLayers uint64
|
||||
gpuCount int
|
||||
loadDuration time.Duration // Record how long it took the model to load
|
||||
loadProgress float32
|
||||
|
||||
sem *semaphore.Weighted
|
||||
}
|
||||
@@ -200,6 +201,23 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
params = append(params, "--numa")
|
||||
}
|
||||
|
||||
flashAttnEnabled := envconfig.FlashAttention
|
||||
|
||||
// partial offloading does not support flash attention
|
||||
if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
||||
flashAttnEnabled = false
|
||||
}
|
||||
|
||||
// only cuda (compute capability 7+) and metal support flash attention
|
||||
for _, g := range gpus {
|
||||
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
|
||||
flashAttnEnabled = false
|
||||
}
|
||||
}
|
||||
if flashAttnEnabled {
|
||||
params = append(params, "--flash-attn")
|
||||
}
|
||||
|
||||
numParallel := envconfig.NumParallel
|
||||
|
||||
// TODO (jmorganca): multimodal models don't support parallel yet
|
||||
@@ -408,10 +426,11 @@ func (s ServerStatus) ToString() string {
|
||||
}
|
||||
|
||||
type ServerStatusResp struct {
|
||||
Status string `json:"status"`
|
||||
SlotsIdle int `json:"slots_idle"`
|
||||
SlotsProcessing int `json:"slots_processing"`
|
||||
Error string `json:"error"`
|
||||
Status string `json:"status"`
|
||||
SlotsIdle int `json:"slots_idle"`
|
||||
SlotsProcessing int `json:"slots_processing"`
|
||||
Error string `json:"error"`
|
||||
Progress float32 `json:"progress"`
|
||||
}
|
||||
|
||||
func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
@@ -459,6 +478,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
case "no slot available":
|
||||
return ServerStatusNoSlotsAvailable, nil
|
||||
case "loading model":
|
||||
s.loadProgress = status.Progress
|
||||
return ServerStatusLoadingModel, nil
|
||||
default:
|
||||
return ServerStatusError, fmt.Errorf("server error: %+v", status)
|
||||
@@ -499,7 +519,8 @@ func (s *llmServer) Ping(ctx context.Context) error {
|
||||
|
||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
|
||||
stallDuration := 60 * time.Second
|
||||
stallTimer := time.Now().Add(stallDuration) // give up if we stall for
|
||||
|
||||
slog.Info("waiting for llama runner to start responding")
|
||||
var lastStatus ServerStatus = -1
|
||||
@@ -517,13 +538,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
|
||||
default:
|
||||
}
|
||||
if time.Now().After(expiresAt) {
|
||||
if time.Now().After(stallTimer) {
|
||||
// timeout
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
|
||||
return fmt.Errorf("timed out waiting for llama runner to start - progress %0.2f - %s", s.loadProgress, msg)
|
||||
}
|
||||
if s.cmd.ProcessState != nil {
|
||||
msg := ""
|
||||
@@ -534,6 +555,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
|
||||
defer cancel()
|
||||
priorProgress := s.loadProgress
|
||||
status, _ := s.getServerStatus(ctx)
|
||||
if lastStatus != status && status != ServerStatusReady {
|
||||
// Only log on status changes
|
||||
@@ -546,6 +568,11 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
return nil
|
||||
default:
|
||||
lastStatus = status
|
||||
// Reset the timer as long as we're making forward progress on the load
|
||||
if priorProgress != s.loadProgress {
|
||||
slog.Debug(fmt.Sprintf("model load progress %0.2f", s.loadProgress))
|
||||
stallTimer = time.Now().Add(stallDuration)
|
||||
}
|
||||
time.Sleep(time.Millisecond * 250)
|
||||
continue
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user