diff --git a/README.md b/README.md
index 8434dde7..8a815215 100644
--- a/README.md
+++ b/README.md
@@ -624,6 +624,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
 - [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
 - [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
+- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)
 
 ### Supported backends
 
diff --git a/harmony/harmonyparser.go b/harmony/harmonyparser.go
index e7249661..a51819dd 100644
--- a/harmony/harmonyparser.go
+++ b/harmony/harmonyparser.go
@@ -1,7 +1,6 @@
 package harmony
 
 import (
-    "context"
     "fmt"
     "log/slog"
     "strings"
@@ -292,7 +291,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
     for _, event := range events {
         switch event := event.(type) {
         case HarmonyEventHeaderComplete:
-            slog.Log(context.TODO(), logutil.LevelTrace, "harmony event header complete", "header", event.Header)
+            logutil.Trace("harmony event header complete", "header", event.Header)
             switch event.Header.Channel {
             case "analysis":
                 if event.Header.Recipient != "" {
@@ -315,7 +314,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
                 h.state = harmonyMessageState_Normal
             }
         case HarmonyEventContentEmitted:
-            slog.Log(context.TODO(), logutil.LevelTrace, "harmony event content", "content", event.Content, "state", h.state)
+            logutil.Trace("harmony event content", "content", event.Content, "state", h.state)
             if h.state == harmonyMessageState_Normal {
                 contentSb.WriteString(event.Content)
             } else if h.state == harmonyMessageState_Thinking {
diff --git a/llama/patches/0025-harden-uncaught-exception-registration.patch b/llama/patches/0025-harden-uncaught-exception-registration.patch
new file mode 100644
index 00000000..d5fc2598
--- /dev/null
+++ b/llama/patches/0025-harden-uncaught-exception-registration.patch
@@ -0,0 +1,28 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen
+Date: Fri, 29 Aug 2025 16:53:08 -0700
+Subject: [PATCH] harden uncaught exception registration
+
+---
+ ggml/src/ggml.cpp | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
+index 0d388d45..f5bcb446 100644
+--- a/ggml/src/ggml.cpp
++++ b/ggml/src/ggml.cpp
+@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
+         return false;
+     }
+     const auto prev{std::get_terminate()};
+-    GGML_ASSERT(prev != ggml_uncaught_exception);
+-    previous_terminate_handler = prev;
++    // GGML_ASSERT(prev != ggml_uncaught_exception);
++    if (prev != ggml_uncaught_exception) {
++        previous_terminate_handler = prev;
++    } else {
++        GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
++    }
+     std::set_terminate(ggml_uncaught_exception);
+     return true;
+ }();
diff --git a/llm/server.go b/llm/server.go
index b9ffdc6c..664a69fb 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -678,8 +678,12 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
 
     if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
         for _, gpu := range gpus {
+            available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
+            if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
+                available = 0
+            }
             slog.Info("gpu memory", "id", gpu.ID,
-                "available", format.HumanBytes2(gpu.FreeMemory-envconfig.GpuOverhead()-gpu.MinimumMemory),
+                "available", format.HumanBytes2(available),
                 "free", format.HumanBytes2(gpu.FreeMemory),
                 "minimum", format.HumanBytes2(gpu.MinimumMemory),
                 "overhead", format.HumanBytes2(envconfig.GpuOverhead()))
@@ -861,7 +865,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
         }
         layers[i] += memory.CPU.Weights[i].Size
         layers[i] += memory.CPU.Cache[i].Size
-        slog.Log(context.TODO(), logutil.LevelTrace, "layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
+        logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
     }
 
     gpuLayers := ml.GPULayersList{}
diff --git a/logutil/logutil.go b/logutil/logutil.go
index 406caf54..fff277b8 100644
--- a/logutil/logutil.go
+++ b/logutil/logutil.go
@@ -1,6 +1,7 @@
 package logutil
 
 import (
+    "context"
     "io"
     "log/slog"
     "path/filepath"
@@ -27,3 +28,11 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
         },
     }))
 }
+
+func Trace(msg string, args ...any) {
+    slog.Log(context.TODO(), LevelTrace, msg, args...)
+}
+
+func TraceContext(ctx context.Context, msg string, args ...any) {
+    slog.Log(ctx, LevelTrace, msg, args...)
+}
diff --git a/ml/backend.go b/ml/backend.go
index 57823e4d..154a0f1b 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -266,7 +266,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 // allocation is guaranteed to be provided so that if it failed, the caller can
 // accommodate that to make forward progress.
 type BackendMemory struct {
-    // InputsWeights are always located on the CPU and cannot be moved
+    // InputWeights are always located on the CPU and cannot be moved
     InputWeights Memory
 
     // CPU model components are located in system memory. This does not
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 5fef97cd..931386d5 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -271,7 +271,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
             tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
             C.ggml_set_name(tt, cname)
 
-            slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+            logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
 
             size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
             if layer == -1 {
@@ -378,7 +378,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
     }
 
     for bs := range maps.Values(bbs) {
-        slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
+        logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
             "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
     }
 
@@ -811,7 +811,7 @@ func (c *Context) Reserve() {
             }
         }
 
-        slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
+        logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
             "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
             "size", format.HumanBytes2(uint64(bufferStatus.size)))
     }
diff --git a/ml/backend/ggml/ggml/src/ggml.cpp b/ml/backend/ggml/ggml/src/ggml.cpp
index 0d388d45..f5bcb446 100644
--- a/ml/backend/ggml/ggml/src/ggml.cpp
+++ b/ml/backend/ggml/ggml/src/ggml.cpp
@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
         return false;
     }
     const auto prev{std::get_terminate()};
-    GGML_ASSERT(prev != ggml_uncaught_exception);
-    previous_terminate_handler = prev;
+    // GGML_ASSERT(prev != ggml_uncaught_exception);
+    if (prev != ggml_uncaught_exception) {
+        previous_terminate_handler = prev;
+    } else {
+        GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
+    }
     std::set_terminate(ggml_uncaught_exception);
     return true;
 }();
diff --git a/model/bytepairencoding.go b/model/bytepairencoding.go
index e4083dfc..b2cb1132 100644
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -2,7 +2,6 @@ package model
 
 import (
     "cmp"
-    "context"
     "fmt"
     "iter"
     "log/slog"
@@ -202,7 +201,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
         }
     }
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+    logutil.Trace("encoded", "string", s, "ids", ids)
 
     if addSpecial && len(ids) > 0 {
         ids = bpe.vocab.addSpecials(ids)
@@ -243,6 +242,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
         }
     }
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
+    logutil.Trace("decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
     return sb.String(), nil
 }
diff --git a/model/model.go b/model/model.go
index 190dedc7..30754143 100644
--- a/model/model.go
+++ b/model/model.go
@@ -1,12 +1,10 @@
 package model
 
 import (
-    "context"
     "errors"
     "fmt"
     _ "image/jpeg"
     _ "image/png"
-    "log/slog"
     "os"
     "reflect"
     "strconv"
@@ -198,7 +196,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
             names := fn(tagsCopy)
             for _, name := range names {
                 if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
-                    slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
+                    logutil.Trace("found tensor", "", tensor)
                     vv.Set(reflect.ValueOf(tensor))
                     break
                 }
diff --git a/model/sentencepiece.go b/model/sentencepiece.go
index 7d725f04..4300f45f 100644
--- a/model/sentencepiece.go
+++ b/model/sentencepiece.go
@@ -2,7 +2,6 @@ package model
 
 import (
     "container/heap"
-    "context"
     "fmt"
     "log/slog"
     "strconv"
@@ -25,7 +24,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary {
 }
 
 func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
-    slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
+    logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
 
     counter := map[int]int{}
     var maxTokenLen int
@@ -39,7 +38,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
         }
     }
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+    logutil.Trace("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
         "user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
         "max token len", maxTokenLen)
 
@@ -182,7 +181,7 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
         }
     }
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+    logutil.Trace("encoded", "string", s, "ids", ids)
 
     if addSpecial && len(ids) > 0 {
         ids = spm.vocab.addSpecials(ids)
@@ -246,6 +245,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
         }
     }
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
+    logutil.Trace("decoded", "ids", ids, "string", sb.String())
     return sb.String(), nil
 }
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index b8605db8..9b54a372 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -429,12 +429,12 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
     // before setting up the next batch so the seqs inputs are ready to receive their
     // token values and we get the correct input pointers for the batchInputs
     if pendingBatch.ctx != nil {
-        slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
+        logutil.Trace("forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
         <-pendingBatch.computeStartedCh
-        slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
+        logutil.Trace("forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
         nextBatch.inputsReadyCh = pendingBatch.outputsReadyCh // Chain the ouputs from the pending batch to the next inputs batch
     } else {
-        slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no pending batch detected", "batchID", s.batchID)
+        logutil.Trace("forwardBatch no pending batch detected", "batchID", s.batchID)
         // No pendingBatch, so the inputs will be ready in the seqs immediately
         nextBatch.inputsReadyCh = make(chan struct{}, 1)
         nextBatch.inputsReadyCh <- struct{}{}
@@ -546,7 +546,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
             if i+1 == len(seq.inputs) {
                 batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1))
             }
-            slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
+            logutil.Trace("forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
             seq.pendingInputs = append(seq.pendingInputs, inp)
         }
 
@@ -560,7 +560,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
     }
 
     if len(batchInputs) == 0 {
-        slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no batchInputs, going idle", "batchID", s.batchID)
+        logutil.Trace("forwardBatch no batchInputs, going idle", "batchID", s.batchID)
         nextBatch.ctx.Close()
         nextBatch.ctx = nil
         return
@@ -589,14 +589,14 @@ func (s *Server) computeBatch(activeBatch batchState) {
     defer activeBatch.ctx.Close()
 
     // Wait until inputs are ready
-    slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
+    logutil.Trace("computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
     <-activeBatch.inputsReadyCh
-    slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: inputs are ready", "batchID", activeBatch.id)
+    logutil.Trace("computeBatch: inputs are ready", "batchID", activeBatch.id)
 
     // Once we complete, signal the next batch of inputs are ready
     // This will unblock the next computeBatch, or forwardBatch if new seqs come in
     defer func() {
-        slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: outputs are ready", "batchID", activeBatch.id)
+        logutil.Trace("computeBatch: outputs are ready", "batchID", activeBatch.id)
         activeBatch.outputsReadyCh <- struct{}{}
     }()
 
@@ -626,7 +626,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
         // Detect if the sequence we're processing has already been completed and replaced
         // with a new sequence
         if seq != activeBatch.seqs[i] {
-            slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
+            logutil.Trace("computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
             continue
         }
 
@@ -666,18 +666,18 @@ func (s *Server) computeBatch(activeBatch batchState) {
     activeBatch.batch.Inputs.SetValueFromIntSlice(batchInputs)
     activeBatch.ctx.ComputeWithNotify(
         func() {
-            slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
+            logutil.Trace("computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
             activeBatch.computeStartedCh <- struct{}{}
         },
         activeBatch.modelOutput)
 
     logits := activeBatch.modelOutput.Floats()
-    slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: logits ready", "batchID", activeBatch.id)
+    logutil.Trace("computeBatch: logits ready", "batchID", activeBatch.id)
 
     s.mu.Lock()
     defer s.mu.Unlock()
 
-    slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: decoding", "batchID", activeBatch.id)
+    logutil.Trace("computeBatch: decoding", "batchID", activeBatch.id)
     for i, seq := range s.seqs {
         if seq == nil || nextBatchTokens[i] == nil {
             continue
@@ -697,7 +697,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 
         // sample a token
         vocabSize := len(logits) / len(activeBatch.batch.Outputs)
-        slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(logits), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
+        logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(logits), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
         token, err := seq.sampler.Sample(logits[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
         if err != nil {
             s.hardErrCh <- fmt.Errorf("failed to sample token: %w", err)
@@ -711,7 +711,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
             // TODO (jmorganca): we should send this back
             // as it's important for the /api/generate context
             // seq.responses <- piece
-            slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
+            logutil.Trace("computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
             s.removeSequence(i, llm.DoneReasonStop)
             continue
         }
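
For illustration, a minimal sketch of how a call site can use the new logutil helpers from logutil/logutil.go. Only Trace, TraceContext, LevelTrace, and NewLogger come from the diff; the import path github.com/ollama/ollama/logutil and the explicit NewLogger setup are assumptions made to keep the example standalone.

```go
package main

import (
	"context"
	"log/slog"
	"os"

	"github.com/ollama/ollama/logutil" // assumed import path for a standalone example
)

func main() {
	// Enable trace-level output; NewLogger and LevelTrace already exist in logutil.
	slog.SetDefault(logutil.NewLogger(os.Stderr, logutil.LevelTrace))

	// Call sites previously spelled this out as:
	//   slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
	// The Trace helper wraps that boilerplate with context.TODO() behind the scenes.
	logutil.Trace("encoded", "string", "hello", "ids", []int32{1, 2, 3})

	// TraceContext is the variant for callers that already carry a real context.
	logutil.TraceContext(context.Background(), "decoded", "string", "hello")
}
```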
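The clamp added in llm/server.go guards against unsigned underflow: when FreeMemory is smaller than GpuOverhead() plus MinimumMemory, the plain subtraction wraps around to an enormous value instead of going negative. A standalone sketch of the failure mode and the fix, using hypothetical values and assuming the memory fields are uint64 as their use with format.HumanBytes2 suggests:

```go
package main

import "fmt"

func main() {
	// Hypothetical values: 1 GiB free, 2 GiB overhead, 512 MiB minimum.
	var free, overhead, minimum uint64 = 1 << 30, 2 << 30, 512 << 20

	// Unsigned subtraction wraps around rather than producing a negative number.
	available := free - overhead - minimum
	fmt.Println(available) // a huge wrapped value (~1.8e19), not a deficit

	// The guard from the patch clamps "available" to zero in that case.
	if free < overhead+minimum {
		available = 0
	}
	fmt.Println(available) // 0
}
```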