Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-22 14:53:56 +00:00)

Compare commits

9 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 501cb38b8c | |
| | 5994e8e8fd | |
| | 59e3a35203 | |
| | b3e6120736 | |
| | fb92b61754 | |
| | 8149a3c86e | |
| | 0cc90a8186 | |
| | e42300f25b | |
| | 66e73809a1 | |
@@ -624,6 +624,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
 - [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
 - [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
+- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)

 ### Supported backends

@@ -1,7 +1,6 @@
 package harmony

 import (
-"context"
 "fmt"
 "log/slog"
 "strings"
@@ -292,7 +291,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 for _, event := range events {
 switch event := event.(type) {
 case HarmonyEventHeaderComplete:
-slog.Log(context.TODO(), logutil.LevelTrace, "harmony event header complete", "header", event.Header)
+logutil.Trace("harmony event header complete", "header", event.Header)
 switch event.Header.Channel {
 case "analysis":
 if event.Header.Recipient != "" {
@@ -315,7 +314,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 h.state = harmonyMessageState_Normal
 }
 case HarmonyEventContentEmitted:
-slog.Log(context.TODO(), logutil.LevelTrace, "harmony event content", "content", event.Content, "state", h.state)
+logutil.Trace("harmony event content", "content", event.Content, "state", h.state)
 if h.state == harmonyMessageState_Normal {
 contentSb.WriteString(event.Content)
 } else if h.state == harmonyMessageState_Thinking {
@@ -0,0 +1,28 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Fri, 29 Aug 2025 16:53:08 -0700
+Subject: [PATCH] harden uncaught exception registration
+
+---
+ ggml/src/ggml.cpp | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
+index 0d388d45..f5bcb446 100644
+--- a/ggml/src/ggml.cpp
++++ b/ggml/src/ggml.cpp
+@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
+ return false;
+ }
+ const auto prev{std::get_terminate()};
+- GGML_ASSERT(prev != ggml_uncaught_exception);
+- previous_terminate_handler = prev;
++ // GGML_ASSERT(prev != ggml_uncaught_exception);
++ if (prev != ggml_uncaught_exception) {
++ previous_terminate_handler = prev;
++ } else {
++ GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
++ }
+ std::set_terminate(ggml_uncaught_exception);
+ return true;
+ }();
@@ -678,8 +678,12 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ

 if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
 for _, gpu := range gpus {
+available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
+if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
+available = 0
+}
 slog.Info("gpu memory", "id", gpu.ID,
-"available", format.HumanBytes2(gpu.FreeMemory-envconfig.GpuOverhead()-gpu.MinimumMemory),
+"available", format.HumanBytes2(available),
 "free", format.HumanBytes2(gpu.FreeMemory),
 "minimum", format.HumanBytes2(gpu.MinimumMemory),
 "overhead", format.HumanBytes2(envconfig.GpuOverhead()))
@@ -861,7 +865,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 }
 layers[i] += memory.CPU.Weights[i].Size
 layers[i] += memory.CPU.Cache[i].Size
-slog.Log(context.TODO(), logutil.LevelTrace, "layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
+logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
 }

 gpuLayers := ml.GPULayersList{}
@@ -1,6 +1,7 @@
 package logutil

 import (
+"context"
 "io"
 "log/slog"
 "path/filepath"
@@ -27,3 +28,11 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
 },
 }))
 }
+
+func Trace(msg string, args ...any) {
+slog.Log(context.TODO(), LevelTrace, msg, args...)
+}
+
+func TraceContext(ctx context.Context, msg string, args ...any) {
+slog.Log(ctx, LevelTrace, msg, args...)
+}
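The two helpers added above are what the verbose slog.Log(context.TODO(), logutil.LevelTrace, ...) call sites throughout the rest of this compare are rewritten to use. A minimal sketch of a migrated caller, assuming the package import path github.com/ollama/ollama/logutil (the path itself is not shown in the hunks):

```go
package main

import (
	"context"
	"log/slog"
	"os"

	"github.com/ollama/ollama/logutil"
)

func main() {
	// Install a logger that actually emits trace-level records; by default
	// slog drops anything below Info, so LevelTrace output would be invisible.
	slog.SetDefault(logutil.NewLogger(os.Stderr, logutil.LevelTrace))

	// Before this change, call sites spelled out the context and the level:
	slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", "blk.0.attn_q.weight")

	// After this change, the same record is a single call:
	logutil.Trace("created tensor", "name", "blk.0.attn_q.weight")

	// When a context is already on hand, TraceContext threads it through.
	logutil.TraceContext(context.Background(), "encoded", "ids", []int32{1, 2, 3})
}
```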
@@ -266,7 +266,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 // allocation is guaranteed to be provided so that if it failed, the caller can
 // accommodate that to make forward progress.
 type BackendMemory struct {
-// InputsWeights are always located on the CPU and cannot be moved
+// InputWeights are always located on the CPU and cannot be moved
 InputWeights Memory

 // CPU model components are located in system memory. This does not
@@ -271,7 +271,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
 C.ggml_set_name(tt, cname)

-slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

 size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
 if layer == -1 {
@@ -378,7 +378,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 }

 for bs := range maps.Values(bbs) {
-slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
+logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
 "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
 }

@@ -811,7 +811,7 @@ func (c *Context) Reserve() {
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
+logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
 "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
 }

ml/backend/ggml/ggml/src/ggml.cpp (vendored, 8 changes)
@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
 return false;
 }
 const auto prev{std::get_terminate()};
-GGML_ASSERT(prev != ggml_uncaught_exception);
-previous_terminate_handler = prev;
+// GGML_ASSERT(prev != ggml_uncaught_exception);
+if (prev != ggml_uncaught_exception) {
+previous_terminate_handler = prev;
+} else {
+GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
+}
 std::set_terminate(ggml_uncaught_exception);
 return true;
 }();
@@ -2,7 +2,6 @@ package model

 import (
 "cmp"
-"context"
 "fmt"
 "iter"
 "log/slog"
@@ -202,12 +201,11 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
-
 if addSpecial && len(ids) > 0 {
 ids = bpe.vocab.addSpecials(ids)
 }

+logutil.Trace("encoded", "string", s, "ids", ids)
 return ids, nil
 }

@@ -243,6 +241,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
+logutil.Trace("decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
 return sb.String(), nil
 }
@@ -1,12 +1,11 @@
 package model

 import (
-"context"
 "errors"
 "fmt"
 _ "image/jpeg"
 _ "image/png"
-"log/slog"
+"math"
 "os"
 "reflect"
 "strconv"
@@ -105,6 +104,10 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
 }

 arch := b.Config().Architecture()
+if b.Config().Uint("pooling_type", math.MaxUint32) != math.MaxUint32 {
+arch = arch + "_embed"
+}
+
 f, ok := models[arch]
 if !ok {
 return nil, fmt.Errorf("unsupported model architecture %q", arch)
@@ -198,7 +201,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 names := fn(tagsCopy)
 for _, name := range names {
 if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
-slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
+logutil.Trace("found tensor", "", tensor)
 vv.Set(reflect.ValueOf(tensor))
 break
 }
model/models/gemma3/embed.go (new file, 73 lines)
@@ -0,0 +1,73 @@
+package gemma3
+
+import (
+	"errors"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type embedModel struct {
+	model.Base
+	model.SentencePieceModel
+
+	*TextModel
+	PoolingType uint32
+
+	Dense [2]*nn.Linear `gguf:"dense"`
+}
+
+func (m *embedModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	batch.Outputs = batch.Positions // return all positions
+	hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
+
+	switch m.PoolingType {
+	case 0: // None
+	case 1: // Mean
+		hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Mean(ctx)
+		hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+	default:
+		return nil, errors.New("unsupported pooling type")
+	}
+
+	for _, dense := range m.Dense {
+		hiddenStates = dense.Forward(ctx, hiddenStates)
+	}
+
+	return hiddenStates, nil
+}
+
+func newEmbedModel(c fs.Config) (model.Model, error) {
+	m := &embedModel{
+		SentencePieceModel: model.NewSentencePieceModel(
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{
+						int32(c.Uint("tokenizer.ggml.eos_token_id")),
+						int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
+					},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		TextModel:   newTextModel(c),
+		PoolingType: c.Uint("pooling_type", 0),
+	}
+
+	m.Cache = kvcache.NewWrapperCache(
+		kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
+		kvcache.NewCausalCache(m.Shift),
+	)
+
+	return m, nil
+}
@@ -141,12 +141,11 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-
-return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
+hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
+return m.Output.Forward(ctx, hiddenStates), nil
 }

 func init() {
 model.Register("gemma3", New)
+model.Register("gemma3_embed", newEmbedModel)
 }
@@ -159,8 +159,11 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
 return hiddenState.Add(ctx, residual)
 }

-func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
-hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
+positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+
+hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextConfig.hiddenSize)))

 // set image embeddings
@@ -198,5 +201,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-return m.Output.Forward(ctx, hiddenState)
+return hiddenState
 }
@@ -2,7 +2,6 @@ package model

 import (
 "container/heap"
-"context"
 "fmt"
 "log/slog"
 "strconv"
@@ -25,7 +24,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary {
 }

 func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
-slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
+logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])

 counter := map[int]int{}
 var maxTokenLen int
@@ -39,7 +38,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+logutil.Trace("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
 "user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
 "max token len", maxTokenLen)

@@ -182,12 +181,11 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
-
 if addSpecial && len(ids) > 0 {
 ids = spm.vocab.addSpecials(ids)
 }

+logutil.Trace("encoded", "string", s, "ids", ids)
 return ids, nil
 }

@@ -246,6 +244,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 }
 }

-slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
+logutil.Trace("decoded", "ids", ids, "string", sb.String())
 return sb.String(), nil
 }
@@ -49,7 +49,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
 slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
 }

-slog.Debug("adding bos token to prompt", "id", v.BOS)
+slog.Debug("adding bos token to prompt", "id", v.BOS[0])
 ids = append([]int32{v.BOS[0]}, ids...)
 }

@@ -58,7 +58,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
 slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
 }

-slog.Debug("adding eos token to prompt", "id", v.EOS)
+slog.Debug("adding eos token to prompt", "id", v.EOS[0])
 ids = append(ids, v.EOS[0])
 }

@@ -95,7 +95,7 @@ type InputCacheSlot struct {
 lastUsed time.Time
 }

-func (c *InputCache) LoadCacheSlot(prompt []*input.Input) (*InputCacheSlot, []*input.Input, error) {
+func (c *InputCache) LoadCacheSlot(prompt []*input.Input, cachePrompt bool) (*InputCacheSlot, []*input.Input, error) {
 var slot *InputCacheSlot
 var numPast int32
 var err error
@@ -113,6 +113,10 @@ func (c *InputCache) LoadCacheSlot(prompt []*input.Input) (*InputCacheSlot, []*i
 return nil, nil, err
 }

+if !cachePrompt {
+numPast = 0
+}
+
 slot.InUse = true
 slot.lastUsed = time.Now()

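Taken together with the runner hunks further down: the completion path keeps prompt caching enabled (LoadCacheSlot(seq.inputs, true)), while the new embeddings handler passes false, so the numPast reset above makes an embedding request reuse a free slot but re-evaluate the whole prompt rather than resuming from a cached prefix.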
@@ -393,7 +393,7 @@ func TestLoadCacheSlot(t *testing.T) {

 for _, tt := range tests {
 t.Run(tt.name, func(t *testing.T) {
-slot, remainingPrompt, err := tt.cache.LoadCacheSlot(tt.prompt)
+slot, remainingPrompt, err := tt.cache.LoadCacheSlot(tt.prompt, true)

 // Check error state
 if (err != nil) != tt.wantErr {
@@ -11,6 +11,7 @@ import (
 "image"
 "log"
 "log/slog"
+"math"
 "net"
 "net/http"
 "os"
@@ -405,6 +406,8 @@ func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 func (s *Server) run(ctx context.Context) {
 s.ready.Wait()

+supportsAsync := s.model.Backend().Config().Uint("pooling_type", math.MaxUint32) == math.MaxUint32
+
 var activeBatch batchState
 for {
 select {
@@ -418,7 +421,12 @@ func (s *Server) run(ctx context.Context) {
 if err != nil {
 panic(err)
 }
-go s.computeBatch(activeBatch)
+
+if supportsAsync {
+go s.computeBatch(activeBatch)
+} else {
+s.computeBatch(activeBatch)
+}
 }
 }
 }
@@ -429,12 +437,12 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 // before setting up the next batch so the seqs inputs are ready to receive their
 // token values and we get the correct input pointers for the batchInputs
 if pendingBatch.ctx != nil {
-slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
+logutil.Trace("forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
 <-pendingBatch.computeStartedCh
-slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
+logutil.Trace("forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
 nextBatch.inputsReadyCh = pendingBatch.outputsReadyCh // Chain the ouputs from the pending batch to the next inputs batch
 } else {
-slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no pending batch detected", "batchID", s.batchID)
+logutil.Trace("forwardBatch no pending batch detected", "batchID", s.batchID)
 // No pendingBatch, so the inputs will be ready in the seqs immediately
 nextBatch.inputsReadyCh = make(chan struct{}, 1)
 nextBatch.inputsReadyCh <- struct{}{}
@@ -546,7 +554,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 if i+1 == len(seq.inputs) {
 batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1))
 }
-slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
+logutil.Trace("forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
 seq.pendingInputs = append(seq.pendingInputs, inp)
 }

@@ -560,7 +568,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 }

 if len(batchInputs) == 0 {
-slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no batchInputs, going idle", "batchID", s.batchID)
+logutil.Trace("forwardBatch no batchInputs, going idle", "batchID", s.batchID)
 nextBatch.ctx.Close()
 nextBatch.ctx = nil
 return
@@ -589,14 +597,14 @@ func (s *Server) computeBatch(activeBatch batchState) {
 defer activeBatch.ctx.Close()

 // Wait until inputs are ready
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
 <-activeBatch.inputsReadyCh
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: inputs are ready", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: inputs are ready", "batchID", activeBatch.id)

 // Once we complete, signal the next batch of inputs are ready
 // This will unblock the next computeBatch, or forwardBatch if new seqs come in
 defer func() {
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: outputs are ready", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: outputs are ready", "batchID", activeBatch.id)
 activeBatch.outputsReadyCh <- struct{}{}
 }()

@@ -626,7 +634,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 // Detect if the sequence we're processing has already been completed and replaced
 // with a new sequence
 if seq != activeBatch.seqs[i] {
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
+logutil.Trace("computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
 continue
 }

@@ -666,18 +674,19 @@ func (s *Server) computeBatch(activeBatch batchState) {
 activeBatch.batch.Inputs.SetValueFromIntSlice(batchInputs)
 activeBatch.ctx.ComputeWithNotify(
 func() {
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
 activeBatch.computeStartedCh <- struct{}{}
 },
 activeBatch.modelOutput)
-logits := activeBatch.modelOutput.Floats()
+
+outputs := activeBatch.modelOutput.Floats()

-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: logits ready", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: logits ready", "batchID", activeBatch.id)

 s.mu.Lock()
 defer s.mu.Unlock()

-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: decoding", "batchID", activeBatch.id)
+logutil.Trace("computeBatch: decoding", "batchID", activeBatch.id)
 for i, seq := range s.seqs {
 if seq == nil || nextBatchTokens[i] == nil {
 continue
@@ -689,16 +698,15 @@ func (s *Server) computeBatch(activeBatch batchState) {

 // if done processing the prompt, generate an embedding and return
 if seq.embeddingOnly {
-// TODO(jessegross): Embedding support
-slog.Warn("generation of embedding outputs not yet supported", "id", activeBatch.id, "seqIdx", i)
+seq.embedding <- outputs
 s.removeSequence(i, llm.DoneReasonStop)
 continue
 }

 // sample a token
-vocabSize := len(logits) / len(activeBatch.batch.Outputs)
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(logits), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
-token, err := seq.sampler.Sample(logits[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
+vocabSize := len(outputs) / len(activeBatch.batch.Outputs)
+logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(outputs), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
+token, err := seq.sampler.Sample(outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
 if err != nil {
 s.hardErrCh <- fmt.Errorf("failed to sample token: %w", err)
 return
@@ -711,7 +719,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 // TODO (jmorganca): we should send this back
 // as it's important for the /api/generate context
 // seq.responses <- piece
-slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
+logutil.Trace("computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
 s.removeSequence(i, llm.DoneReasonStop)
 continue
 }
@@ -834,7 +842,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 found := false
 for i, sq := range s.seqs {
 if sq == nil {
-seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs)
+seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, true)
 if err != nil {
 s.mu.Unlock()
 s.seqsSem.Release(1)
@@ -890,6 +898,67 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 }
 }

+func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
+	if s.model.Backend().Config().Uint("pooling_type", math.MaxUint32) == math.MaxUint32 {
+		http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
+		return
+	}
+
+	var req llm.EmbeddingRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
+	if err != nil {
+		http.Error(w, fmt.Sprintf("failed to create new sequence: %v", err), http.StatusInternalServerError)
+		return
+	}
+
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		if errors.Is(err, context.Canceled) {
+			slog.Info("aborting embedding request due to client closing the connection")
+		} else {
+			http.Error(w, fmt.Sprintf("failed to acquire semaphore: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	s.mu.Lock()
+	found := false
+	for i, sq := range s.seqs {
+		if sq == nil {
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, false)
+			if err != nil {
+				s.mu.Unlock()
+				s.seqsSem.Release(1)
+				http.Error(w, fmt.Sprintf("failed to load cache: %v", err), http.StatusInternalServerError)
+				return
+			}
+
+			s.seqs[i] = seq
+			s.cond.Signal()
+			found = true
+			break
+		}
+	}
+	s.mu.Unlock()
+
+	if !found {
+		s.seqsSem.Release(1)
+		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
+		return
+	}
+
+	if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
+		Embedding: <-seq.embedding,
+	}); err != nil {
+		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+	}
+}
+
 func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 w.Header().Set("Content-Type", "application/json")
 if err := json.NewEncoder(w).Encode(&llm.ServerStatusResponse{
@@ -1206,10 +1275,7 @@ func Execute(args []string) error {
 mux := http.NewServeMux()
 // TODO: support embeddings
 mux.HandleFunc("POST /load", server.load)
-mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
-http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
-})
-
+mux.HandleFunc("POST /embedding", server.embeddings)
 mux.HandleFunc("POST /completion", server.completion)
 mux.HandleFunc("GET /health", server.health)
