Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-22 14:53:56 +00:00)

Commit: Merge branch 'ollama:main' into main
@@ -191,7 +191,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 ### Multimodal models
 
 ```
->>> What's in this image? /Users/jmorgan/Desktop/smile.png
+ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```
 
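
The documentation change above swaps the REPL-style prompt for a one-shot `ollama run llava "..."` invocation. The same request can be made programmatically; the sketch below is illustrative only, assuming a local server on the default port 11434, a pulled `llava` model, and a `smile.png` in the working directory (none of which come from this diff).

```go
// Illustrative only: a one-shot multimodal request against a local Ollama
// server via /api/generate, with the image passed as a base64 string.
package main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
)

func main() {
	img, err := os.ReadFile("smile.png")
	if err != nil {
		log.Fatal(err)
	}
	body, _ := json.Marshal(map[string]any{
		"model":  "llava",
		"prompt": "What's in this image?",
		"images": []string{base64.StdEncoding.EncodeToString(img)},
		"stream": false,
	})
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}
	fmt.Println(out.Response)
}
```
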
@@ -355,6 +355,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Libraries
 
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
+- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)

SECURITY.md (new file, 25 lines)
@@ -0,0 +1,25 @@
+# Security
+
+The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
+
+## Reporting a vulnerability
+
+If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
+
+Please include the following details in your report:
+- A description of the vulnerability
+- Steps to reproduce the issue
+- Your assessment of the potential impact
+- Any possible mitigations
+
+## Security best practices
+
+While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
+
+- Regularly updating to the latest version of Ollama
+- Securing access to hosted instances of Ollama
+- Monitoring systems for unusual activity
+
+## Contact
+
+For any other questions or concerns related to security, please contact us at hello@ollama.com
@@ -267,6 +267,10 @@ type EmbedRequest struct {
 type EmbedResponse struct {
     Model      string      `json:"model"`
     Embeddings [][]float32 `json:"embeddings"`
+
+    TotalDuration   time.Duration `json:"total_duration,omitempty"`
+    LoadDuration    time.Duration `json:"load_duration,omitempty"`
+    PromptEvalCount int           `json:"prompt_eval_count,omitempty"`
 }
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
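
The new `EmbedResponse` fields surface total latency, model load time, and the number of prompt tokens evaluated. A minimal client-side sketch follows; the endpoint path and the request shape ("model" plus "input") are assumptions drawn from Ollama's public API, not from this diff, and only the response fields mirror the struct above.

```go
// Sketch: reading the new timing/count fields returned by /api/embed.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"time"
)

// embedResponse mirrors the JSON tags added in the hunk above.
type embedResponse struct {
	Model           string        `json:"model"`
	Embeddings      [][]float32   `json:"embeddings"`
	TotalDuration   time.Duration `json:"total_duration,omitempty"`
	LoadDuration    time.Duration `json:"load_duration,omitempty"`
	PromptEvalCount int           `json:"prompt_eval_count,omitempty"`
}

func main() {
	body, _ := json.Marshal(map[string]any{
		"model": "all-minilm",
		"input": []string{"why is the sky blue?", "why is grass green?"},
	})
	resp, err := http.Post("http://localhost:11434/api/embed", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var r embedResponse
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%d embeddings, %d prompt tokens, total %s (load %s)\n",
		len(r.Embeddings), r.PromptEvalCount, r.TotalDuration, r.LoadDuration)
}
```

The three new fields correspond one-to-one to the JSON tags added above.
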
@@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) {
     if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
         t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
     }
+
+    if res.PromptEvalCount != 8 {
+        t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+    }
 }
 
 func TestAllMiniLMBatchEmbed(t *testing.T) {
@@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
     if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
         t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
     }
+
+    if res.PromptEvalCount != 16 {
+        t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+    }
 }
 
 func TestAllMiniLMEmbedTruncate(t *testing.T) {

llm/ext_server/server.cpp (vendored): 7 lines changed
@@ -1221,6 +1221,7 @@ struct llama_server_context
             res.result_json = json
             {
                 {"embedding", std::vector<float>(embd, embd + n_embd)},
+                {"timings", slot.get_formated_timings()},
             };
         }
     }
@@ -3203,11 +3204,15 @@ int main(int argc, char **argv) {
 
             responses = result.result_json.value("results", std::vector<json>{result.result_json});
             json embeddings = json::array();
+
+            int prompt_n = 0;
             for (auto & elem : responses) {
                 embeddings.push_back(elem.at("embedding"));
+                prompt_n += elem.at("timings").at("prompt_n").get<int>();
             }
+
             // send the result
-            json embedding_res = json{{"embedding", embeddings}};
+            json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
             return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
         }
     });
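
With the two hunks above, the embedded server attaches per-slot timings to each embedding result and sums `prompt_n` across the batch before replying. Purely for illustration, with invented numbers, the aggregated payload can be decoded like this (the field names "embedding" and "prompt_n" come directly from the diff):

```go
// Illustration only: decoding the aggregated payload produced by the handler above.
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

func main() {
	payload := []byte(`{"embedding": [[0.01, -0.02], [0.03, 0.04]], "prompt_n": 16}`)
	var out struct {
		Embedding [][]float32 `json:"embedding"`
		PromptN   int         `json:"prompt_n"`
	}
	if err := json.Unmarshal(payload, &out); err != nil {
		log.Fatal(err)
	}
	fmt.Println(len(out.Embedding), "embeddings,", out.PromptN, "prompt tokens")
}
```
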

llm/patches/10-params.diff (new file, 20 lines)
@@ -0,0 +1,20 @@
+diff --git a/src/llama.cpp b/src/llama.cpp
+index a207451f..fba6b175 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
+         hparams.attn_soft_cap = true;
+
+         switch (hparams.n_layer) {
++            case 26: model.type = e_model::MODEL_2B; break;
+             case 42: model.type = e_model::MODEL_9B; break;
+             case 46: model.type = e_model::MODEL_27B; break;
+             default: model.type = e_model::MODEL_UNKNOWN;
+@@ -11736,6 +11737,7 @@ struct llm_build_context {
+
+         // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+         switch (model.type) {
++            case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+             case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+             case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+             default: GGML_ABORT("fatal error");
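
The new patch registers the 26-layer Gemma 2 checkpoint as a 2B model and applies the same per-head query scaling already used for the 9B variant, i.e., in the patch's own hyperparameter names,

\[
Q_{\text{cur}} \leftarrow \frac{Q_{\text{cur}}}{\sqrt{n\_\text{embd\_head\_k}}}
\]

while the 27B case keeps its \(\sqrt{n\_\text{embd} / n\_\text{head}}\) denominator; both scalings appear verbatim in the patch above.
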
@@ -33,7 +33,7 @@ type LlamaServer interface {
     Ping(ctx context.Context) error
     WaitUntilRunning(ctx context.Context) error
     Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-    Embed(ctx context.Context, input []string) ([][]float32, error)
+    Embed(ctx context.Context, input []string) (*EmbedResponse, error)
     Tokenize(ctx context.Context, content string) ([]int, error)
     Detokenize(ctx context.Context, tokens []int) (string, error)
     Close() error
@@ -879,10 +879,11 @@ type EmbedRequest struct {
 }
 
 type EmbedResponse struct {
     Embedding [][]float32 `json:"embedding"`
+    PromptEvalCount int   `json:"prompt_n"`
 }
 
-func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
+func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
     if err := s.sem.Acquire(ctx, 1); err != nil {
         slog.Error("Failed to acquire semaphore", "error", err)
         return nil, err
@@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err
         return nil, fmt.Errorf("%s", body)
     }
 
-    var embedding EmbedResponse
-    if err := json.Unmarshal(body, &embedding); err != nil {
+    var e EmbedResponse
+    if err := json.Unmarshal(body, &e); err != nil {
         return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
     }
 
-    return embedding.Embedding, nil
+    return &e, nil
 }
 
 type TokenizeRequest struct {
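
For internal callers, the `Embed` signature change is mechanical: the vectors move into the returned struct and the prompt token count rides along. A hedged sketch of an adapter follows; the helper name and import path are assumptions for illustration, not part of this diff.

```go
package server

import (
	"context"

	"github.com/ollama/ollama/llm"
)

// embedWithCount is a hypothetical helper showing how a caller adapts to the
// new *llm.EmbedResponse return value: vectors come from res.Embedding and
// the prompt token count from res.PromptEvalCount.
func embedWithCount(ctx context.Context, s llm.LlamaServer, input []string) ([][]float32, int, error) {
	res, err := s.Embed(ctx, input)
	if err != nil {
		return nil, 0, err
	}
	return res.Embedding, res.PromptEvalCount, nil
}
```
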
@@ -284,6 +284,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 }
 
 func (s *Server) EmbedHandler(c *gin.Context) {
+    checkpointStart := time.Now()
     var req api.EmbedRequest
     err := c.ShouldBindJSON(&req)
     switch {
@@ -332,6 +333,8 @@ func (s *Server) EmbedHandler(c *gin.Context) {
         return
     }
 
+    checkpointLoaded := time.Now()
+
     kvData, err := getKVData(m.ModelPath, false)
     if err != nil {
         c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -370,13 +373,16 @@ func (s *Server) EmbedHandler(c *gin.Context) {
         return
     }
 
-    for i, e := range embeddings {
-        embeddings[i] = normalize(e)
+    for i, e := range embeddings.Embedding {
+        embeddings.Embedding[i] = normalize(e)
     }
 
     resp := api.EmbedResponse{
         Model:      req.Model,
-        Embeddings: embeddings,
+        Embeddings:      embeddings.Embedding,
+        TotalDuration:   time.Since(checkpointStart),
+        LoadDuration:    checkpointLoaded.Sub(checkpointStart),
+        PromptEvalCount: embeddings.PromptEvalCount,
     }
     c.JSON(http.StatusOK, resp)
 }
@@ -428,9 +434,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
         return
     }
 
-    embedding := make([]float64, len(embeddings[0]))
+    embedding := make([]float64, len(embeddings.Embedding[0]))
 
-    for i, v := range embeddings[0] {
+    for i, v := range embeddings.Embedding[0] {
         embedding[i] = float64(v)
     }
 
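
The legacy `EmbeddingsHandler` keeps its older contract, one prompt in and a single `[]float64` vector out, which is why the handler converts the first `float32` row above. For contrast, a toy decode of that legacy response shape (values invented, endpoint path assumed to be /api/embeddings):

```go
// Toy decode of the legacy single-vector embeddings response shape.
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

func main() {
	payload := []byte(`{"embedding": [0.567040, 0.009260, -0.231489]}`)
	var out struct {
		Embedding []float64 `json:"embedding"`
	}
	if err := json.Unmarshal(payload, &out); err != nil {
		log.Fatal(err)
	}
	fmt.Println(len(out.Embedding), "dimensions")
}
```
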
@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
             } else if loadedCount == 0 {
                 // No models loaded. Load the model but prefer the best fit.
                 slog.Debug("loading first model", "model", pending.model.ModelPath)
-                g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+                g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
                 if g != nil {
                     gpus = g
+                } else {
+                    // Only allow partial loads when this is the first model
+                    gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
                 }
                 s.loadFn(pending, ggml, gpus, numParallel)
                 break
@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
             // Update free memory from currently loaded models
             s.updateFreeSpace(availGpus)
-            fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+            fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
             if fitGpus != nil {
                 slog.Debug("new model fits with existing models, loading")
                 s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
 
-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
     var estimatedVRAM uint64
 
     var numParallelToTry []int
@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
     return nil
 }
 
+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+    *numParallel = 1
+    byLibrary := gpus.ByLibrary()
+    if len(byLibrary) <= 1 {
+        return gpus
+    }
+    var bestEstimate uint64
+    var bestFit int
+    for i, gl := range byLibrary {
+        _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+        if estimatedVRAM > bestEstimate {
+            bestEstimate = estimatedVRAM
+            bestFit = i
+        }
+    }
+    return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
     s.loadedMu.Lock()
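
The scheduler change splits placement into two steps: `pickBestFullFitByLibrary` still looks for a set of GPUs that can hold the whole model, and only for the very first model does `pickBestPartialFitByLibrary` fall back to the library predicted to load the most of it. The selection itself is a simple arg-max; a standalone toy version (library names and byte counts invented for illustration) behaves like this:

```go
// Toy stand-in for the library selection in pickBestPartialFitByLibrary:
// whichever GPU library is predicted to load the most of the model wins.
package main

import "fmt"

func main() {
	estimates := map[string]uint64{ // e.g. what PredictServerFit might report
		"cuda": 6 << 30, // 6 GiB of the model would fit
		"rocm": 4 << 30, // 4 GiB would fit
	}
	var bestLib string
	var bestEstimate uint64
	for lib, est := range estimates {
		if est > bestEstimate {
			bestEstimate = est
			bestLib = lib
		}
	}
	fmt.Println("partial load on", bestLib) // -> partial load on cuda
}
```
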
@@ -666,11 +666,50 @@ func TestAlreadyCanceled(t *testing.T) {
     require.Empty(t, scenario1a.req.successCh)
 }
 
+func TestHomogeneousGPUs(t *testing.T) {
+    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+    defer done()
+    s := InitScheduler(ctx)
+
+    s.getGpuFn = func() gpu.GpuInfoList {
+        // Set memory values to require the model to be spread
+        gpus := []gpu.GpuInfo{
+            {Library: "cuda"},
+            {Library: "rocm"},
+        }
+        gpus[0].TotalMemory = 1 * format.GibiByte
+        gpus[0].FreeMemory = 256 * format.MebiByte
+        gpus[1].TotalMemory = 1 * format.GibiByte
+        gpus[1].FreeMemory = 256 * format.MebiByte
+        return gpus
+    }
+    s.getCpuFn = getCpuFn
+    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+        require.Len(t, gpus, 1)
+        return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+    }
+    slog.Info("a")
+    s.pendingReqCh <- a.req
+    require.Len(t, s.pendingReqCh, 1)
+    s.Run(ctx)
+    select {
+    case resp := <-a.req.successCh:
+        require.Equal(t, resp.llama, a.srv)
+        require.Empty(t, s.pendingReqCh)
+        require.Empty(t, a.req.errCh)
+    case err := <-a.req.errCh:
+        t.Fatal(err.Error())
+    case <-ctx.Done():
+        t.Fatal("timeout")
+    }
+}
+
 type mockLlm struct {
     pingResp        error
     waitResp        error
     completionResp  error
-    embedResp       [][]float32
+    embedResp       *llm.EmbedResponse
     embedRespErr    error
     tokenizeResp    []int
     tokenizeRespErr error
@@ -688,7 +727,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
     return s.completionResp
 }
-func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
+func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
     return s.embedResp, s.embedRespErr
 }
 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {