From 345420998e90090d2d6fba38ad5c2f3f5512adf4 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 22 Jul 2024 11:57:26 -0700
Subject: [PATCH 1/6] Prevent partial loading on mixed GPU brands

In multi-brand GPU setups, if we couldn't fully load the model we would
fall through the scheduler and mistakenly try to load across a mix of
brands. This makes sure we find the set of GPU(s) that best fits the
partial load.
---
 server/sched.go      | 31 +++++++++++++++++++++++++++----
 server/sched_test.go | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 2daed3ab..92b8d508 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
             } else if loadedCount == 0 {
                 // No models loaded. Load the model but prefer the best fit.
                 slog.Debug("loading first model", "model", pending.model.ModelPath)
-                g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+                g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
                 if g != nil {
                     gpus = g
+                } else {
+                    // Only allow partial loads when this is the first model
+                    gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
                 }
                 s.loadFn(pending, ggml, gpus, numParallel)
                 break
@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                 // Update free memory from currently loaded models
                 s.updateFreeSpace(availGpus)

-                fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+                fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
                 if fitGpus != nil {
                     slog.Debug("new model fits with existing models, loading")
                     s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
     var estimatedVRAM uint64

     var numParallelToTry []int
@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
     return nil
 }

+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+    *numParallel = 1
+    byLibrary := gpus.ByLibrary()
+    if len(byLibrary) <= 1 {
+        return gpus
+    }
+    var bestEstimate uint64
+    var bestFit int
+    for i, gl := range byLibrary {
+        _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+        if estimatedVRAM > bestEstimate {
+            bestEstimate = estimatedVRAM
+            bestFit = i
+        }
+    }
+    return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
     s.loadedMu.Lock()
diff --git a/server/sched_test.go b/server/sched_test.go
index 9ddd1fab..a186ce0e 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
     require.Empty(t, scenario1a.req.successCh)
 }

+func TestHomogeneousGPUs(t *testing.T) {
+    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+    defer done()
+    s := InitScheduler(ctx)
+
+    s.getGpuFn = func() gpu.GpuInfoList {
+        // Set memory values to require the model to be spread
+        gpus := []gpu.GpuInfo{
+            {Library: "cuda"},
+            {Library: "rocm"},
+        }
+        gpus[0].TotalMemory = 1 * format.GibiByte
+        gpus[0].FreeMemory = 256 * format.MebiByte
+        gpus[1].TotalMemory = 1 * format.GibiByte
+        gpus[1].FreeMemory = 256 * format.MebiByte
+        return gpus
+    }
+    s.getCpuFn = getCpuFn
+    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+        require.Len(t, gpus, 1)
+        return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+    }
+    slog.Info("a")
+    s.pendingReqCh <- a.req
+    require.Len(t, s.pendingReqCh, 1)
+    s.Run(ctx)
+    select {
+    case resp := <-a.req.successCh:
+        require.Equal(t, resp.llama, a.srv)
+        require.Empty(t, s.pendingReqCh)
+        require.Empty(t, a.req.errCh)
+    case err := <-a.req.errCh:
+        t.Fatal(err.Error())
+    case <-ctx.Done():
+        t.Fatal("timeout")
+    }
+}
+
 type mockLlm struct {
     pingResp          error
     waitResp          error
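A minimal, self-contained Go sketch of the selection idea above, for illustration only: group the detected GPUs by library (brand), score each single-library group with a predicted fit, and keep only the best group for the partial load. GpuInfo, byLibrary, and predictFit here are simplified stand-ins for gpu.GpuInfo, gpu.GpuInfoList.ByLibrary, and llm.PredictServerFit, not the real implementations.

```go
package main

import "fmt"

// GpuInfo is a stand-in for gpu.GpuInfo with just the fields this sketch needs.
type GpuInfo struct {
	Library    string
	FreeMemory uint64
}

// byLibrary groups GPUs by brand so a single load never spans libraries.
func byLibrary(gpus []GpuInfo) map[string][]GpuInfo {
	groups := map[string][]GpuInfo{}
	for _, g := range gpus {
		groups[g.Library] = append(groups[g.Library], g)
	}
	return groups
}

// predictFit stands in for llm.PredictServerFit: here it just sums free VRAM,
// whereas the real function estimates how much of the model would load.
func predictFit(gpus []GpuInfo) uint64 {
	var total uint64
	for _, g := range gpus {
		total += g.FreeMemory
	}
	return total
}

// pickBestPartialFit keeps only the single-library group with the best predicted fit.
func pickBestPartialFit(gpus []GpuInfo) []GpuInfo {
	var best []GpuInfo
	var bestEstimate uint64
	for _, group := range byLibrary(gpus) {
		if est := predictFit(group); est > bestEstimate {
			bestEstimate, best = est, group
		}
	}
	return best
}

func main() {
	gpus := []GpuInfo{
		{Library: "cuda", FreeMemory: 6 << 30},
		{Library: "rocm", FreeMemory: 8 << 30},
	}
	// With mixed brands present, only the rocm group is offered for the partial load.
	fmt.Println(pickBestPartialFit(gpus))
}
```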
From 1b44d873e74f62de4f53f154da386919c1426f8b Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Tue, 30 Jul 2024 13:12:21 -0700
Subject: [PATCH 2/6] Add Metrics to `api/embed` response (#5709)

* add prompt tokens to embed response
* rm slog
* metrics
* types
* prompt n
* clean up
* reset submodule
* update tests
* test name
* list metrics
---
 api/types.go              |  4 ++++
 integration/embed_test.go |  8 ++++++++
 llm/ext_server/server.cpp |  7 ++++++-
 llm/server.go             | 13 +++++++------
 server/routes.go          | 18 ++++++++++++------
 server/sched_test.go      |  4 ++--
 6 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/api/types.go b/api/types.go
index ea5161ff..c2529652 100644
--- a/api/types.go
+++ b/api/types.go
@@ -267,6 +267,10 @@ type EmbedRequest struct {
 type EmbedResponse struct {
     Model      string      `json:"model"`
     Embeddings [][]float32 `json:"embeddings"`
+
+    TotalDuration   time.Duration `json:"total_duration,omitempty"`
+    LoadDuration    time.Duration `json:"load_duration,omitempty"`
+    PromptEvalCount int           `json:"prompt_eval_count,omitempty"`
 }

 // EmbeddingRequest is the request passed to [Client.Embeddings].
diff --git a/integration/embed_test.go b/integration/embed_test.go
index 61b36fa2..10333d5d 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) {
     if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
         t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
     }
+
+    if res.PromptEvalCount != 8 {
+        t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+    }
 }

 func TestAllMiniLMBatchEmbed(t *testing.T) {
@@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
     if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
         t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
     }
+
+    if res.PromptEvalCount != 16 {
+        t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+    }
 }

 func TestAllMiniLMEmbedTruncate(t *testing.T) {
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 0d51460c..d72bb1b1 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1221,6 +1221,7 @@ struct llama_server_context
                 res.result_json = json
                 {
                     {"embedding", std::vector<float>(embd, embd + n_embd)},
+                    {"timings", slot.get_formated_timings()},
                 };
             }
         }
@@ -3203,11 +3204,15 @@ int main(int argc, char **argv) {
             responses = result.result_json.value("results", std::vector<json>{result.result_json});

             json embeddings = json::array();
+
+            int prompt_n = 0;
             for (auto & elem : responses) {
                 embeddings.push_back(elem.at("embedding"));
+                prompt_n += elem.at("timings").at("prompt_n").get<int>();
             }
+
             // send the result
-            json embedding_res = json{{"embedding", embeddings}};
+            json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
             return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
         }
     });
diff --git a/llm/server.go b/llm/server.go
index 8127960f..afde077e 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -33,7 +33,7 @@ type LlamaServer interface {
     Ping(ctx context.Context) error
     WaitUntilRunning(ctx context.Context) error
     Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-    Embed(ctx context.Context, input []string) ([][]float32, error)
+    Embed(ctx context.Context, input []string) (*EmbedResponse, error)
     Tokenize(ctx context.Context, content string) ([]int, error)
     Detokenize(ctx context.Context, tokens []int) (string, error)
     Close() error
@@ -879,10 +879,11 @@ type EmbedRequest struct {
 }

 type EmbedResponse struct {
-    Embedding [][]float32 `json:"embedding"`
+    Embedding       [][]float32 `json:"embedding"`
+    PromptEvalCount int         `json:"prompt_n"`
 }

-func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
+func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
     if err := s.sem.Acquire(ctx, 1); err != nil {
         slog.Error("Failed to acquire semaphore", "error", err)
         return nil, err
@@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err
         return nil, fmt.Errorf("%s", body)
     }

-    var embedding EmbedResponse
-    if err := json.Unmarshal(body, &embedding); err != nil {
+    var e EmbedResponse
+    if err := json.Unmarshal(body, &e); err != nil {
         return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
     }

-    return embedding.Embedding, nil
+    return &e, nil
 }

 type TokenizeRequest struct {
diff --git a/server/routes.go b/server/routes.go
index e6ffe526..a560f369 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -284,6 +284,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 }

 func (s *Server) EmbedHandler(c *gin.Context) {
+    checkpointStart := time.Now()
     var req api.EmbedRequest
     err := c.ShouldBindJSON(&req)
     switch {
@@ -332,6 +333,8 @@ func (s *Server) EmbedHandler(c *gin.Context) {
         return
     }

+    checkpointLoaded := time.Now()
+
     kvData, err := getKVData(m.ModelPath, false)
     if err != nil {
         c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -370,13 +373,16 @@ func (s *Server) EmbedHandler(c *gin.Context) {
         return
     }

-    for i, e := range embeddings {
-        embeddings[i] = normalize(e)
+    for i, e := range embeddings.Embedding {
+        embeddings.Embedding[i] = normalize(e)
     }

     resp := api.EmbedResponse{
-        Model:      req.Model,
-        Embeddings: embeddings,
+        Model:           req.Model,
+        Embeddings:      embeddings.Embedding,
+        TotalDuration:   time.Since(checkpointStart),
+        LoadDuration:    checkpointLoaded.Sub(checkpointStart),
+        PromptEvalCount: embeddings.PromptEvalCount,
     }
     c.JSON(http.StatusOK, resp)
 }
@@ -428,9 +434,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
         return
     }

-    embedding := make([]float64, len(embeddings[0]))
-
-    for i, v := range embeddings[0] {
+    embedding := make([]float64, len(embeddings.Embedding[0]))
+
+    for i, v := range embeddings.Embedding[0] {
         embedding[i] = float64(v)
     }

diff --git a/server/sched_test.go b/server/sched_test.go
index a186ce0e..4f8789fa 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -709,7 +709,7 @@ type mockLlm struct {
     pingResp          error
     waitResp          error
     completionResp    error
-    embedResp         [][]float32
+    embedResp         *llm.EmbedResponse
     embedRespErr      error
     tokenizeResp      []int
     tokenizeRespErr   error
@@ -727,7 +727,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
     return s.completionResp
 }
-func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
+func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
     return s.embedResp, s.embedRespErr
 }
 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
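A usage sketch for the new embed metrics, assuming the handler above is mounted at POST /api/embed and that the request accepts model and input fields; the embedResponse struct below simply mirrors the fields added to api.EmbedResponse in this patch rather than importing the real package.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// embedResponse mirrors the api.EmbedResponse fields added above; durations
// arrive as nanosecond integers, so time.Duration decodes them directly.
type embedResponse struct {
	Model           string        `json:"model"`
	Embeddings      [][]float32   `json:"embeddings"`
	TotalDuration   time.Duration `json:"total_duration"`
	LoadDuration    time.Duration `json:"load_duration"`
	PromptEvalCount int           `json:"prompt_eval_count"`
}

func main() {
	// Assumed request shape: a model name plus one or more inputs.
	body, err := json.Marshal(map[string]any{
		"model": "all-minilm",
		"input": []string{"why is the sky blue?"},
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/embed", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out embedResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}

	// The new fields report timing and token accounting alongside the vectors.
	fmt.Printf("embeddings=%d prompt_tokens=%d total=%s load=%s\n",
		len(out.Embeddings), out.PromptEvalCount, out.TotalDuration, out.LoadDuration)
}
```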
From afa8d6e9d56da834a03df7817d065f6c8b46e102 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Tue, 30 Jul 2024 18:06:26 -0700
Subject: [PATCH 3/6] patch gemma support

---
 llm/patches/10-params.diff | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 llm/patches/10-params.diff

diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff
new file mode 100644
index 00000000..56699b8e
--- /dev/null
+++ b/llm/patches/10-params.diff
@@ -0,0 +1,20 @@
+diff --git a/src/llama.cpp b/src/llama.cpp
+index a207451f..fba6b175 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
+         hparams.attn_soft_cap = true;
+ 
+         switch (hparams.n_layer) {
++            case 26: model.type = e_model::MODEL_2B; break;
+             case 42: model.type = e_model::MODEL_9B; break;
+             case 46: model.type = e_model::MODEL_27B; break;
+             default: model.type = e_model::MODEL_UNKNOWN;
+@@ -11736,6 +11737,7 @@ struct llm_build_context {
+ 
+                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+                 switch (model.type) {
++                    case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+                     case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+                     default: GGML_ABORT("fatal error");

From 5d6657835669064fa9658e6712b01887a072c606 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Tue, 30 Jul 2024 18:08:34 -0700
Subject: [PATCH 4/6] Update README.md

Better example for multi-modal input
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 824b3761..0593a785 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 ### Multimodal models
 
 ```
->>> What's in this image? /Users/jmorgan/Desktop/smile.png
+ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```
 

From 3579b4966a9b21e048db4f7610e3f9f4a5c4dc64 Mon Sep 17 00:00:00 2001
From: Michael
Date: Tue, 30 Jul 2024 18:40:09 -0700
Subject: [PATCH 5/6] Update README to include Firebase Genkit (#6083)

Firebase Genkit
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0593a785..941a4f99 100644
--- a/README.md
+++ b/README.md
@@ -337,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Libraries
 
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
+- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)

From 463a8aa2731a9fe5258c6c7e1466f3dae27f0c6a Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Tue, 30 Jul 2024 21:01:12 -0700
Subject: [PATCH 6/6] Create SECURITY.md

---
 SECURITY.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 SECURITY.md

diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 00000000..d38bb7c4
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,25 @@
+# Security
+
+The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
+
+## Reporting a vulnerability
+
+If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
+
+Please include the following details in your report:
+- A description of the vulnerability
+- Steps to reproduce the issue
+- Your assessment of the potential impact
+- Any possible mitigations
+
+## Security best practices
+
+While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
+
+- Regularly updating to the latest version of Ollama
+- Securing access to hosted instances of Ollama
+- Monitoring systems for unusual activity
+
+## Contact
+
+For any other questions or concerns related to security, please contact us at hello@ollama.com