From 2cc7d050124929ae4745633fddf053585a22f0a2 Mon Sep 17 00:00:00 2001
From: Michael
Date: Thu, 27 Jun 2024 12:45:16 -0400
Subject: [PATCH 1/6] update readme for gemma 2 (#5333)

* update readme for gemma 2
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 97862573..72ed8fa5 100644
--- a/README.md
+++ b/README.md
@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
 | Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
 | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
 | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
-| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
-| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
+| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
 | Mistral | 7B | 4.1GB | `ollama run mistral` |
 | Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
 | Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |

From de2163dafd19b5ba2bed3d459354179662cc524d Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 27 Jun 2024 10:52:25 -0700
Subject: [PATCH 2/6] gemma2 graph

---
 llm/ggml.go | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/llm/ggml.go b/llm/ggml.go
index d0d0b6dd..cfead450 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -366,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
             4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
         )
     }
-    case "gemma":
-        fullOffload = 4 * batch * (embedding + vocab)
-        partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+    case "gemma", "gemma2":
+        fullOffload = max(
+            4*batch*(embedding+vocab),
+            4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
+        )
+
+        partialOffload = max(
+            4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
+            4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+                4*embeddingHeadsK*context*8+
+                embedding*embeddingHeadsK*heads*9/16,
+        )
     case "command-r":
         fullOffload = max(
             4*batch*(embedding+vocab),

From 6d4219083c56ec4b031f0fda67e9ef2c09ad9888 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:58:14 -0700
Subject: [PATCH 3/6] Update docs (#5312)

---
 docs/openai.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/openai.md b/docs/openai.md
index 59e7d640..81b967eb 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
 
 #### Notes
 
-- `finish_reason` will always be `stop`
 - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
 
 ## Models
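For orientation only (this aside is not part of the patch series): PATCH 2/6 replaces the single-expression gemma estimate with the larger of two closed-form terms for both the full- and partial-offload graph sizes. A self-contained Go sketch of that arithmetic, evaluated with made-up hyperparameter values rather than anything read from a real GGUF, might look like this:

package main

import "fmt"

// Illustrative only: evaluates the two gemma/gemma2 graph-size expressions
// from PATCH 2/6 with hypothetical hyperparameters (requires Go 1.21+ for
// the built-in max).
func main() {
	var (
		context         uint64 = 8192   // assumed context length
		batch           uint64 = 512    // assumed batch size
		embedding       uint64 = 3584   // assumed embedding width
		heads           uint64 = 16     // assumed attention head count
		vocab           uint64 = 256000 // assumed vocabulary size
		embeddingHeadsK uint64 = 256    // assumed key head dimension
	)

	fullOffload := max(
		4*batch*(embedding+vocab),
		4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
	)

	partialOffload := max(
		4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
		4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
			4*embeddingHeadsK*context*8+
			embedding*embeddingHeadsK*heads*9/16,
	)

	fmt.Printf("fullOffload    ~ %.2f GiB\n", float64(fullOffload)/(1<<30))
	fmt.Printf("partialOffload ~ %.2f GiB\n", float64(partialOffload)/(1<<30))
}

In the actual GraphSize method these inputs come from the GGUF key-value metadata rather than literals; the sketch only shows the shape of the estimate.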
From b910fa90101038d09ca9cbbea16701831fafaffb Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Fri, 28 Jun 2024 11:30:16 -0700
Subject: [PATCH 4/6] Ollama Show: Check for Projector Type (#5307)

* Check exists projtype

* Maintain Ordering
---
 cmd/cmd.go | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 909e8e4b..debb3921 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -672,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
         projectorData := [][]string{
             {"arch", "clip"},
             {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-            {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
-            {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-            {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
         }
 
+        if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
+            projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
+        }
+
+        projectorData = append(projectorData,
+            []string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+            []string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+        )
+
         mainTableData = append(mainTableData,
             []string{"Projector"},
             []string{renderSubTable(projectorData, false)},

From 5f034f5b63cab3a5eb61104118727b088cceea21 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Fri, 28 Jun 2024 13:15:52 -0700
Subject: [PATCH 5/6] Include Show Info in Interactive (#5342)

---
 cmd/cmd.go         | 24 +++++++++++-------------
 cmd/interactive.go | 10 +---------
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index debb3921..c898c7db 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -624,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
         return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
     }
 
-    if flagsSet == 1 {
-        req := api.ShowRequest{Name: args[0]}
-        resp, err := client.Show(cmd.Context(), &req)
-        if err != nil {
-            return err
-        }
+    req := api.ShowRequest{Name: args[0]}
+    resp, err := client.Show(cmd.Context(), &req)
+    if err != nil {
+        return err
+    }
 
+    if flagsSet == 1 {
         switch showType {
         case "license":
             fmt.Println(resp.License)
@@ -647,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
         return nil
     }
 
-    req := api.ShowRequest{Name: args[0]}
-    resp, err := client.Show(cmd.Context(), &req)
-    if err != nil {
-        return err
-    }
+    showInfo(resp)
 
+    return nil
+}
+
+func showInfo(resp *api.ShowResponse) {
     arch := resp.ModelInfo["general.architecture"].(string)
 
     modelData := [][]string{
@@ -711,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
     }
 
     table.Render()
-
-    return nil
 }
 
 func renderSubTable(data [][]string, file bool) string {
diff --git a/cmd/interactive.go b/cmd/interactive.go
index 0a2f429b..9214f2db 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -404,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 
             switch args[1] {
             case "info":
-                fmt.Println("Model details:")
-                if len(resp.Details.Families) > 0 {
-                    fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
-                } else if resp.Details.Family != "" {
-                    fmt.Printf("Family %s\n", resp.Details.Family)
-                }
-                fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
-                fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
-                fmt.Println("")
+                showInfo(resp)
             case "license":
                 if resp.License == "" {
                     fmt.Println("No license was specified for this model.")
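As a side note (again not part of the series): PATCH 4/6 works because Go's two-value map index reports whether a key is present, so a missing `clip.projector_type` no longer reaches the string type assertion and panics. A standalone sketch of the same pattern over a hypothetical metadata map:

package main

import "fmt"

// Illustrative only: mirrors the comma-ok guard from PATCH 4/6 on a made-up
// metadata map. Missing or wrongly-typed keys are skipped instead of panicking.
func main() {
	info := map[string]any{
		"general.parameter_count":      312_000_000.0,
		"clip.vision.embedding_length": 1024.0,
		// "clip.projector_type" is deliberately absent
	}

	rows := [][]string{{"arch", "clip"}}

	if v, ok := info["clip.projector_type"]; ok {
		if s, ok := v.(string); ok {
			rows = append(rows, []string{"projector type", s})
		}
	}

	rows = append(rows,
		[]string{"embedding length", fmt.Sprintf("%v", info["clip.vision.embedding_length"].(float64))},
	)

	for _, row := range rows {
		fmt.Printf("%-18s %s\n", row[0], row[1])
	}
}

Appending the remaining rows after the guard, rather than listing them inside the original literal, keeps the projector type between the parameter count and the embedding length whenever it exists, which is presumably what the "Maintain Ordering" commit note refers to.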
From 717f7229eb4f9220d4070aae617923950643d327 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Fri, 28 Jun 2024 19:39:31 -0700
Subject: [PATCH 6/6] Do not shift context for sliding window models (#5368)

* Do not shift context for sliding window models

* truncate prompt > 2/3 tokens

* only target gemma2
---
 llm/ext_server/server.cpp | 46 +++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 492126a4..3bc01252 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1650,26 +1650,41 @@ struct llama_server_context
             }
             slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
+            char buf[256];
+            llama_model_meta_val_str(model, "general.architecture", buf, 256);
+            bool gemma2 = strcmp(buf, "gemma2") == 0;
+
+            int32_t truncate_at = slot.n_ctx;
+
+            // truncate at 2/3 of the context length for gemma2 models
+            // as they do not support context shifts (from the sliding window implementation).
+            // this way, prompts that almost fit the context length can still generate a full
+            // response without a sudden stop from hitting the context limit
+            if (gemma2) {
+                truncate_at = 2 * slot.n_ctx / 3;
+            }
+
             // if input prompt is too big, truncate it, if group attention self-extend is disabled
-            if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
+            if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
             {
                 const int n_left = slot.n_ctx - slot.params.n_keep;
-                const int n_block_size = n_left / 2;
-                const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+                const int n_shift = n_left / 2;
+                const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
 
                 std::vector<llama_token> new_tokens(
                     prompt_tokens.begin(),
                     prompt_tokens.begin() + slot.params.n_keep);
                 new_tokens.insert(
                     new_tokens.end(),
-                    prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                    prompt_tokens.begin() + slot.params.n_keep + n_erase,
                     prompt_tokens.end());
 
-                LOG_VERBOSE("input truncated", {
-                    {"n_ctx", slot.n_ctx},
-                    {"n_keep", slot.params.n_keep},
-                    {"n_left", n_left},
-                    {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+                LOG_INFO("input truncated", {
+                    {"n_ctx", slot.n_ctx},
+                    {"n_keep", slot.params.n_keep},
+                    {"n_left", n_left},
+                    {"n_shift", n_shift},
+                    {"n_erase", n_erase},
                 });
                 slot.truncated = true;
                 prompt_tokens = new_tokens;
@@ -1678,6 +1693,19 @@ struct llama_server_context
                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
             }
 
+            // Models with sliding window attention do not work with context shifts, so
+            // limit their prediction to the context length
+            if (gemma2) {
+                int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
+                slot.n_predict = limit;
+                slot.params.n_predict = limit;
+                LOG_INFO("model does not support sliding window, limiting generation", {
+                    {"n_ctx", slot.n_ctx},
+                    {"n_prompt_tokens", slot.n_prompt_tokens},
+                    {"n_predict", slot.n_predict}
+                });
+            }
+
             if (!slot.params.cache_prompt)
             {
                 llama_sampling_reset(slot.ctx_sampling);
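To make the numbers in PATCH 6/6 concrete (this closing aside is not part of the patch, whose real implementation is the C++ above): once a sliding-window-only model's prompt crosses 2/3 of the context window, the server keeps the first n_keep tokens, drops the next n_erase tokens, and keeps the remainder. A small Go sketch with made-up values:

package main

import "fmt"

// Illustrative only: the truncation arithmetic from PATCH 6/6 with
// hypothetical numbers; the real code operates on the slot's token vector.
func main() {
	const (
		nCtx        = 8192 // assumed context window
		nKeep       = 4    // assumed n_keep
		nPrompt     = 7000 // assumed prompt length in tokens
		slidingOnly = true // e.g. a gemma2 model
	)

	truncateAt := nCtx
	if slidingOnly {
		// leave roughly a third of the window free so generation can
		// finish without ever needing a context shift
		truncateAt = 2 * nCtx / 3
	}

	if nPrompt >= truncateAt {
		nLeft := nCtx - nKeep
		nShift := nLeft / 2
		nErase := nPrompt - nKeep - nShift
		kept := nPrompt - nErase

		fmt.Printf("truncate_at=%d n_left=%d n_shift=%d n_erase=%d kept=%d\n",
			truncateAt, nLeft, nShift, nErase, kept)
		// with these numbers: truncate_at=5461 n_left=8188 n_shift=4094 n_erase=2902 kept=4098
	}
}

After truncation the prompt occupies roughly half the window, and the patch's second hunk caps n_predict at the space that remains, so a gemma2 slot never has to shift its context during generation.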