Compare commits

...

9 Commits

Author SHA1 Message Date
likelovewant
1c648e512e remove code to support igpu 2024-06-29 22:32:45 +08:00
likelovewant
159dcaa93b Merge branch 'ollama:main' into main 2024-06-29 20:59:45 +08:00
Jeffrey Morgan
717f7229eb Do not shift context for sliding window models (#5368)
* Do not shift context for sliding window models

* truncate prompt > 2/3 tokens

* only target gemma2
2024-06-28 19:39:31 -07:00
royjhan
5f034f5b63 Include Show Info in Interactive (#5342) 2024-06-28 13:15:52 -07:00
royjhan
b910fa9010 Ollama Show: Check for Projector Type (#5307)
* Check exists projtype

* Maintain Ordering
2024-06-28 11:30:16 -07:00
royjhan
6d4219083c Update docs (#5312) 2024-06-28 09:58:14 -07:00
Michael Yang
1ed4f521c4 Merge pull request #5340 from ollama/mxyng/mem
gemma2 graph
2024-06-27 14:26:49 -07:00
Michael Yang
de2163dafd gemma2 graph 2024-06-27 13:34:52 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
7 changed files with 77 additions and 45 deletions

View File

@@ -71,8 +71,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
- | Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
- | Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
+ | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+ | Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
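
The rows above pair each model tag with its `ollama run` command. As a minimal sketch, the same gemma2 tag added in this change can be driven from Go through the repository's `api` client package, assuming `ClientFromEnvironment` and the streaming `Generate` callback behave as in upstream ollama at this point in history; the prompt and error handling are illustrative only.

```go
// Minimal sketch: invoking one of the models from the table above through the
// Go API client instead of `ollama run`. Assumes a local server on the default
// port and that the gemma2 tag has been pulled.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST, defaults to 127.0.0.1:11434
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "gemma2", // 9B entry from this README change
		Prompt: "Why is the sky blue?",
	}

	// The callback receives streamed chunks; print them as they arrive.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```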

View File

@@ -624,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
}
- if flagsSet == 1 {
req := api.ShowRequest{Name: args[0]}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return err
}
+ if flagsSet == 1 {
switch showType {
case "license":
fmt.Println(resp.License)
@@ -647,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return nil
}
- req := api.ShowRequest{Name: args[0]}
- resp, err := client.Show(cmd.Context(), &req)
- if err != nil {
- return err
- }
+ showInfo(resp)
+ return nil
+ }
+ func showInfo(resp *api.ShowResponse) {
arch := resp.ModelInfo["general.architecture"].(string)
modelData := [][]string{
@@ -672,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
projectorData := [][]string{
{"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
}
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
}
projectorData = append(projectorData,
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
)
mainTableData = append(mainTableData,
[]string{"Projector"},
[]string{renderSubTable(projectorData, false)},
@@ -705,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
}
table.Render()
- return nil
}
func renderSubTable(data [][]string, file bool) string {
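
The guarded lookup added above avoids a panic when older projector metadata lacks the `clip.projector_type` key: indexing a `map[string]any` for a missing key yields nil, and the unconditional `.(string)` assertion in the removed lines would panic on it. Below is a standalone sketch of that comma-ok pattern, with illustrative keys and values rather than the repository's exact types.

```go
// Sketch of the guarded lookup used above for "clip.projector_type": the row
// is only appended when the metadata actually contains the key, so models
// without a projector type no longer trip the type assertion.
package main

import "fmt"

func projectorRows(info map[string]any) [][]string {
	rows := [][]string{{"arch", "clip"}}

	// comma-ok guard: skip the row entirely if the key is missing
	if v, ok := info["clip.projector_type"]; ok {
		rows = append(rows, []string{"projector type", v.(string)})
	}

	rows = append(rows,
		[]string{"embedding length", fmt.Sprintf("%v", info["clip.vision.embedding_length"])},
		[]string{"projection dimensionality", fmt.Sprintf("%v", info["clip.vision.projection_dim"])},
	)
	return rows
}

func main() {
	// metadata without a projector type: no panic, the row is simply omitted
	fmt.Println(projectorRows(map[string]any{
		"clip.vision.embedding_length": 1152.0,
		"clip.vision.projection_dim":   2048.0,
	}))
}
```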

View File

@@ -404,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] {
case "info":
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
showInfo(resp)
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")

View File

@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
#### Notes
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
## Models
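
The notes above describe two quirks of the OpenAI-compatible endpoint. As a hedged sketch, the Go program below exercises that endpoint and reads exactly those fields; the request and response structs model only the standard chat-completions fields used here, not the full schema.

```go
// Sketch of calling the OpenAI-compatible endpoint referenced above and
// inspecting the fields the notes mention (finish_reason, usage).
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

type chatResponse struct {
	Choices []struct {
		FinishReason string `json:"finish_reason"`
	} `json:"choices"`
	Usage struct {
		PromptTokens     int `json:"prompt_tokens"`
		CompletionTokens int `json:"completion_tokens"`
	} `json:"usage"`
}

func main() {
	body, _ := json.Marshal(map[string]any{
		"model":    "gemma2",
		"messages": []map[string]string{{"role": "user", "content": "Hello!"}},
	})

	resp, err := http.Post("http://localhost:11434/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}

	// Per the notes: finish_reason is "stop", and prompt_tokens may be 0 when
	// prompt evaluation was served from cache.
	if len(out.Choices) > 0 {
		fmt.Println("finish_reason:", out.Choices[0].FinishReason)
	}
	fmt.Println("prompt_tokens:", out.Usage.PromptTokens)
}
```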

View File

@@ -93,7 +93,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
//}
if gfxOverride == "" {
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
//slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
continue
@@ -109,10 +109,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
}
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
- if totalMemory < IGPUMemLimit {
- slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
- continue
- }
+ //if totalMemory < IGPUMemLimit {
+ // slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
+ // continue
+ //}
// TODO revisit this once ROCm v6 is available on windows.
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
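
Both disabled checks above follow the same pattern: skip a GPU unless an environment override or a memory threshold says it should be offered to the ROCm runner. The standalone sketch below shows that filtering logic; the function name, the supported-target list, and the 1 GiB limit are illustrative rather than the repository's exact values (upstream defines its own `IGPUMemLimit`).

```go
// Sketch of the two guards the fork comments out above: the supported
// gfx-target check (bypassed when HSA_OVERRIDE_GFX_VERSION is set) and the
// iGPU VRAM threshold.
package main

import (
	"fmt"
	"os"
	"slices"
)

const igpuMemLimit = 1 << 30 // ~1 GiB: below this, treat the device as an iGPU (illustrative)

func usable(gfx string, totalMemory uint64, supported []string) bool {
	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")

	// Upstream behavior: without an override, unknown gfx targets are skipped.
	if gfxOverride == "" && !slices.Contains(supported, gfx) {
		return false
	}

	// Upstream behavior: small-memory devices are assumed to be iGPUs and skipped.
	// The fork disables this so iGPUs are offered to the ROCm runner anyway.
	if totalMemory < igpuMemLimit {
		return false
	}
	return true
}

func main() {
	supported := []string{"gfx1030", "gfx1100", "gfx1101", "gfx1102"}
	fmt.Println(usable("gfx1035", 512<<20, supported)) // iGPU-like device: false upstream
}
```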

View File

@@ -1650,26 +1650,41 @@ struct llama_server_context
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
+ char buf[256];
+ llama_model_meta_val_str(model, "general.architecture", buf, 256);
+ bool gemma2 = strcmp(buf, "gemma2") == 0;
+ int32_t truncate_at = slot.n_ctx;
+ // truncate at 2/3 of the context length for gemma2 models
+ // as they do not support context shifts (from the sliding window implementation).
+ // this way, prompts that almost fit the context length can still generate a full
+ // response without a sudden stop from hitting the context limit
+ if (gemma2) {
+ truncate_at = 2 * slot.n_ctx / 3;
+ }
// if input prompt is too big, truncate it, if group attention self-extend is disabled
- if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
+ if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
- const int n_block_size = n_left / 2;
- const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+ const int n_shift = n_left / 2;
+ const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
std::vector<llama_token> new_tokens(
prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(
new_tokens.end(),
- prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+ prompt_tokens.begin() + slot.params.n_keep + n_erase,
prompt_tokens.end());
- LOG_VERBOSE("input truncated", {
+ LOG_INFO("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
- {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+ {"n_shift", n_shift},
+ {"n_erase", n_erase},
});
slot.truncated = true;
prompt_tokens = new_tokens;
@@ -1678,6 +1693,19 @@ struct llama_server_context
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
}
+ // Models with sliding window attention do not work with context shifts, so
+ // limit their prediction to the context length
+ if (gemma2) {
+ int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
+ slot.n_predict = limit;
+ slot.params.n_predict = limit;
+ LOG_INFO("model does not support sliding window, limiting generation", {
+ {"n_ctx", slot.n_ctx},
+ {"n_prompt_tokens", slot.n_prompt_tokens},
+ {"n_predict", slot.n_predict}
+ });
+ }
if (!slot.params.cache_prompt)
{
llama_sampling_reset(slot.ctx_sampling);
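
To make the arithmetic in these two hunks concrete, here is a Go transcription of the gemma2 path (group-attention self-extend disabled): truncate once the prompt reaches 2/3 of the context, keep the first `n_keep` tokens plus the most recent tail, then cap generation at whatever context remains. The function and the sample numbers are illustrative, not code from the repository.

```go
// Go transcription of the truncation arithmetic above for a sliding-window
// model: no context shift is possible, so leave headroom up front and cap
// n_predict afterwards.
package main

import "fmt"

func truncateForSlidingWindow(prompt []int, nCtx, nKeep int) (kept []int, nPredict int) {
	truncateAt := 2 * nCtx / 3 // truncate at 2/3 of the context length

	if len(prompt) >= truncateAt {
		nLeft := nCtx - nKeep
		nShift := nLeft / 2
		nErase := len(prompt) - nKeep - nShift

		// first nKeep tokens + everything after the erased middle block
		kept = append(kept, prompt[:nKeep]...)
		kept = append(kept, prompt[nKeep+nErase:]...)
	} else {
		kept = prompt
	}

	// limit prediction to what still fits, since the cache cannot be shifted
	nPredict = nCtx - len(kept)
	return kept, nPredict
}

func main() {
	prompt := make([]int, 7000) // token ids; contents irrelevant here
	kept, nPredict := truncateForSlidingWindow(prompt, 8192, 4)

	fmt.Println("kept tokens:", len(kept)) // 4 + (8192-4)/2 = 4098
	fmt.Println("n_predict:", nPredict)    // 8192 - 4098 = 4094
}
```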

View File

@@ -366,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
case "gemma":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
case "gemma", "gemma2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
)
partialOffload = max(
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),