Compare commits


17 Commits

Author SHA1 Message Date
likelovewant
1c648e512e remove code to support igpu 2024-06-29 22:32:45 +08:00
likelovewant
159dcaa93b Merge branch 'ollama:main' into main 2024-06-29 20:59:45 +08:00
Jeffrey Morgan
717f7229eb Do not shift context for sliding window models (#5368)
* Do not shift context for sliding window models

* truncate prompt > 2/3 tokens

* only target gemma2
2024-06-28 19:39:31 -07:00
royjhan
5f034f5b63 Include Show Info in Interactive (#5342) 2024-06-28 13:15:52 -07:00
royjhan
b910fa9010 Ollama Show: Check for Projector Type (#5307)
* Check exists projtype

* Maintain Ordering
2024-06-28 11:30:16 -07:00
royjhan
6d4219083c Update docs (#5312) 2024-06-28 09:58:14 -07:00
Michael Yang
1ed4f521c4 Merge pull request #5340 from ollama/mxyng/mem
gemma2 graph
2024-06-27 14:26:49 -07:00
Michael Yang
de2163dafd gemma2 graph 2024-06-27 13:34:52 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
likelovewant
b5286d46dc Update gen_windows.ps1 2024-06-27 12:55:18 +08:00
likelovewant
d5fd3ae7ea Merge branch 'ollama:main' into main 2024-06-27 12:44:25 +08:00
Michael Yang
123a722a6f zip: prevent extracting files into parent dirs (#5314) 2024-06-26 21:38:21 -07:00
Jeffrey Morgan
4d311eb731 llm: architecture patch (#5316) 2024-06-26 21:38:12 -07:00
likelovewant
0fc2f9c5f2 Merge branch 'ollama:main' into main 2024-06-25 19:22:17 +08:00
likelovewant
7ef869f2dc Update gen_windows.ps1 2024-06-25 19:21:02 +08:00
Blake Mizerany
cb42e607c5 llm: speed up gguf decoding by a lot (#5246)
Previously, loading GGUF files and their metadata and tensor information
was very slow, for two reasons:

  * too many allocations when decoding strings
  * hitting disk for every read of every key and value, resulting in an
    excessive number of syscalls and a lot of disk I/O

The show API is now down to 33ms from 800ms+ for llama3 on a MacBook Pro
M3.

This commit also makes it possible to skip collecting large arrays of
values when decoding GGUFs. When such keys are encountered, their values
are left null and are encoded as null in JSON.

It also fixes a broken test that was not encoding valid GGUF.
2024-06-24 21:47:52 -07:00
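The llm/gguf.go hunk further down implements both ideas. As a rough, self-contained sketch of the technique (not the actual ollama code): buffer the reader so one syscall satisfies many small reads, and reuse a scratch buffer so string decoding does not allocate on every call.

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

type decoder struct {
	r       *bufio.Reader // buffered reads: one syscall can satisfy many small reads
	scratch [16 << 10]byte
}

// readString reads an 8-byte little-endian length followed by that many
// bytes, reusing the scratch buffer instead of allocating on every call.
func (d *decoder) readString() (string, error) {
	buf := d.scratch[:8]
	if _, err := io.ReadFull(d.r, buf); err != nil {
		return "", err
	}
	n := int(binary.LittleEndian.Uint64(buf))
	if n > len(d.scratch) {
		buf = make([]byte, n)
	} else {
		buf = d.scratch[:n]
	}
	if _, err := io.ReadFull(d.r, buf); err != nil {
		return "", err
	}
	return string(buf), nil
}

func main() {
	// Build one length-prefixed key in memory as stand-in input.
	var raw bytes.Buffer
	key := "general.architecture"
	binary.Write(&raw, binary.LittleEndian, uint64(len(key)))
	raw.WriteString(key)

	d := &decoder{r: bufio.NewReaderSize(&raw, 32<<10)}
	fmt.Println(d.readString()) // general.architecture <nil>
}
```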
Blake Mizerany
2aa91a937b cmd: defer stating model info until necessary (#5248)
This commit changes the 'ollama run' command to defer fetching model
information until it is actually needed, i.e. when running in interactive
mode.

It also removes a case where the model information was fetched twice:
once just before calling generateInteractive and then again, first thing,
inside generateInteractive.

This positively impacts the performance of the command:

    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.168 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.220 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.217 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 4% cpu 0.652 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 5% cpu 0.498 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 3% cpu 0.479 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
2024-06-24 20:14:03 -07:00
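A minimal sketch of that deferral pattern, using the public github.com/ollama/ollama/api client (the model name is illustrative); the point is simply that the /api/show round trip only happens on the interactive path:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	interactive := false // e.g. true when no prompt argument was given

	if !interactive {
		// One-shot `run model "prompt"`: no model metadata is needed,
		// so no /api/show request is issued at all.
		fmt.Println("one-shot run: skipping /api/show")
		return
	}

	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	info, err := client.Show(context.Background(), &api.ShowRequest{Name: "llama3"})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("families:", info.Details.Families)
}
```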
22 changed files with 826 additions and 208 deletions


@@ -71,8 +71,8 @@ Here are some example models that can be downloaded:
  | Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
  | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
  | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
- | Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
- | Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
+ | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+ | Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
  | Mistral | 7B | 4.1GB | `ollama run mistral` |
  | Moondream 2 | 1.4B | 829MB | `ollama run moondream` |


@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
} }
defer tempfile.Close() defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) { detectContentType := func(path string) (string, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...) files = append(files, tks...)
} }
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files { for _, file := range files {
f, err := os.Open(file) f, err := os.Open(file)
if err != nil { if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
} }
func RunHandler(cmd *cobra.Command, args []string) error { func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true interactive := true
opts := runOptions{ opts := runOptions{
Model: args[0], Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color", WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{}, Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
} }
format, err := cmd.Flags().GetString("format") format, err := cmd.Flags().GetString("format")
@@ -362,12 +336,39 @@ func RunHandler(cmd *cobra.Command, args []string) error {
} }
opts.WordWrap = !nowrap opts.WordWrap = !nowrap
if !interactive { // Fill out the rest of the options based on information about the
return generate(cmd, opts) // model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
} }
name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts) return generateInteractive(cmd, opts)
} }
return generate(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error { func errFromUnknownKey(unknownKeyErr error) error {
// find SSH public key in the error message // find SSH public key in the error message
@@ -623,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified") return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} }
if flagsSet == 1 {
req := api.ShowRequest{Name: args[0]} req := api.ShowRequest{Name: args[0]}
resp, err := client.Show(cmd.Context(), &req) resp, err := client.Show(cmd.Context(), &req)
if err != nil { if err != nil {
return err return err
} }
if flagsSet == 1 {
switch showType { switch showType {
case "license": case "license":
fmt.Println(resp.License) fmt.Println(resp.License)
@@ -646,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return nil return nil
} }
req := api.ShowRequest{Name: args[0]} showInfo(resp)
resp, err := client.Show(cmd.Context(), &req)
if err != nil { return nil
return err
} }
func showInfo(resp *api.ShowResponse) {
arch := resp.ModelInfo["general.architecture"].(string) arch := resp.ModelInfo["general.architecture"].(string)
modelData := [][]string{ modelData := [][]string{
@@ -671,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
projectorData := [][]string{ projectorData := [][]string{
{"arch", "clip"}, {"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
} }
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
}
projectorData = append(projectorData,
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
)
mainTableData = append(mainTableData, mainTableData = append(mainTableData,
[]string{"Projector"}, []string{"Projector"},
[]string{renderSubTable(projectorData, false)}, []string{renderSubTable(projectorData, false)},
@@ -704,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
} }
table.Render() table.Render()
return nil
} }
func renderSubTable(data [][]string, file bool) string { func renderSubTable(data [][]string, file bool) string {
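The closure introduced in the RunHandler hunk above boils down to "show, pull on 404, show again". A hedged sketch of that flow with the public api client (pulling directly via client.Pull here is an assumption for the sake of a self-contained example; the CLI itself goes through PullHandler):

```go
package main

import (
	"context"
	"errors"
	"net/http"

	"github.com/ollama/ollama/api"
)

// showOrPull asks the server for model info; if the model is not present
// locally (HTTP 404 surfaced as api.StatusError), it pulls the model and
// asks again.
func showOrPull(ctx context.Context, client *api.Client, name string) (*api.ShowResponse, error) {
	resp, err := client.Show(ctx, &api.ShowRequest{Name: name})
	var se api.StatusError
	if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
		req := &api.PullRequest{Name: name}
		if err := client.Pull(ctx, req, func(api.ProgressResponse) error { return nil }); err != nil {
			return nil, err
		}
		return client.Show(ctx, &api.ShowRequest{Name: name})
	}
	return resp, err
}
```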


@@ -31,41 +31,24 @@ const (
) )
func loadModel(cmd *cobra.Command, opts *runOptions) error { func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr) p := progress.NewProgress(os.Stderr)
defer p.StopAndClear() defer p.StopAndClear()
spinner := progress.NewSpinner("") spinner := progress.NewSpinner("")
p.Add("", spinner) p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model} client, err := api.ClientFromEnvironment()
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil { if err != nil {
return err return err
} }
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{ chatReq := &api.ChatRequest{
Model: opts.Model, Model: opts.Model,
Messages: []api.Message{}, KeepAlive: opts.KeepAlive,
} }
if opts.KeepAlive != nil { return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear() p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages { for _, msg := range opts.Messages {
switch msg.Role { switch msg.Role {
case "user": case "user":
@@ -77,19 +60,11 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
fmt.Println() fmt.Println()
} }
} }
}
return nil return nil
}) })
if err != nil {
return err
}
return nil
} }
func generateInteractive(cmd *cobra.Command, opts runOptions) error { func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts) err := loadModel(cmd, &opts)
if err != nil { if err != nil {
return err return err
@@ -429,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] { switch args[1] {
case "info": case "info":
fmt.Println("Model details:") showInfo(resp)
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
case "license": case "license":
if resp.License == "" { if resp.License == "" {
fmt.Println("No license was specified for this model.") fmt.Println("No license was specified for this model.")


@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
 #### Notes
-- `finish_reason` will always be `stop`
 - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
 ## Models


@@ -93,7 +93,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 //}
 if gfxOverride == "" {
 if !slices.Contains[[]string, string](supported, gfx) {
-slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
+//slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 // TODO - consider discrete markdown just for ROCM troubleshooting?
 slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 continue
@@ -109,10 +109,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 }
 // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-if totalMemory < IGPUMemLimit {
-slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
-continue
-}
+//if totalMemory < IGPUMemLimit {
+// slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
+// continue
+//}
 // TODO revisit this once ROCm v6 is available on windows.
 // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable


@@ -1650,26 +1650,41 @@ struct llama_server_context
} }
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
char buf[256];
llama_model_meta_val_str(model, "general.architecture", buf, 256);
bool gemma2 = strcmp(buf, "gemma2") == 0;
int32_t truncate_at = slot.n_ctx;
// truncate at 2/3 of the context length for gemma2 models
// as they do not support context shifts (from the sliding window implementation).
// this way, prompts that almost fit the context length can still generate a full
// response without a sudden stop from hitting the context limit
if (gemma2) {
truncate_at = 2 * slot.n_ctx / 3;
}
// if input prompt is too big, truncate it, if group attention self-extend is disabled // if input prompt is too big, truncate it, if group attention self-extend is disabled
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
{ {
const int n_left = slot.n_ctx - slot.params.n_keep; const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2; const int n_shift = n_left / 2;
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
std::vector<llama_token> new_tokens( std::vector<llama_token> new_tokens(
prompt_tokens.begin(), prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep); prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert( new_tokens.insert(
new_tokens.end(), new_tokens.end(),
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.begin() + slot.params.n_keep + n_erase,
prompt_tokens.end()); prompt_tokens.end());
LOG_VERBOSE("input truncated", { LOG_INFO("input truncated", {
{"n_ctx", slot.n_ctx}, {"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep}, {"n_keep", slot.params.n_keep},
{"n_left", n_left}, {"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, {"n_shift", n_shift},
{"n_erase", n_erase},
}); });
slot.truncated = true; slot.truncated = true;
prompt_tokens = new_tokens; prompt_tokens = new_tokens;
@@ -1678,6 +1693,19 @@ struct llama_server_context
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
} }
// Models with sliding window attention do not work with context shifts, so
// limit their prediction to the context length
if (gemma2) {
int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
slot.n_predict = limit;
slot.params.n_predict = limit;
LOG_INFO("model does not support sliding window, limiting generation", {
{"n_ctx", slot.n_ctx},
{"n_prompt_tokens", slot.n_prompt_tokens},
{"n_predict", slot.n_predict}
});
}
if (!slot.params.cache_prompt) if (!slot.params.cache_prompt)
{ {
llama_sampling_reset(slot.ctx_sampling); llama_sampling_reset(slot.ctx_sampling);
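For a sense of the numbers introduced by this hunk (the values below are illustrative, not taken from the diff): with an 8192-token context, a gemma2 prompt is truncated once it reaches 2*8192/3 ≈ 5461 tokens, and generation is capped at whatever context remains after the prompt, since no context shift is possible.

```go
package main

import "fmt"

func main() {
	nCtx := 8192
	truncateAt := 2 * nCtx / 3 // 5461: leave room for a full response

	nPromptTokens := 5000
	nPredict := nCtx - nPromptTokens // 3192: generation stops at the context limit

	fmt.Println(truncateAt, nPredict)
}
```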


@@ -12,7 +12,7 @@ function amdGPUs {
"gfx900" "gfx900"
"gfx902" "gfx902"
"gfx904" "gfx904"
"gfx90c" "gfx90c:xnack-"
"gfx906:xnack-" "gfx906:xnack-"
"gfx908:xnack-" "gfx908:xnack-"
"gfx90a:xnack+" "gfx90a:xnack+"
@@ -22,7 +22,7 @@ function amdGPUs {
"gfx942" "gfx942"
"gfx1010" "gfx1010"
"gfx1011" "gfx1011"
"gfx1012" "gfx1012:xnack-"
"gfx1030" "gfx1030"
"gfx1031" "gfx1031"
"gfx1032" "gfx1032"


@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors return llm.tensors
} }
func (llm *ggla) decode(rs io.ReadSeeker) error { func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32 var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for { for {
var dims uint32 var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err return err
} }
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32 var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil { if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err return err
} }
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil { if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err return err
} }


@@ -6,6 +6,8 @@ import (
"fmt" "fmt"
"io" "io"
"strings" "strings"
"github.com/ollama/ollama/util/bufioutil"
) )
type GGML struct { type GGML struct {
@@ -278,7 +280,18 @@ func DetectGGMLType(b []byte) string {
} }
} }
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { // DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32 var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err return nil, 0, err
@@ -291,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA: case FILE_MAGIC_GGLA:
c = &containerGGLA{} c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE: case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian} c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE: case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian} c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default: default:
return nil, 0, errors.New("invalid file magic") return nil, 0, errors.New("invalid file magic")
} }
model, err := c.Decode(rs) model, err := c.Decode(rs)
if errors.Is(err, io.EOF) { if err != nil {
// noop
} else if err != nil {
return nil, 0, err return nil, 0, err
} }
@@ -321,7 +332,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength() embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount() heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV() headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount() embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK() embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
@@ -355,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
) )
} }
case "gemma": case "gemma", "gemma2":
fullOffload = 4 * batch * (embedding + vocab) fullOffload = max(
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128 4*batch*(embedding+vocab),
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
)
partialOffload = max(
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
case "command-r": case "command-r":
fullOffload = max( fullOffload = max(
4*batch*(embedding+vocab), 4*batch*(embedding+vocab),
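A usage sketch for the new DecodeGGML signature, written as if inside the llm package (it needs only the "os" import beyond what is shown here) and following the doc comment above: 0 keeps the 1024-entry default, negative collects everything, which is what `ollama show --verbose` wants.

```go
// loadKV is a sketch, not code from this change: it decodes a GGUF file
// and returns its key/value metadata.
func loadKV(path string, verbose bool) (KV, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	maxArraySize := 0 // large arrays (e.g. the tokenizer vocab) keep only their size
	if verbose {
		maxArraySize = -1 // collect every value
	}
	ggml, _, err := DecodeGGML(f, maxArraySize)
	if err != nil {
		return nil, err
	}
	return ggml.KV(), nil
}
```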

llm/ggml_test.go Normal file

@@ -0,0 +1 @@
package llm


@@ -3,11 +3,10 @@ package llm
import ( import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"encoding/json"
"fmt" "fmt"
"io" "io"
"strings" "strings"
"log/slog"
) )
type containerGGUF struct { type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64 NumTensor uint64
NumKV uint64 NumKV uint64
} }
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
} }
func (c *containerGGUF) Name() string { func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
} }
model := newGGUF(c) model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil { if err := model.Decode(rs); err != nil {
return nil, err return nil, err
} }
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor tensors []*Tensor
parameters uint64 parameters uint64
scratch [16 << 10]byte
} }
func newGGUF(container *containerGGUF) *gguf { func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
} }
// decode tensors // decode tensors
for i := 0; uint64(i) < llm.numTensor(); i++ { for range llm.numTensor() {
name, err := readGGUFString(llm, rs) name, err := readGGUFString(llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor name: %w", err)
} }
// dims is the number of dimensions in the tensor // dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs) dims, err := readGGUF[uint32](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor dimensions: %w", err)
} }
shape := [4]uint64{1, 1, 1, 1} shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ { for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs) shape[i], err = readGGUF[uint64](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor shape: %w", err)
} }
} }
kind, err := readGGUF[uint32](llm, rs) kind, err := readGGUF[uint32](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor kind: %w", err)
} }
offset, err := readGGUF[uint64](llm, rs) offset, err := readGGUF[uint64](llm, rs)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to read tensor offset: %w", err)
} }
tensor := Tensor{ tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32 alignment = 32
} }
for _, tensor := range llm.tensors {
offset, err := rs.Seek(0, io.SeekCurrent) offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil { if err != nil {
return err return fmt.Errorf("failed to get current offset: %w", err)
} }
padding := llm.padding(offset, int64(alignment)) padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err return fmt.Errorf("failed to seek to init padding: %w", err)
} }
for _, tensor := range llm.tensors {
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return err return fmt.Errorf("failed to seek to tensor: %w", err)
}
padding := llm.padding(int64(tensor.Size()), int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
} }
} }
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil return b.String(), nil
} }
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) { func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 { if llm.Version == 1 {
return readGGUFV1String(llm, r) return readGGUFV1String(llm, r)
} }
var length uint64 buf := llm.scratch[:8]
if err := binary.Read(r, llm.ByteOrder, &length); err != nil { _, err := io.ReadFull(r, buf)
if err != nil {
return "", err return "", err
} }
var b bytes.Buffer length := int(llm.ByteOrder.Uint64(buf))
if _, err := io.CopyN(&b, r, int64(length)); err != nil { if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err return "", err
} }
return string(buf), nil
return b.String(), nil
} }
func writeGGUFString(llm *gguf, w io.Writer, s string) error { func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err return err
} }
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r) t, err := readGGUF[uint32](llm, r)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
for i := 0; uint32(i) < n; i++ { a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any var e any
switch t { switch t {
case ggufTypeUint8: case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
a = append(a, e) if a.values != nil {
a.values[i] = e
}
} }
return return a, nil
} }
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 { if llm.Version == 1 {
return readGGUFV1Array(llm, r) return readGGUFV1Array(llm, r)
} }
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
for i := 0; uint64(i) < n; i++ { a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any var e any
switch t { switch t {
case ggufTypeUint8: case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool: case ggufTypeBool:
e, err = readGGUF[bool](llm, r) e, err = readGGUF[bool](llm, r)
case ggufTypeString: case ggufTypeString:
if a.values != nil {
e, err = readGGUFString(llm, r) e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default: default:
return nil, fmt.Errorf("invalid array type: %d", t) return nil, fmt.Errorf("invalid array type: %d", t)
} }
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err return nil, err
} }
a = append(a, e) if a.values != nil {
a.values[i] = e
}
} }
return return a, nil
} }
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error { func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
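The array wrapper in this hunk is what makes skipped arrays show up as null in the show API: when values was never populated it stays nil, and encoding/json renders a nil slice as null. A standalone demonstration of that behavior (the key names and sizes are illustrative):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type array struct {
	size   int
	values []any
}

func (a *array) MarshalJSON() ([]byte, error) {
	return json.Marshal(a.values) // nil slice -> "null"
}

func main() {
	collected := &array{size: 3, values: []any{"a", "b", "c"}}
	skipped := &array{size: 128256} // e.g. a large tokenizer vocab, values not collected

	out, _ := json.Marshal(map[string]any{
		"small": collected,
		"large": skipped,
	})
	fmt.Println(string(out)) // {"large":null,"small":["a","b","c"]}
}
```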


@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close() defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian) gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5 inputLayerCount := 5
tensors := []Tensor{ tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
} }
assert.Len(t, tensors, inputLayerCount+1) assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{ err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
}, tensors) }, tensors)
require.NoError(t, err) require.NoError(t, err)
ggml, err := LoadModel(f.Name()) ggml, err := LoadModel(f.Name(), 0)
require.NoError(t, err) if err != nil {
t.Fatal(err)
}
// Simple CPU scenario // Simple CPU scenario
gpus := []gpu.GpuInfo{ gpus := []gpu.GpuInfo{

llm/patches/07-gemma.diff Normal file

@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2


@@ -60,7 +60,12 @@ type llmServer struct {
sem *semaphore.Weighted sem *semaphore.Weighted
} }
func LoadModel(model string) (*GGML, error) { // LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
} }
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
} }
defer f.Close() defer f.Close()
ggml, _, err := DecodeGGML(f) ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err return ggml, err
} }
@@ -412,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 {
} }
defer file.Close() defer file.Close()
ggml, _, err := DecodeGGML(file) ggml, _, err := DecodeGGML(file, 0)
if err != nil { if err != nil {
return 0 return 0
} }


@@ -423,7 +423,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err return err
} }
ggml, _, err := llm.DecodeGGML(temp) ggml, _, err := llm.DecodeGGML(temp, 0)
if err != nil { if err != nil {
return err return err
} }


@@ -11,6 +11,7 @@ import (
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert" "github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
} }
defer blob.Close() defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob) ggml, _, err := llm.DecodeGGML(blob, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil return layers, nil
} }
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
stat, err := file.Stat() stat, err := file.Stat()
if err != nil { if err != nil {
return nil, err return err
} }
r, err := zip.NewReader(file, stat.Size()) r, err := zip.NewReader(file, stat.Size())
if err != nil { if err != nil {
return nil, err return err
} }
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempdir)
fn(api.ProgressResponse{Status: "unpacking model metadata"}) fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File { for _, f := range r.File {
n := filepath.Join(p, f.Name)
if !strings.HasPrefix(n, p) {
slog.Warn("skipped extracting file outside of context", "name", f.Name)
continue
}
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
return err
}
// TODO(mxyng): this should not write out all files to disk // TODO(mxyng): this should not write out all files to disk
outfile, err := os.Create(filepath.Join(tempdir, f.Name)) outfile, err := os.Create(n)
if err != nil { if err != nil {
return nil, err return err
} }
defer outfile.Close() defer outfile.Close()
infile, err := f.Open() infile, err := f.Open()
if err != nil { if err != nil {
return nil, err return err
} }
defer infile.Close() defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil { if _, err = io.Copy(outfile, infile); err != nil {
return nil, err return err
} }
if err := outfile.Close(); err != nil { if err := outfile.Close(); err != nil {
return nil, err return err
} }
if err := infile.Close(); err != nil { if err := infile.Close(); err != nil {
return nil, err return err
} }
} }
mf, err := convert.GetModelFormat(tempdir) return nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempDir)
if err := extractFromZipFile(tempDir, file, fn); err != nil {
return nil, err
}
mf, err := convert.GetModelFormat(tempDir)
if err != nil { if err != nil {
return nil, err return nil, err
} }
params, err := mf.GetParams(tempdir) params, err := mf.GetParams(tempDir)
if err != nil { if err != nil {
return nil, err return nil, err
} }
mArch, err := mf.GetModelArch("", tempdir, params) mArch, err := mf.GetModelArch("", tempDir, params)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer // TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model") // e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
temp, err := os.CreateTemp(tempdir, "fp16") temp, err := os.CreateTemp(tempDir, "fp16")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
} }
defer bin.Close() defer bin.Close()
ggml, _, err := llm.DecodeGGML(bin) ggml, _, err := llm.DecodeGGML(bin, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64 var offset int64
for offset < stat.Size() { for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(file) ggml, n, err := llm.DecodeGGML(file, 0)
if errors.Is(err, io.EOF) { if errors.Is(err, io.EOF) {
break break
} else if err != nil { } else if err != nil {
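The prefix check added in extractFromZipFile above is the usual guard against "zip slip" entries whose names climb out of the destination directory. A minimal standalone sketch of the same idea (paths are illustrative); the test file below exercises exactly this with a deeply nested ".." entry name:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// safeJoin joins an archive entry name onto the destination directory and
// reports whether the result still lives under that directory.
func safeJoin(dst, name string) (string, bool) {
	n := filepath.Join(dst, name) // Join also cleans ".." segments
	if !strings.HasPrefix(n, dst) {
		return "", false // entry would land outside dst: skip it
	}
	return n, true
}

func main() {
	dst := filepath.Join(string(filepath.Separator), "tmp", "extract")
	for _, name := range []string{"good.bin", "../../etc/passwd"} {
		if p, ok := safeJoin(dst, name); ok {
			fmt.Println("extract to", p)
		} else {
			fmt.Println("skipping", name)
		}
	}
}
```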

server/model_test.go Normal file

@@ -0,0 +1,92 @@
package server
import (
"archive/zip"
"bytes"
"io"
"os"
"path/filepath"
"slices"
"testing"
"github.com/ollama/ollama/api"
)
func createZipFile(t *testing.T, name string) *os.File {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "")
if err != nil {
t.Fatal(err)
}
zf := zip.NewWriter(f)
defer zf.Close()
zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
if err != nil {
t.Fatal(err)
}
if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
t.Fatal(err)
}
return f
}
func TestExtractFromZipFile(t *testing.T) {
cases := []struct {
name string
expect []string
}{
{
name: "good",
expect: []string{"good"},
},
{
name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
f := createZipFile(t, tt.name)
defer f.Close()
tempDir := t.TempDir()
if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
t.Fatal(err)
}
var matches []string
if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
if err != nil {
return err
}
if !fi.IsDir() {
matches = append(matches, p)
}
return nil
}); err != nil {
t.Fatal(err)
}
var actual []string
for _, match := range matches {
rel, err := filepath.Rel(tempDir, match)
if err != nil {
t.Error(err)
}
actual = append(actual, rel)
}
if !slices.Equal(actual, tt.expect) {
t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
}
})
}
}


@@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
} }
func getKVData(digest string, verbose bool) (llm.KV, error) { func getKVData(digest string, verbose bool) (llm.KV, error) {
kvData, err := llm.LoadModel(digest) maxArraySize := 0
if verbose {
maxArraySize = -1
}
kvData, err := llm.LoadModel(digest, maxArraySize)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
schedCtx, schedDone := context.WithCancel(ctx) schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx) sched := InitScheduler(schedCtx)
s := &Server{addr: ln.Addr(), sched: sched} s := &Server{addr: ln.Addr(), sched: sched}
r := s.GenerateRoutes()
http.Handle("/", s.GenerateRoutes())
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version)) slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{ srvr := &http.Server{
Handler: r, // Use http.DefaultServeMux so we get net/http/pprof for
// free.
//
// TODO(bmizerany): Decide if we want to make this
// configurable so it is not exposed by default, or allow
// users to bind it to a different port. This was a quick
// and easy way to get pprof, but it may not be the best
// way.
Handler: nil,
} }
// listen for a ctrl+c and stop any loaded llm // listen for a ctrl+c and stop any loaded llm
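The nil Handler above works because net/http/pprof registers its handlers on http.DefaultServeMux at import time, and an http.Server with a nil Handler falls back to that mux. A minimal standalone sketch of the same pattern (the address is illustrative); as the TODO in the hunk notes, the trade-off is that the profiling endpoints are exposed on the same listener as the application routes:

```go
package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* on the default mux
)

func main() {
	// Application routes also go on the default mux.
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("ok"))
	})

	srv := &http.Server{
		Addr:    "127.0.0.1:11434",
		Handler: nil, // nil means http.DefaultServeMux, so pprof is served too
	}
	log.Fatal(srv.ListenAndServe())
}
```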


@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
// Load model for fitting // Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath) ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
if err != nil { if err != nil {
pending.errCh <- err pending.errCh <- err
break break


@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0}, "tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{ }, []llm.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}) })
require.NoError(t, err) require.NoError(t, err)
fname := f.Name() fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname} model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath) scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
require.NoError(t, err) require.NoError(t, err)
scenario.req = &LlmRequest{ scenario.req = &LlmRequest{


@@ -0,0 +1,34 @@
package bufioutil
import (
"bufio"
"io"
)
type BufferedSeeker struct {
rs io.ReadSeeker
br *bufio.Reader
}
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
return &BufferedSeeker{
rs: rs,
br: bufio.NewReaderSize(rs, size),
}
}
func (b *BufferedSeeker) Read(p []byte) (int, error) {
return b.br.Read(p)
}
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
if whence == io.SeekCurrent {
offset -= int64(b.br.Buffered())
}
n, err := b.rs.Seek(offset, whence)
if err != nil {
return 0, err
}
b.br.Reset(b.rs)
return n, nil
}
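Usage sketch: DecodeGGML (see the llm/ggml.go hunk above) wraps its reader this way so the many small metadata reads are buffered while relative seeks still land on the right byte of the underlying stream; the Seek method subtracts whatever is still sitting in the bufio buffer.

```go
package main

import (
	"fmt"
	"io"
	"strings"

	"github.com/ollama/ollama/util/bufioutil"
)

func main() {
	rs := bufioutil.NewBufferedSeeker(strings.NewReader("0123456789"), 32<<10)

	buf := make([]byte, 2)
	io.ReadFull(rs, buf) // returns "01", but the bufio.Reader slurped the rest

	// SeekCurrent compensates for the buffered bytes, so this skips exactly
	// 3 bytes of the logical stream rather than 3 bytes past the buffer.
	rs.Seek(3, io.SeekCurrent)
	io.ReadFull(rs, buf)
	fmt.Println(string(buf)) // "56"
}
```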


@@ -0,0 +1,64 @@
package bufioutil
import (
"bytes"
"io"
"strings"
"testing"
)
func TestBufferedSeeker(t *testing.T) {
const alphabet = "abcdefghijklmnopqrstuvwxyz"
bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
checkRead := func(buf []byte, expected string) {
t.Helper()
_, err := bs.Read(buf)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(buf, []byte(expected)) {
t.Fatalf("expected %s, got %s", expected, buf)
}
}
// Read the first 5 bytes
buf := make([]byte, 5)
checkRead(buf, "abcde")
// Seek back to the beginning
_, err := bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
// read 'a'
checkRead(buf[:1], "a")
if bs.br.Buffered() == 0 {
t.Fatalf("totally unexpected sanity check failed")
}
// Seek past 'b'
_, err = bs.Seek(1, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "cdefg")
// Seek back to the beginning
_, err = bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "abcde")
// Seek to the end
_, err = bs.Seek(-5, io.SeekEnd)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "vwxyz")
}