Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2024-08-28 12:02:21 +08:00
committed by GitHub
53 changed files with 1609 additions and 332 deletions

View File

@@ -87,7 +87,7 @@ DialogFontSize=12
[Files] [Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
@@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
[Run] [Run]
Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
[UninstallRun] [UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
[Registry] [Registry]
Root: HKCU; Subkey: "Environment"; \ Root: HKCU; Subkey: "Environment"; \
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
Check: NeedsAddPath('{app}\bin') Check: NeedsAddPath('{app}')
[Code] [Code]

View File

@@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
// safetensors files might be unresolved git lfs references; skip if they are // safetensors files might be unresolved git lfs references; skip if they are
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
files = append(files, st...) files = append(files, st...)
} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
// covers adapters.safetensors
files = append(files, st...)
} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
// covers adapter_model.safetensors
files = append(files, st...)
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 { } else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
// pytorch files might also be unresolved git lfs references; skip if they are // pytorch files might also be unresolved git lfs references; skip if they are
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -223,6 +229,14 @@ func tempZipFiles(path string) (string, error) {
} }
files = append(files, js...) files = append(files, js...)
// bert models require a nested config.json
// TODO(mxyng): merge this with the glob above
js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
if err != nil {
return "", err
}
files = append(files, js...)
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
// tokenizer.model might be a unresolved git lfs reference; error if it is // tokenizer.model might be a unresolved git lfs reference; error if it is
@@ -252,6 +266,11 @@ func tempZipFiles(path string) (string, error) {
return "", err return "", err
} }
zfi.Name, err = filepath.Rel(path, file)
if err != nil {
return "", err
}
zf, err := zipfile.CreateHeader(zfi) zf, err := zipfile.CreateHeader(zfi)
if err != nil { if err != nil {
return "", err return "", err

View File

@@ -7,16 +7,27 @@ import (
"io" "io"
"io/fs" "io/fs"
"log/slog" "log/slog"
"strings"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
type Parameters struct { type ModelParameters struct {
Architectures []string `json:"architectures"` Architectures []string `json:"architectures"`
VocabSize uint32 `json:"vocab_size"` VocabSize uint32 `json:"vocab_size"`
} }
func (Parameters) KV(t *Tokenizer) llm.KV { type AdapterParameters struct {
Alpha uint32 `json:"lora_alpha"`
LoraLayers uint32 `json:"lora_layers"`
LoraParameters struct {
Rank uint32 `json:"rank"`
Alpha float32 `json:"alpha"`
Scale float32 `json:"scale"`
} `json:"lora_parameters"`
}
func (ModelParameters) KV(t *Tokenizer) llm.KV {
kv := llm.KV{ kv := llm.KV{
"general.file_type": uint32(1), "general.file_type": uint32(1),
"general.quantization_version": uint32(2), "general.quantization_version": uint32(2),
@@ -43,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
func (Parameters) specialTokenTypes() []string { func (p AdapterParameters) KV() llm.KV {
var alpha float32
if p.LoraParameters.Alpha == 0 {
alpha = float32(p.Alpha)
} else {
alpha = p.LoraParameters.Alpha
}
kv := llm.KV{
"adapter.lora.alpha": alpha,
"adapter.type": "lora",
"general.file_type": uint32(1),
"general.type": "adapter",
"general.version": "v0.2",
}
return kv
}
func (ModelParameters) specialTokenTypes() []string {
return []string{ return []string{
"bos", "eos", "unk", "sep", "pad", "cls", "mask", "bos", "eos", "unk", "sep", "pad", "cls", "mask",
} }
} }
func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts) return llm.WriteGGUF(ws, kv, ts)
} }
type Converter interface { func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
}
type ModelConverter interface {
// KV maps parameters to LLM key-values // KV maps parameters to LLM key-values
KV(*Tokenizer) llm.KV KV(*Tokenizer) llm.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor Tensors([]Tensor) []llm.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
// tensorName returns the LLM tensor name for a specific input name
tensorName(string) string
// specialTokenTypes returns any special token types the model uses // specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
} }
type moreParser interface {
parseMore(fs.FS) error
}
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(llm.KV) llm.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
}
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
}
var p AdapterParameters
if err := json.Unmarshal(bts, &p); err != nil {
return err
}
arch, ok := baseKV["general.architecture"]
if !ok {
return errors.New("architecture not set for the base model")
}
var conv AdapterConverter
switch arch {
case "llama":
conv = &llamaAdapter{}
case "gemma2":
conv = &gemma2Adapter{}
default:
return errors.New("unsupported architecture")
}
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil {
return err
}
if err := json.Unmarshal(bts, conv); err != nil {
return err
}
return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path. // and files it finds in the input path.
// Supported input model formats include safetensors. // Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func Convert(fsys fs.FS, ws io.WriteSeeker) error { func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
bts, err := fs.ReadFile(fsys, "config.json") bts, err := fs.ReadFile(fsys, "config.json")
if err != nil { if err != nil {
return err return err
} }
var p Parameters var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil { if err := json.Unmarshal(bts, &p); err != nil {
return err return err
} }
@@ -85,16 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
return errors.New("unknown architecture") return errors.New("unknown architecture")
} }
var conv Converter var conv ModelConverter
switch p.Architectures[0] { switch p.Architectures[0] {
case "LlamaForCausalLM", "MistralForCausalLM": case "LlamaForCausalLM", "MistralForCausalLM":
conv = &llama{} conv = &llamaModel{}
case "MixtralForCausalLM": case "MixtralForCausalLM":
conv = &mixtral{} conv = &mixtralModel{}
case "GemmaForCausalLM": case "GemmaForCausalLM":
conv = &gemma{} conv = &gemmaModel{}
case "Gemma2ForCausalLM":
conv = &gemma2Model{}
case "Phi3ForCausalLM": case "Phi3ForCausalLM":
conv = &phi3{} conv = &phi3Model{}
case "BertModel":
conv = &bertModel{}
default: default:
return errors.New("unsupported architecture") return errors.New("unsupported architecture")
} }
@@ -103,6 +197,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
return err return err
} }
if t, ok := conv.(moreParser); ok {
if err := t.parseMore(fsys); err != nil {
return err
}
}
t, err := parseTokenizer(fsys, conv.specialTokenTypes()) t, err := parseTokenizer(fsys, conv.specialTokenTypes())
if err != nil { if err != nil {
return err return err
@@ -119,7 +219,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
} }
ts, err := parseTensors(fsys) ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil { if err != nil {
return err return err
} }

174
convert/convert_bert.go Normal file
View File

@@ -0,0 +1,174 @@
package convert
import (
"cmp"
"encoding/json"
"io/fs"
"path/filepath"
"slices"
"strings"
"github.com/ollama/ollama/llm"
)
type bertModel struct {
ModelParameters
NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
NCtx uint32 `json:"n_ctx"`
HiddenSize uint32 `json:"hidden_size"`
NEmbd uint32 `json:"n_embd"`
IntermediateSize uint32 `json:"intermediate_size"`
NInner uint32 `json:"n_inner"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NHead uint32 `json:"n_head"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
LayerNormEPS float32 `json:"layer_norm_eps"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_epsilon"`
PoolingType uint32
}
var (
_ ModelConverter = (*bertModel)(nil)
_ moreParser = (*bertModel)(nil)
)
func (p *bertModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "modules.json")
if err != nil {
return err
}
var modules []struct {
Type string `json:"type"`
Path string `json:"path"`
}
if err := json.Unmarshal(bts, &modules); err != nil {
return err
}
var pooling string
for _, m := range modules {
if m.Type == "sentence_transformers.models.Pooling" {
pooling = m.Path
break
}
}
if pooling != "" {
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
if err != nil {
return err
}
var pc struct {
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
}
if err := json.Unmarshal(bts, &pc); err != nil {
return err
}
if pc.PoolingModeMeanTokens {
p.PoolingType = 1
} else if pc.PoolingModeCLSToken {
p.PoolingType = 2
}
}
return nil
}
func (p *bertModel) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false
kv["bert.pooling_type"] = p.PoolingType
kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
kv["bert.context_length"] = contextLength
}
if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
}
if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
}
if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
}
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
}
kv["tokenizer.ggml.model"] = "bert"
kv["tokenizer.ggml.token_type_count"] = uint32(2)
// convert to phantom space tokens
for i, e := range t.Tokens {
if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
// noop
} else if strings.HasPrefix(e, "##") {
t.Tokens[i] = e[2:]
} else {
t.Tokens[i] = "\u2581" + e
}
}
kv["tokenizer.ggml.tokens"] = t.Tokens
return kv
}
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
if slices.Contains([]string{
"embeddings.position_ids",
"pooler.dense.weight",
"pooler.dense.bias",
}, t.Name()) {
continue
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (bertModel) Replacements() []string {
return []string{
"encoder.layer", "blk",
"encoder.layers", "blk",
"embeddings.word_embeddings", "token_embd",
"embeddings.token_type_embeddings", "token_types",
"embeddings.LayerNorm", "token_embd_norm",
"embeddings.position_embeddings", "position_embd",
"attention.self.query", "attn_q",
"attention.self.key", "attn_k",
"attention.self.value", "attn_v",
"attention.output.dense", "attn_output",
"attention.output.LayerNorm", "attn_output_norm",
"intermediate.dense", "ffn_up",
"output.dense", "ffn_down",
"output.LayerNorm", "layer_output_norm",
}
}

View File

@@ -9,8 +9,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
type gemma struct { type gemmaModel struct {
Parameters ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"` HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"` HiddenLayers uint32 `json:"num_hidden_layers"`
@@ -21,12 +21,11 @@ type gemma struct {
HeadDim uint32 `json:"head_dim"` HeadDim uint32 `json:"head_dim"`
} }
var _ Converter = (*gemma)(nil) var _ ModelConverter = (*gemmaModel)(nil)
func (p *gemma) KV(t *Tokenizer) llm.KV { func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
kv := p.Parameters.KV(t) kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma" kv["general.architecture"] = "gemma"
kv["general.name"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.context_length"] = p.MaxPositionEmbeddings
kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.embedding_length"] = p.HiddenSize
kv["gemma.block_count"] = p.HiddenLayers kv["gemma.block_count"] = p.HiddenLayers
@@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor var out []llm.Tensor
for _, t := range ts { for _, t := range ts {
name := p.tensorName(t.Name()) if strings.HasSuffix(t.Name(), "_norm.weight") {
if strings.HasSuffix(name, "_norm.weight") {
t.SetRepacker(p.addOne) t.SetRepacker(p.addOne)
} }
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: name, Name: t.Name(),
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
func (p *gemma) tensorName(n string) string { func (p *gemmaModel) Replacements() []string {
return strings.NewReplacer( return []string{
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
"model.layers", "blk", "model.layers", "blk",
@@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up", "mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
"block_sparse_moe.gate", "ffn_inp", }
).Replace(n)
} }
func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, int(shape[0])) ones := tensor.Ones(tensor.Float32, int(shape[0]))

43
convert/convert_gemma2.go Normal file
View File

@@ -0,0 +1,43 @@
package convert
import (
"github.com/ollama/ollama/llm"
)
type gemma2Model struct {
gemmaModel
SlidingWindow uint32 `json:"sliding_window"`
AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
kv["gemma2.embedding_length"] = p.HiddenSize
kv["gemma2.block_count"] = p.HiddenLayers
kv["gemma2.feed_forward_length"] = p.IntermediateSize
kv["gemma2.attention.head_count"] = p.NumAttentionHeads
kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
kv["gemma2.attention.key_length"] = p.HeadDim
kv["gemma2.attention.value_length"] = p.HeadDim
kv["gemma2.attention.sliding_window"] = p.SlidingWindow
kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
kv["tokenizer.ggml.eot_token_id"] = uint32(107)
kv["tokenizer.ggml.middle_token_id"] = uint32(68)
kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
return kv
}
func (p *gemma2Model) Replacements() []string {
return append(
p.gemmaModel.Replacements(),
"post_attention_layernorm", "post_attention_norm",
"pre_feedforward_layernorm", "ffn_norm",
"post_feedforward_layernorm", "post_ffw_norm",
)
}

View File

@@ -0,0 +1,91 @@
package convert
import (
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type gemma2Adapter struct {
AdapterParameters
}
var _ AdapterConverter = (*gemma2Adapter)(nil)
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv
}
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
shape[0], shape[1] = shape[1], shape[0]
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (p *gemma2Adapter) Replacements() []string {
return []string{
"base_model.model.", "",
"model.layers", "blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
"mlp.gate_proj", "ffn_gate",
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"lora_A.weight", "weight.lora_a",
"lora_B.weight", "weight.lora_b",
"lora_a", "weight.lora_a",
"lora_b", "weight.lora_b",
}
}
func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}

View File

@@ -3,6 +3,7 @@ package convert
import ( import (
"cmp" "cmp"
"fmt" "fmt"
"math"
"strings" "strings"
"github.com/pdevine/tensor" "github.com/pdevine/tensor"
@@ -11,8 +12,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
type llama struct { type llamaModel struct {
Parameters ModelParameters
NLayers uint32 `json:"n_layers"` NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"` NLayer uint32 `json:"n_layer"`
@@ -27,8 +28,14 @@ type llama struct {
NumKeyValueHeads uint32 `json:"num_key_value_heads"` NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"` RopeTheta float32 `json:"rope_theta"`
RopeScaling struct { RopeScaling struct {
Type string `json:"type"` Type string `json:"type"`
Factor float32 `json:"factor"` RopeType string `json:"rope_type"`
Factor float32 `json:"factor"`
LowFrequencyFactor float32 `json:"low_freq_factor"`
HighFrequencyFactor float32 `json:"high_freq_factor"`
OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"`
factors ropeFactor
} `json:"rope_scaling"` } `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"` RMSNormEPS float32 `json:"rms_norm_eps"`
LayerNormEPS float32 `json:"layer_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"`
@@ -37,12 +44,11 @@ type llama struct {
HeadDim uint32 `json:"head_dim"` HeadDim uint32 `json:"head_dim"`
} }
var _ Converter = (*llama)(nil) var _ ModelConverter = (*llamaModel)(nil)
func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llamaModel) KV(t *Tokenizer) llm.KV {
kv := p.Parameters.KV(t) kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama" kv["general.architecture"] = "llama"
kv["general.name"] = "llama"
kv["llama.vocab_size"] = p.VocabSize kv["llama.vocab_size"] = p.VocabSize
kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
if p.RopeScaling.Type == "linear" { if p.RopeScaling.Type == "linear" {
kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.type"] = p.RopeScaling.Type
kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
} else if p.RopeScaling.RopeType == "llama3" {
dim := p.HiddenSize / p.NumAttentionHeads
for i := uint32(0); i < dim; i += 2 {
factor := cmp.Or(p.RopeScaling.Factor, 8.0)
factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
lambdaLow := float32(original) / factorLow
lambdaHigh := float32(original) / factorHigh
lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
if lambda < float64(lambdaHigh) {
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
} else if lambda > float64(lambdaLow) {
p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
} else {
smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
}
}
} }
if p.NumKeyValueHeads > 0 { if p.NumKeyValueHeads > 0 {
@@ -93,17 +120,26 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
func (p *llama) Tensors(ts []Tensor) []llm.Tensor { func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor var out []llm.Tensor
if p.RopeScaling.factors != nil {
out = append(out, llm.Tensor{
Name: "rope_freqs.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
WriterTo: p.RopeScaling.factors,
})
}
for _, t := range ts { for _, t := range ts {
name := p.tensorName(t.Name()) if strings.HasSuffix(t.Name(), "attn_q.weight") ||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
strings.HasSuffix(name, "attn_k.weight") {
t.SetRepacker(p.repack) t.SetRepacker(p.repack)
} }
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: name, Name: t.Name(),
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
func (p *llama) tensorName(n string) string { func (p *llamaModel) Replacements() []string {
return strings.NewReplacer( return []string{
"lm_head", "output", "lm_head", "output",
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
@@ -128,21 +164,19 @@ func (p *llama) tensorName(n string) string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up", "mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
// mixtral }
"block_sparse_moe.gate", "ffn_gate_inp",
).Replace(n)
} }
func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
var dims []int var dims []int
for _, dim := range shape { for _, dim := range shape {
dims = append(dims, int(dim)) dims = append(dims, int(dim))
} }
var heads uint32 var heads uint32
if strings.HasSuffix(name, "q_proj.weight") { if strings.HasSuffix(name, "attn_q.weight") {
heads = p.NumAttentionHeads heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "k_proj.weight") { } else if strings.HasSuffix(name, "attn_k.weight") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else { } else {
return nil, fmt.Errorf("unknown tensor for repack: %s", name) return nil, fmt.Errorf("unknown tensor for repack: %s", name)

View File

@@ -0,0 +1,169 @@
package convert
import (
"cmp"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type llamaAdapter struct {
AdapterParameters
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
}
var _ AdapterConverter = (*llamaAdapter)(nil)
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
return kv
}
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
shape[0], shape[1] = shape[1], shape[0]
t.SetRepacker(p.repackAndTranspose)
} else {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: shape,
WriterTo: t,
})
}
return out
}
func (p *llamaAdapter) Replacements() []string {
return []string{
"base_model.model.", "",
"model.layers", "blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
"mlp.gate_proj", "ffn_gate",
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"lora_A.weight", "weight.lora_a",
"lora_B.weight", "weight.lora_b",
"lora_a", "weight.lora_a",
"lora_b", "weight.lora_b",
}
}
func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return data, nil
}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}
func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
}
if heads > 0 {
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
}
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}

View File

@@ -9,16 +9,14 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
type mixtral struct { type mixtralModel struct {
llama llamaModel
NumLocalExperts uint32 `json:"num_local_experts"` NumLocalExperts uint32 `json:"num_local_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"` NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
} }
var _ Converter = (*mixtral)(nil) func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
kv := p.llamaModel.KV(t)
func (p *mixtral) KV(t *Tokenizer) llm.KV {
kv := p.llama.KV(t)
if p.NumLocalExperts > 0 { if p.NumLocalExperts > 0 {
kv["llama.expert_count"] = p.NumLocalExperts kv["llama.expert_count"] = p.NumLocalExperts
@@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
oldnew := []string{ oldnew := []string{
"model.layers", "blk", "model.layers", "blk",
"w1", "ffn_gate_exps", "w1", "ffn_gate_exps",
@@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
}) })
} }
return append(out, p.llama.Tensors(ts)...) return append(out, p.llamaModel.Tensors(ts)...)
}
func (p *mixtralModel) Replacements() []string {
return append(
p.llamaModel.Replacements(),
"block_sparse_moe.gate", "ffn_gate_inp",
)
} }
type experts []Tensor type experts []Tensor

View File

@@ -11,8 +11,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
type phi3 struct { type phi3Model struct {
Parameters ModelParameters
NumHiddenLayers uint32 `json:"num_hidden_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayers uint32 `json:"n_layers"` NLayers uint32 `json:"n_layers"`
HiddenSize uint32 `json:"hidden_size"` HiddenSize uint32 `json:"hidden_size"`
@@ -35,12 +35,11 @@ type phi3 struct {
SlidingWindow uint32 `json:"sliding_window"` SlidingWindow uint32 `json:"sliding_window"`
} }
var _ Converter = (*phi3)(nil) var _ ModelConverter = (*phi3Model)(nil)
func (p *phi3) KV(t *Tokenizer) llm.KV { func (p *phi3Model) KV(t *Tokenizer) llm.KV {
kv := p.Parameters.KV(t) kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "phi3" kv["general.architecture"] = "phi3"
kv["general.name"] = "phi3"
kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.context_length"] = p.MaxPositionEmbeddings
kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
kv["phi3.feed_forward_length"] = p.IntermediateSize kv["phi3.feed_forward_length"] = p.IntermediateSize
@@ -69,13 +68,12 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
var addRopeFactors sync.Once var addRopeFactors sync.Once
out := make([]llm.Tensor, 0, len(ts)+2) out := make([]llm.Tensor, 0, len(ts)+2)
for _, t := range ts { for _, t := range ts {
name := p.tensorName(t.Name()) if strings.HasPrefix(t.Name(), "blk.0.") {
if strings.HasPrefix(name, "blk.0.") {
addRopeFactors.Do(func() { addRopeFactors.Do(func() {
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: "rope_factors_long.weight", Name: "rope_factors_long.weight",
@@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
} }
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: name, Name: t.Name(),
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
func (p *phi3) tensorName(n string) string { func (p *phi3Model) Replacements() []string {
return strings.NewReplacer( return []string{
"lm_head", "output", "lm_head", "output",
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
@@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.gate_up_proj", "ffn_up", "mlp.gate_up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
).Replace(n) }
} }
type ropeFactor []float32 type ropeFactor []float32

View File

@@ -1,7 +1,9 @@
package convert package convert
import ( import (
"bytes"
"crypto/sha256" "crypto/sha256"
"encoding/binary"
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"flag" "flag"
@@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
} }
defer f.Close() defer f.Close()
if err := Convert(fsys, f); err != nil { if err := ConvertModel(fsys, f); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -51,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
return r, m.KV(), m.Tensors() return r, m.KV(), m.Tensors()
} }
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
actual := make(map[string]string)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
bts, err := json.Marshal(s)
if err != nil {
t.Fatal(err)
}
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
}
}
for _, tensor := range tensors.Items {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
t.Fatal(err)
}
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
}
return actual
}
func TestMain(m *testing.M) { func TestMain(m *testing.M) {
var level slog.Level var level slog.Level
flag.TextVar(&level, "level", slog.LevelInfo, "log level") flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@@ -62,11 +92,14 @@ func TestMain(m *testing.M) {
func TestConvertFull(t *testing.T) { func TestConvertFull(t *testing.T) {
cases := []string{ cases := []string{
"Meta-Llama-3-8B-Instruct", "Meta-Llama-3-8B-Instruct",
"Meta-Llama-3.1-8B-Instruct",
"Mistral-7B-Instruct-v0.2", "Mistral-7B-Instruct-v0.2",
"Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x7B-Instruct-v0.1",
"gemma-2b-it", "gemma-2b-it",
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
"Phi-3-mini-128k-instruct", "Phi-3-mini-128k-instruct",
"all-MiniLM-L6-v2",
"gemma-2-9b-it",
} }
for i := range cases { for i := range cases {
@@ -82,29 +115,7 @@ func TestConvertFull(t *testing.T) {
} }
f, kv, tensors := convertFull(t, os.DirFS(p)) f, kv, tensors := convertFull(t, os.DirFS(p))
actual := make(map[string]string) actual := generateResultsJSON(t, f, kv, tensors)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
bts, err := json.Marshal(s)
if err != nil {
t.Fatal(err)
}
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
}
}
for _, tensor := range tensors.Items {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
t.Fatal(err)
}
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
}
expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
if err != nil { if err != nil {
@@ -128,3 +139,310 @@ func TestConvertFull(t *testing.T) {
}) })
} }
} }
func TestConvertInvalidDatatype(t *testing.T) {
f, err := os.CreateTemp(t.TempDir(), "testmodel")
if err != nil {
t.Fatal(err)
}
defer f.Close()
tempDir := t.TempDir()
generateSafetensorTestData(t, tempDir)
err = ConvertModel(os.DirFS(tempDir), f)
if err == nil || err.Error() != "unsupported safetensors model" {
t.Errorf("expected error but didn't get one")
}
}
func generateSafetensorTestData(t *testing.T, tempDir string) {
type tensorData struct {
Offsets []int `json:"data_offsets"`
Type string `json:"dtype"`
Shape []int `json:"shape"`
}
offset := 4096 * 14336
td := map[string]*tensorData{}
td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
Offsets: []int{0, offset},
Type: "I8",
Shape: []int{4096, 14336},
}
td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
Offsets: []int{offset, offset},
Type: "U8",
Shape: []int{},
}
data, err := json.Marshal(td)
if err != nil {
t.Fatal(err)
}
var buf bytes.Buffer
l := int64(len(data))
err = binary.Write(&buf, binary.LittleEndian, l)
if err != nil {
t.Fatal(err)
}
_, err = buf.Write(data)
if err != nil {
t.Fatal(err)
}
fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
if err != nil {
t.Fatal(err)
}
defer fdata.Close()
_, err = fdata.Write(buf.Bytes())
if err != nil {
t.Fatal(err)
}
configData := `
{
"architectures": [
"LlamaForCausalLM"
]
}
`
f, err := os.Create(filepath.Join(tempDir, "config.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
_, err = f.WriteString(configData)
if err != nil {
t.Fatal(err)
}
tokenizerData := `
{
}
`
f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
_, err = f.WriteString(tokenizerData)
if err != nil {
t.Fatal(err)
}
}
func TestConvertAdapter(t *testing.T) {
type AdapterCase struct {
Name string
BaseKV map[string]any
Expected map[string]string
}
cases := []AdapterCase{
{
Name: "discollama",
BaseKV: map[string]any{
"general.architecture": "llama",
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
},
Expected: map[string]string{
"general.architecture": "llama",
"general.file_type": "1",
"general.parameter_count": "106496",
"general.type": "adapter",
"general.version": "v0.2",
"adapter.lora.alpha": "16",
"adapter.type": "lora",
"llama.attention.head_count": "32",
"llama.attention.head_count_kv": "8",
"blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
},
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
t.Parallel()
f, err := os.CreateTemp(t.TempDir(), "f16")
if err != nil {
t.Fatal(err)
}
defer f.Close()
tempDir := t.TempDir()
generateLoraTestData(t, tempDir)
if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
t.Fatal(err)
}
r, err := os.Open(f.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
m, _, err := llm.DecodeGGML(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}
if _, err := r.Seek(0, io.SeekStart); err != nil {
t.Fatal(err)
}
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
keys := maps.Keys(c.Expected)
slices.Sort(keys)
for _, k := range keys {
if v, ok := actual[k]; !ok {
t.Errorf("missing %s", k)
} else if v != c.Expected[k] {
t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
}
}
})
}
}
func generateLoraTestData(t *testing.T, tempDir string) {
type tensorData struct {
Offsets []int `json:"data_offsets"`
Type string `json:"dtype"`
Shape []int `json:"shape"`
}
offset := 4096 * 8 * 4
td := map[string]*tensorData{"__metadata__": nil}
td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
Offsets: []int{0, offset},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
Offsets: []int{offset, offset * 2},
Type: "F32",
Shape: []int{8, 4096},
}
td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
Offsets: []int{offset * 2, offset * 3},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
Offsets: []int{offset * 3, offset*3 + 8*1024*4},
Type: "F32",
Shape: []int{8, 1024},
}
data, err := json.Marshal(td)
if err != nil {
t.Fatal(err)
}
var buf bytes.Buffer
l := int64(len(data))
err = binary.Write(&buf, binary.LittleEndian, l)
if err != nil {
t.Fatal(err)
}
_, err = buf.Write(data)
if err != nil {
t.Fatal(err)
}
// write some data for the tensors
ones := make([]float32, 4096*8)
for i := range ones {
ones[i] = float32(1)
}
for range 3 {
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
}
ones = make([]float32, 1024*8)
for i := range ones {
ones[i] = float32(1)
}
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
if err != nil {
t.Fatal(err)
}
defer fdata.Close()
_, err = fdata.Write(buf.Bytes())
if err != nil {
t.Fatal(err)
}
configData := `
{
"adapter_path": "adapters-test",
"batch_size": 8,
"config": "config-tiny.json",
"data": "../discollama-completion",
"grad_checkpoint": null,
"iters": 1000,
"learning_rate": 1e-05,
"lora_layers": 1,
"lora_parameters": {
"rank": 8,
"alpha": 16,
"dropout": 0.0,
"scale": 2.0
},
"lr_schedule": null,
"max_seq_length": 2048,
"model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
"resume_adapter_file": null,
"save_every": 100,
"seed": 0,
"steps_per_eval": 200,
"steps_per_report": 10,
"test": false,
"test_batches": 500,
"train": true,
"use_dora": false,
"val_batches": 25
}
`
f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
_, err = f.WriteString(configData)
if err != nil {
t.Fatal(err)
}
}

View File

@@ -35,7 +35,9 @@ const (
) )
func (t tensorBase) Kind() uint32 { func (t tensorBase) Kind() uint32 {
if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
t.name == "token_types.weight" {
// these tensors are always F32
return 0 return 0
} }
@@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {
type repacker func(string, []float32, []uint64) ([]float32, error) type repacker func(string, []float32, []uint64) ([]float32, error)
func parseTensors(fsys fs.FS) ([]Tensor, error) { func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
patterns := []struct { patterns := []struct {
Pattern string Pattern string
Func func(fs.FS, ...string) ([]Tensor, error) Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
}{ }{
{"model-*-of-*.safetensors", parseSafetensors}, {"model-*-of-*.safetensors", parseSafetensors},
{"model.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors},
{"adapters.safetensors", parseSafetensors},
{"adapter_model.safetensors", parseSafetensors},
{"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model-*-of-*.bin", parseTorch},
{"pytorch_model.bin", parseTorch}, {"pytorch_model.bin", parseTorch},
{"consolidated.*.pth", parseTorch}, {"consolidated.*.pth", parseTorch},
@@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
} }
if len(matches) > 0 { if len(matches) > 0 {
return pattern.Func(fsys, matches...) return pattern.Func(fsys, replacer, matches...)
} }
} }

View File

@@ -4,10 +4,12 @@ import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"io" "io"
"io/fs" "io/fs"
"slices" "slices"
"strings"
"github.com/d4l3k/go-bfloat16" "github.com/d4l3k/go-bfloat16"
"github.com/x448/float16" "github.com/x448/float16"
@@ -20,7 +22,7 @@ type safetensorMetadata struct {
Offsets []int64 `json:"data_offsets"` Offsets []int64 `json:"data_offsets"`
} }
func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
var ts []Tensor var ts []Tensor
for _, p := range ps { for _, p := range ps {
f, err := fsys.Open(p) f, err := fsys.Open(p)
@@ -49,6 +51,10 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
for _, key := range keys { for _, key := range keys {
if value := headers[key]; value.Type != "" { if value := headers[key]; value.Type != "" {
// bitsandbytes quantized models are unsupported
if len(value.Shape) == 0 {
return nil, errors.New("unsupported safetensors model")
}
ts = append(ts, safetensor{ ts = append(ts, safetensor{
fs: fsys, fs: fsys,
path: p, path: p,
@@ -56,7 +62,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
offset: safetensorsPad(n, value.Offsets[0]), offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
tensorBase: &tensorBase{ tensorBase: &tensorBase{
name: key, name: replacer.Replace(key),
shape: value.Shape, shape: value.Shape,
}, },
}) })

View File

@@ -3,12 +3,13 @@ package convert
import ( import (
"io" "io"
"io/fs" "io/fs"
"strings"
"github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/pytorch"
"github.com/nlpodyssey/gopickle/types" "github.com/nlpodyssey/gopickle/types"
) )
func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
var ts []Tensor var ts []Tensor
for _, p := range ps { for _, p := range ps {
pt, err := pytorch.Load(p) pt, err := pytorch.Load(p)
@@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
ts = append(ts, torch{ ts = append(ts, torch{
storage: t.(*pytorch.Tensor).Source, storage: t.(*pytorch.Tensor).Source,
tensorBase: &tensorBase{ tensorBase: &tensorBase{
name: k.(string), name: replacer.Replace(k.(string)),
shape: shape, shape: shape,
}, },
}) })

View File

@@ -0,0 +1,3 @@
{
"rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
}

124
convert/testdata/all-MiniLM-L6-v2.json vendored Normal file
View File

@@ -0,0 +1,124 @@
{
"general.architecture": "bert",
"general.file_type": "1",
"general.quantization_version": "2",
"bert.attention.causal": "false",
"bert.attention.head_count": "12",
"bert.attention.layer_norm_epsilon": "1e-12",
"bert.block_count": "6",
"bert.context_length": "512",
"bert.embedding_length": "384",
"bert.feed_forward_length": "1536",
"bert.pooling_type": "1",
"tokenizer.ggml.model": "bert",
"tokenizer.ggml.padding_token_id": "0",
"tokenizer.ggml.unknown_token_id": "100",
"tokenizer.ggml.cls_token_id": "101",
"tokenizer.ggml.seperator_token_id": "102",
"tokenizer.ggml.mask_token_id": "103",
"tokenizer.ggml.token_type_count": "2",
"tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
"tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
"tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
"token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
"position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
"token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
"token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
"token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
"blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
"blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
"blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
"blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
"blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
"blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
"blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
"blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
"blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
"blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
"blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
"blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
"blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
"blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
"blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
"blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
"blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
"blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
"blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
"blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
"blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
"blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
"blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
"blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
"blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
"blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
"blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
"blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
"blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
"blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
"blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
"blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
"blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
"blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
"blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
"blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
"blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
"blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
"blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
"blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
"blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
"blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
"blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
"blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
"blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
"blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
"blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
"blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
"blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
"blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
"blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
"blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
"blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
"blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
"blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
"blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
"blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
"blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
}

6
convert/testdata/gemma-2-9b-it.json vendored Normal file
View File

@@ -0,0 +1,6 @@
{
"general.architecture": "gemma2",
"gemma2.attention.sliding_window": "4096",
"gemma2.attn_logit_softcapping": "50",
"gemma2.final_logit_softcapping": "30"
}

View File

@@ -1,7 +1,6 @@
package convert package convert
import ( import (
"cmp"
"crypto/sha256" "crypto/sha256"
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
@@ -11,6 +10,8 @@ import (
"log/slog" "log/slog"
"os" "os"
"slices" "slices"
"golang.org/x/exp/maps"
) )
const ( const (
@@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
return nil, err return nil, err
} }
var tokens []token tokens := make(map[int]token, len(t.Model.Vocab))
for k, v := range t.Model.Vocab { for k, v := range t.Model.Vocab {
tokens = append(tokens, token{ tokens[v] = token{
ID: v, ID: v,
Content: k, Content: k,
}) }
} }
for _, t := range t.AddedTokens { for _, token := range t.AddedTokens {
t.UserDefined = true token.UserDefined = true
tokens = append(tokens, t) tokens[token.ID] = token
} }
slices.SortFunc(tokens, func(i, j token) int { keys := maps.Keys(tokens)
return cmp.Compare(i.ID, j.ID) slices.Sort(keys)
})
v := Vocabulary{Model: "gpt2"} v := Vocabulary{Model: "gpt2"}
for _, t := range tokens { for _, k := range keys {
v.Tokens = append(v.Tokens, t.Content) token := tokens[k]
v.Scores = append(v.Scores, float32(t.ID)) v.Tokens = append(v.Tokens, token.Content)
v.Scores = append(v.Scores, float32(token.ID))
switch { switch {
case t.Special: case token.Special:
v.Types = append(v.Types, tokenTypeControl) v.Types = append(v.Types, tokenTypeControl)
case t.UserDefined: case token.UserDefined:
v.Types = append(v.Types, tokenTypeUserDefined) v.Types = append(v.Types, tokenTypeUserDefined)
default: default:
v.Types = append(v.Types, tokenTypeNormal) v.Types = append(v.Types, tokenTypeNormal)

View File

@@ -15,6 +15,11 @@ import (
) )
func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
ast, err := parseAdditionalSpecialTokens(fsys)
if err != nil {
return nil, err
}
bts, err := fs.ReadFile(fsys, "tokenizer.model") bts, err := fs.ReadFile(fsys, "tokenizer.model")
if err != nil { if err != nil {
return nil, err return nil, err
@@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
sentencepiece.ModelProto_SentencePiece_BYTE: sentencepiece.ModelProto_SentencePiece_BYTE:
v.Types = append(v.Types, int32(t)) v.Types = append(v.Types, int32(t))
default: default:
v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
if slices.Contains(ast, piece.GetPiece()) {
tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
}
v.Types = append(v.Types, tt)
} }
} }
@@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
return &v, nil return &v, nil
} }
func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
f, err := fsys.Open("special_tokens_map.json")
if errors.Is(err, os.ErrNotExist) {
return nil, nil
} else if err != nil {
return nil, err
}
defer f.Close()
var m struct {
AdditionalSpecialTokens []string `json:"additional_special_tokens"`
}
if err := json.NewDecoder(f).Decode(&m); err != nil {
return nil, err
}
return m.AdditionalSpecialTokens, nil
}

View File

@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.
## How do I use Ollama behind a proxy? ## How do I use Ollama behind a proxy?
Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
> [!NOTE]
> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
### How do I use Ollama behind a proxy in Docker? ### How do I use Ollama behind a proxy in Docker?
@@ -276,4 +279,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
## How does Ollama load models on multiple GPUs? ## How does Ollama load models on multiple GPUs?
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.

BIN
docs/images/ollama-keys.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 150 KiB

BIN
docs/images/signup.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

View File

@@ -1,44 +1,129 @@
# Import # Importing a model
GGUF models and select Safetensors models can be imported directly into Ollama. ## Table of Contents
## Import GGUF * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
* [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
* [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
* [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)
A binary GGUF file can be imported directly into Ollama through a Modelfile. ## Importing a fine tuned adapter from Safetensors weights
First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:
```dockerfile ```dockerfile
FROM /path/to/file.gguf FROM <base model name>
ADAPTER /path/to/safetensors/adapter/directory
``` ```
## Import Safetensors Make sure that you use the same base model in the `FROM` command as you used to create the adapter otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.
If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile: Now run `ollama create` from the directory where the `Modelfile` was created:
- LlamaForCausalLM ```bash
- MistralForCausalLM ollama create my-model
- MixtralForCausalLM ```
- GemmaForCausalLM
- Phi3ForCausalLM Lastly, test the model:
```bash
ollama run my-model
```
Ollama supports importing adapters based on several different model architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Mistral (including Mistral 1, Mistral 2, and Mixtral); and
* Gemma (including Gemma 1 and Gemma 2)
You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
* Hugging Face [fine tuning framework] (https://huggingface.co/docs/transformers/en/training)
* [Unsloth](https://github.com/unslothai/unsloth)
* [MLX](https://github.com/ml-explore/mlx)
## Importing a model from Safetensors weights
First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:
```dockerfile ```dockerfile
FROM /path/to/safetensors/directory FROM /path/to/safetensors/directory
``` ```
For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf). If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
## Automatic Quantization Now run the `ollama create` command from the directory where you created the `Modelfile`:
> [!NOTE] ```shell
> Automatic quantization requires v0.1.35 or higher. ollama create my-model
```
Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`. Lastly, test the model:
```shell
ollama run my-model
```
Ollama supports importing models for several different architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Mistral (including Mistral 1, Mistral 2, and Mixtral);
* Gemma (including Gemma 1 and Gemma 2); and
* Phi3
This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
## Importing a GGUF based model or adapter
If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
* converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp;
* converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
* downloading a model or adapter from a place such as HuggingFace
To import a GGUF model, create a `Modelfile` containg:
```dockerfile
FROM /path/to/file.gguf
```
For a GGUF adapter, create the `Modelfile` with:
```dockerfile
FROM <model name>
ADAPTER /path/to/file.gguf
```
When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
* a model from Ollama
* a GGUF file
* a Safetensors based model
Once you have created your `Modelfile`, use the `ollama create` command to build the model.
```shell
ollama create my-model
```
## Quantizing a Model
Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.
```dockerfile ```dockerfile
FROM /path/to/my/gemma/f16/model FROM /path/to/my/gemma/f16/model
``` ```
Use `ollama create` to then create the quantized model.
```shell ```shell
$ ollama create -q Q4_K_M mymodel $ ollama create --quantize q4_K_M mymodel
transferring model data transferring model data
quantizing F16 model to Q4_K_M quantizing F16 model to Q4_K_M
creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@@ -49,42 +134,53 @@ success
### Supported Quantizations ### Supported Quantizations
- `Q4_0` - `q4_0`
- `Q4_1` - `q4_1`
- `Q5_0` - `q5_0`
- `Q5_1` - `q5_1`
- `Q8_0` - `q8_0`
#### K-means Quantizations #### K-means Quantizations
- `Q3_K_S` - `q3_K_S`
- `Q3_K_M` - `q3_K_M`
- `Q3_K_L` - `q3_K_L`
- `Q4_K_S` - `q4_K_S`
- `Q4_K_M` - `q4_K_M`
- `Q5_K_S` - `q5_K_S`
- `Q5_K_M` - `q5_K_M`
- `Q6_K` - `q6_K`
## Template Detection
> [!NOTE] ## Sharing your model on ollama.com
> Template detection requires v0.1.42 or higher.
Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing. You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.
```dockerfile First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
FROM /path/to/my/gemma/model
``` <img src="images/signup.png" alt="Sign-Up" width="40%">
The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
Follow the directions on the page to determine where your Ollama Public Key is located.
<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).
```shell ```shell
$ ollama create mymodel ollama cp mymodel myuser/mymodel
transferring model data ollama push myuser/mymodel
using autodetected template gemma-instruct ```
creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb Once your model has been pushed, other users can pull and run it by using the command:
writing manifest
success ```shell
ollama run myuser/mymodel
``` ```
Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.

View File

@@ -28,6 +28,11 @@ Download and extract the Linux package:
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
``` ```
If you have an AMD GPU, also download and extract the ROCm package into the same location
```bash
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz | sudo tar zx -C /usr
```
### Adding Ollama as a startup service (recommended) ### Adding Ollama as a startup service (recommended)
Create a user for Ollama: Create a user for Ollama:

View File

@@ -11,8 +11,9 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples) - [Examples](#examples)
- [Instructions](#instructions) - [Instructions](#instructions)
- [FROM (Required)](#from-required) - [FROM (Required)](#from-required)
- [Build from llama3](#build-from-llama3) - [Build from llama3.1](#build-from-llama31)
- [Build from a bin file](#build-from-a-bin-file) - [Build from a Safetensors model](#build-from-a-safetensors-model)
- [Build from a GGUF file](#build-from-a-gguf-file)
- [PARAMETER](#parameter) - [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values) - [Valid Parameters and Values](#valid-parameters-and-values)
- [TEMPLATE](#template) - [TEMPLATE](#template)
@@ -99,22 +100,39 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag> FROM <model name>:<tag>
``` ```
#### Build from llama3 #### Build from llama3.1
```modelfile ```modelfile
FROM llama3 FROM llama3.1
``` ```
A list of available base models: A list of available base models:
<https://github.com/ollama/ollama#model-library> <https://github.com/ollama/ollama#model-library>
Additional models can be found at:
<https://ollama.com/library>
#### Build from a `bin` file #### Build from a Safetensors model
```modelfile
FROM <model directory>
```
The model directory should contain the Safetensors weights for a supported architecture.
Currently supported model architectures:
* Llama (including Llama 2, Llama 3, and Llama 3.1)
* Mistral (including Mistral 1, Mistral 2, and Mixtral)
* Gemma (including Gemma 1 and Gemma 2)
* Phi3
#### Build from a GGUF file
```modelfile ```modelfile
FROM ./ollama-model.bin FROM ./ollama-model.bin
``` ```
This bin file location should be specified as an absolute path or relative to the `Modelfile` location. The GGUF bin file location should be specified as an absolute path or relative to the `Modelfile` location.
### PARAMETER ### PARAMETER
@@ -174,7 +192,20 @@ SYSTEM """<system message>"""
### ADAPTER ### ADAPTER
The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined. The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from the behaviour will be erratic.
#### Safetensor adapter
```modelfile
ADAPTER <path to safetensor adapter>
```
Currently supported Safetensor adapters:
* Llama (including Llama 2, Llama 3, and Llama 3.1)
* Mistral (including Mistral 1, Mistral 2, and Mixtral)
* Gemma (including Gemma 1 and Gemma 2)
#### GGUF adapter
```modelfile ```modelfile
ADAPTER ./ollama-lora.bin ADAPTER ./ollama-lora.bin

View File

@@ -190,7 +190,7 @@ func RunnersDir() (p string) {
} }
var paths []string var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
paths = append(paths, paths = append(paths,
root, root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
@@ -282,3 +282,12 @@ func Values() map[string]string {
func Var(key string) string { func Var(key string) string {
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'") return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
} }
// On windows, we keep the binary at the top directory, but
// other platforms use a "bin" directory, so this returns ".."
func LibRelativeToExe() string {
if runtime.GOOS == "windows" {
return "."
}
return ".."
}

View File

@@ -9,6 +9,8 @@ import (
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings" "strings"
"github.com/ollama/ollama/envconfig"
) )
// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -54,7 +56,7 @@ func commonAMDValidateLibDir() (string, error) {
// Installer payload location if we're running the installed binary // Installer payload location if we're running the installed binary
exe, err := os.Executable() exe, err := os.Executable()
if err == nil { if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
if rocmLibUsable(rocmTargetDir) { if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil return rocmTargetDir, nil

View File

@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
// Installer payload (if we're running from some other location) // Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA") localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama") appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
if rocmLibUsable(rocmTargetDir) { if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil return rocmTargetDir, nil

View File

@@ -264,6 +264,8 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMajor = int(memInfo.major)
gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.computeMinor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
variant := cudaVariant(gpuInfo) variant := cudaVariant(gpuInfo)
if depPath != "" { if depPath != "" {
gpuInfo.DependencyPath = depPath gpuInfo.DependencyPath = depPath
@@ -275,8 +277,6 @@ func GetGPUInfo() GpuInfoList {
} }
} }
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
gpuInfo.Variant = variant gpuInfo.Variant = variant
// query the management library as well so we can record any skew between the two // query the management library as well so we can record any skew between the two
@@ -653,7 +653,7 @@ func LibraryDir() string {
slog.Warn("failed to lookup working directory", "error", err) slog.Warn("failed to lookup working directory", "error", err)
} }
// Scan for any of our dependeices, and pick first match // Scan for any of our dependeices, and pick first match
for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
libDep := filepath.Join("lib", "ollama") libDep := filepath.Join("lib", "ollama")
if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
return filepath.Join(root, libDep) return filepath.Join(root, libDep)

View File

@@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
} }
} }
func TestByLibrary(t *testing.T) {
type testCase struct {
input []GpuInfo
expect int
}
testCases := map[string]*testCase{
"empty": {input: []GpuInfo{}, expect: 0},
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
}
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
resp := (GpuInfoList)(v.input).ByLibrary()
if len(resp) != v.expect {
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
}
})
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

View File

@@ -94,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
} }
} }
if !found { if !found {
libs = append(libs, info.Library) libs = append(libs, requested)
resp = append(resp, []GpuInfo{info}) resp = append(resp, []GpuInfo{info})
} }
} }

View File

@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0]) t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
} }
if res.PromptEvalCount != 8 { if res.PromptEvalCount != 6 {
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount) t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
} }
} }
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0]) t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
} }
if res.PromptEvalCount != 16 { if res.PromptEvalCount != 12 {
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount) t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
} }
} }

View File

@@ -1429,7 +1429,13 @@ struct llama_server_context
switch (task.type) switch (task.type)
{ {
case TASK_TYPE_COMPLETION: { case TASK_TYPE_COMPLETION: {
server_slot *slot = prefix_slot(task.data["prompt"]); server_slot *slot = nullptr;
if (task.embedding_mode) {
// Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
slot = slots[0].available() ? &slots[0] : nullptr;
} else {
slot = prefix_slot(task.data["prompt"]);
}
if (slot == nullptr) if (slot == nullptr)
{ {
// if no slot is available, we defer this task for processing later // if no slot is available, we defer this task for processing later

View File

@@ -261,7 +261,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true) ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi fi
init_vars init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\"" echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""

View File

@@ -373,7 +373,7 @@ function build_rocm() {
"-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe", "-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on", "-DGGML_HIPBLAS=on",
"-DLLAMA_CUDA_NO_PEER_COPY=on", "-DGGML_CUDA_NO_PEER_COPY=on",
"-DHIP_PLATFORM=amd", "-DHIP_PLATFORM=amd",
"-DGGML_AVX=on", "-DGGML_AVX=on",
"-DGGML_AVX2=off", "-DGGML_AVX2=off",

View File

@@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
return "unknown" return "unknown"
} }
func (kv KV) Kind() string {
if s, ok := kv["general.type"].(string); ok {
return s
}
return "unknown"
}
func (kv KV) ParameterCount() uint64 { func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count") return kv.u64("general.parameter_count")
} }

View File

@@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
assert.Len(t, tensors, inputLayerCount+1) assert.Len(t, tensors, inputLayerCount+1)
err = WriteGGUF(f, KV{ err = WriteGGUF(f, KV{
"general.architecture": "llama", "general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32), "llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096), "llama.embedding_length": uint32(4096),
"llama.block_count": uint32(inputLayerCount), "llama.block_count": uint32(inputLayerCount),

View File

@@ -1,60 +0,0 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
}
struct ggml_tensor * build_inp_mean() {
- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
cb(lctx.inp_mean, "inp_mean", -1);
ggml_set_input(lctx.inp_mean);
return lctx.inp_mean;
}
struct ggml_tensor * build_inp_cls() {
- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
cb(lctx.inp_cls, "inp_cls", -1);
ggml_set_input(lctx.inp_cls);
return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
sum[seq_id] += 1;
}
- std::vector<float> div(n_tokens, 0.0f);
- for (int i = 0; i < n_tokens; ++i) {
+ std::vector<float> div(cparams.n_seq_max, 0.0f);
+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
if (pos == 0) {
data[seq_id] = i;
}

View File

@@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--mlock") params = append(params, "--mlock")
} }
if gpu.IsNUMA() { if gpu.IsNUMA() && gpus[0].Library == "cpu" {
numaMode := "distribute" numaMode := "distribute"
if runtime.GOOS == "linux" { if runtime.GOOS == "linux" {
if _, err := exec.LookPath("numactl"); err == nil { if _, err := exec.LookPath("numactl"); err == nil {
@@ -409,7 +409,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} }
if err = s.cmd.Start(); err != nil { if err = s.cmd.Start(); err != nil {
// Detect permission denied and augment them essage about noexec // Detect permission denied and augment the message about noexec
if errors.Is(err, os.ErrPermission) { if errors.Is(err, os.ErrPermission) {
finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir) finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
continue continue

View File

@@ -122,8 +122,8 @@ function buildOllama() {
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
} }
function buildApp() { function buildApp() {

View File

@@ -369,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
parameters := make(map[string]any) parameters := make(map[string]any)
var layers []Layer var layers []Layer
var baseLayers []*layerGGML
for _, c := range modelfile.Commands { for _, c := range modelfile.Commands {
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
command := c.Name
switch c.Name { switch command {
case "model", "adapter": case "model", "adapter":
var baseLayers []*layerGGML if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
if name := model.ParseName(c.Args); name.IsValid() {
baseLayers, err = parseFromModel(ctx, name, fn) baseLayers, err = parseFromModel(ctx, name, fn)
if err != nil { if err != nil {
return err return err
@@ -409,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
} }
defer blob.Close() defer blob.Close()
baseLayers, err = parseFromFile(ctx, blob, digest, fn) baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
if err != nil { if err != nil {
return err return err
} }
} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil { } else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
defer file.Close() defer file.Close()
baseLayers, err = parseFromFile(ctx, file, "", fn) baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
if err != nil { if err != nil {
return err return err
} }

View File

@@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
if err := os.Rename(temp.Name(), blob); err != nil { if err := os.Rename(temp.Name(), blob); err != nil {
return Layer{}, err return Layer{}, err
} }
if err := os.Chmod(blob, 0o644); err != nil {
return Layer{}, err
}
} }
return Layer{ return Layer{

View File

@@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil return layers, nil
} }
func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
fi, err := f.Stat() fi, err := f.Stat()
if err != nil { if err != nil {
return nil, err return nil, err
@@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
defer t.Close() defer t.Close()
defer os.Remove(t.Name()) defer os.Remove(t.Name())
fn(api.ProgressResponse{Status: "converting model"}) var layerType string
if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
return nil, err switch command {
case "adapter":
var baseModel *llm.GGML
for _, l := range baseLayers {
if l.GGML != nil {
baseModel = l.GGML
break
}
}
if baseModel == nil {
return nil, fmt.Errorf("no base model specified for the adapter")
}
if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
return nil, err
}
layerType = "application/vnd.ollama.image.adapter"
case "model":
if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
return nil, err
}
layerType = "application/vnd.ollama.image.model"
} }
if _, err := t.Seek(0, io.SeekStart); err != nil { if _, err := t.Seek(0, io.SeekStart); err != nil {
return nil, err return nil, err
} }
layer, err := NewLayer(t, "application/vnd.ollama.image.model") layer, err := NewLayer(t, layerType)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
return detectChatTemplate(layers) return detectChatTemplate(layers)
} }
func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
sr := io.NewSectionReader(file, 0, 512) sr := io.NewSectionReader(file, 0, 512)
contentType, err := detectContentType(sr) contentType, err := detectContentType(sr)
if err != nil { if err != nil {
@@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
case "gguf", "ggla": case "gguf", "ggla":
// noop // noop
case "application/zip": case "application/zip":
return parseFromZipFile(ctx, file, digest, fn) return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
default: default:
return nil, fmt.Errorf("unsupported content type: %s", contentType) return nil, fmt.Errorf("unsupported content type: %s", contentType)
} }
@@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
} }
mediatype := "application/vnd.ollama.image.model" mediatype := "application/vnd.ollama.image.model"
if ggml.Name() == "ggla" { if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
mediatype = "application/vnd.ollama.image.adapter" mediatype = "application/vnd.ollama.image.adapter"
} else if ggml.KV().Architecture() == "clip" { } else if ggml.KV().Architecture() == "clip" {
mediatype = "application/vnd.ollama.image.projector" mediatype = "application/vnd.ollama.image.projector"

View File

@@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }
@@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }
@@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }

View File

@@ -463,7 +463,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
c.JSON(http.StatusOK, resp) c.JSON(http.StatusOK, resp)
} }
func (s *Server) PullModelHandler(c *gin.Context) { func (s *Server) PullHandler(c *gin.Context) {
var req api.PullRequest var req api.PullRequest
err := c.ShouldBindJSON(&req) err := c.ShouldBindJSON(&req)
switch { switch {
@@ -513,7 +513,7 @@ func (s *Server) PullModelHandler(c *gin.Context) {
streamResponse(c, ch) streamResponse(c, ch)
} }
func (s *Server) PushModelHandler(c *gin.Context) { func (s *Server) PushHandler(c *gin.Context) {
var req api.PushRequest var req api.PushRequest
err := c.ShouldBindJSON(&req) err := c.ShouldBindJSON(&req)
switch { switch {
@@ -577,7 +577,7 @@ func checkNameExists(name model.Name) error {
return nil return nil
} }
func (s *Server) CreateModelHandler(c *gin.Context) { func (s *Server) CreateHandler(c *gin.Context) {
var r api.CreateRequest var r api.CreateRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) { if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -647,7 +647,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
streamResponse(c, ch) streamResponse(c, ch)
} }
func (s *Server) DeleteModelHandler(c *gin.Context) { func (s *Server) DeleteHandler(c *gin.Context) {
var r api.DeleteRequest var r api.DeleteRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) { if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -680,7 +680,7 @@ func (s *Server) DeleteModelHandler(c *gin.Context) {
} }
} }
func (s *Server) ShowModelHandler(c *gin.Context) { func (s *Server) ShowHandler(c *gin.Context) {
var req api.ShowRequest var req api.ShowRequest
err := c.ShouldBindJSON(&req) err := c.ShouldBindJSON(&req)
switch { switch {
@@ -829,7 +829,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
return kv, nil return kv, nil
} }
func (s *Server) ListModelsHandler(c *gin.Context) { func (s *Server) ListHandler(c *gin.Context) {
ms, err := Manifests() ms, err := Manifests()
if err != nil { if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -879,7 +879,7 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
c.JSON(http.StatusOK, api.ListResponse{Models: models}) c.JSON(http.StatusOK, api.ListResponse{Models: models})
} }
func (s *Server) CopyModelHandler(c *gin.Context) { func (s *Server) CopyHandler(c *gin.Context) {
var r api.CopyRequest var r api.CopyRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) { if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -1081,33 +1081,33 @@ func (s *Server) GenerateRoutes() http.Handler {
allowedHostsMiddleware(s.addr), allowedHostsMiddleware(s.addr),
) )
r.POST("/api/pull", s.PullModelHandler) r.POST("/api/pull", s.PullHandler)
r.POST("/api/generate", s.GenerateHandler) r.POST("/api/generate", s.GenerateHandler)
r.POST("/api/chat", s.ChatHandler) r.POST("/api/chat", s.ChatHandler)
r.POST("/api/embed", s.EmbedHandler) r.POST("/api/embed", s.EmbedHandler)
r.POST("/api/embeddings", s.EmbeddingsHandler) r.POST("/api/embeddings", s.EmbeddingsHandler)
r.POST("/api/create", s.CreateModelHandler) r.POST("/api/create", s.CreateHandler)
r.POST("/api/push", s.PushModelHandler) r.POST("/api/push", s.PushHandler)
r.POST("/api/copy", s.CopyModelHandler) r.POST("/api/copy", s.CopyHandler)
r.DELETE("/api/delete", s.DeleteModelHandler) r.DELETE("/api/delete", s.DeleteHandler)
r.POST("/api/show", s.ShowModelHandler) r.POST("/api/show", s.ShowHandler)
r.POST("/api/blobs/:digest", s.CreateBlobHandler) r.POST("/api/blobs/:digest", s.CreateBlobHandler)
r.HEAD("/api/blobs/:digest", s.HeadBlobHandler) r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
r.GET("/api/ps", s.ProcessHandler) r.GET("/api/ps", s.PsHandler)
// Compatibility endpoints // Compatibility endpoints
r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler) r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler) r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler) r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler) r.GET("/v1/models", openai.ListMiddleware(), s.ListHandler)
r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler) r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowHandler)
for _, method := range []string{http.MethodGet, http.MethodHead} { for _, method := range []string{http.MethodGet, http.MethodHead} {
r.Handle(method, "/", func(c *gin.Context) { r.Handle(method, "/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running") c.String(http.StatusOK, "Ollama is running")
}) })
r.Handle(method, "/api/tags", s.ListModelsHandler) r.Handle(method, "/api/tags", s.ListHandler)
r.Handle(method, "/api/version", func(c *gin.Context) { r.Handle(method, "/api/version", func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"version": version.Version}) c.JSON(http.StatusOK, gin.H{"version": version.Version})
}) })
@@ -1269,7 +1269,7 @@ func streamResponse(c *gin.Context, ch chan any) {
}) })
} }
func (s *Server) ProcessHandler(c *gin.Context) { func (s *Server) PsHandler(c *gin.Context) {
models := []api.ProcessModelResponse{} models := []api.ProcessModelResponse{}
for _, v := range s.sched.loaded { for _, v := range s.sched.loaded {

View File

@@ -93,7 +93,7 @@ func TestCreateFromBin(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -120,7 +120,7 @@ func TestCreateFromModel(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -134,7 +134,7 @@ func TestCreateFromModel(t *testing.T) {
filepath.Join(p, "manifests", "registry.ollama.ai", "library", "test", "latest"), filepath.Join(p, "manifests", "registry.ollama.ai", "library", "test", "latest"),
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2", Name: "test2",
Modelfile: "FROM test", Modelfile: "FROM test",
Stream: &stream, Stream: &stream,
@@ -162,7 +162,7 @@ func TestCreateRemovesLayers(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -182,7 +182,7 @@ func TestCreateRemovesLayers(t *testing.T) {
filepath.Join(p, "blobs", "sha256-bc80b03733773e0728011b2f4adf34c458b400e1aad48cb28d61170f3a2ad2d6"), filepath.Join(p, "blobs", "sha256-bc80b03733773e0728011b2f4adf34c458b400e1aad48cb28d61170f3a2ad2d6"),
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -210,7 +210,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -230,7 +230,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
filepath.Join(p, "blobs", "sha256-f29e82a8284dbdf5910b1555580ff60b04238b8da9d5e51159ada67a4d0d5851"), filepath.Join(p, "blobs", "sha256-f29e82a8284dbdf5910b1555580ff60b04238b8da9d5e51159ada67a4d0d5851"),
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -267,7 +267,7 @@ func TestCreateMergeParameters(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -288,7 +288,7 @@ func TestCreateMergeParameters(t *testing.T) {
}) })
// in order to merge parameters, the second model must be created FROM the first // in order to merge parameters, the second model must be created FROM the first
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2", Name: "test2",
Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7", Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7",
Stream: &stream, Stream: &stream,
@@ -326,7 +326,7 @@ func TestCreateMergeParameters(t *testing.T) {
} }
// slices are replaced // slices are replaced
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2", Name: "test2",
Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7\nPARAMETER stop <|endoftext|>", Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7\nPARAMETER stop <|endoftext|>",
Stream: &stream, Stream: &stream,
@@ -371,7 +371,7 @@ func TestCreateReplacesMessages(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -391,7 +391,7 @@ func TestCreateReplacesMessages(t *testing.T) {
filepath.Join(p, "blobs", "sha256-e0e27d47045063ccb167ae852c51d49a98eab33fabaee4633fdddf97213e40b5"), filepath.Join(p, "blobs", "sha256-e0e27d47045063ccb167ae852c51d49a98eab33fabaee4633fdddf97213e40b5"),
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2", Name: "test2",
Modelfile: "FROM test\nMESSAGE assistant \"You're a test, Harry.\"\nMESSAGE user \"I-I'm a what?\"\nMESSAGE assistant \"A test. And a thumping good one at that, I'd wager.\"", Modelfile: "FROM test\nMESSAGE assistant \"You're a test, Harry.\"\nMESSAGE user \"I-I'm a what?\"\nMESSAGE assistant \"A test. And a thumping good one at that, I'd wager.\"",
Stream: &stream, Stream: &stream,
@@ -448,7 +448,7 @@ func TestCreateTemplateSystem(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -488,7 +488,7 @@ func TestCreateTemplateSystem(t *testing.T) {
} }
t.Run("incomplete template", func(t *testing.T) { t.Run("incomplete template", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -500,7 +500,7 @@ func TestCreateTemplateSystem(t *testing.T) {
}) })
t.Run("template with unclosed if", func(t *testing.T) { t.Run("template with unclosed if", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -512,7 +512,7 @@ func TestCreateTemplateSystem(t *testing.T) {
}) })
t.Run("template with undefined function", func(t *testing.T) { t.Run("template with undefined function", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ Prompt }}", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -531,7 +531,7 @@ func TestCreateLicenses(t *testing.T) {
t.Setenv("OLLAMA_MODELS", p) t.Setenv("OLLAMA_MODELS", p)
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -579,7 +579,7 @@ func TestCreateDetectTemplate(t *testing.T) {
var s Server var s Server
t.Run("matched", func(t *testing.T) { t.Run("matched", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", "tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
@@ -600,7 +600,7 @@ func TestCreateDetectTemplate(t *testing.T) {
}) })
t.Run("unmatched", func(t *testing.T) { t.Run("unmatched", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,

View File

@@ -22,7 +22,7 @@ func TestDelete(t *testing.T) {
var s Server var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test", Name: "test",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
}) })
@@ -31,7 +31,7 @@ func TestDelete(t *testing.T) {
t.Fatalf("expected status code 200, actual %d", w.Code) t.Fatalf("expected status code 200, actual %d", w.Code)
} }
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2", Name: "test2",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
}) })
@@ -52,7 +52,7 @@ func TestDelete(t *testing.T) {
filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"), filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
}) })
w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"}) w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code) t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -68,7 +68,7 @@ func TestDelete(t *testing.T) {
filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"), filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
}) })
w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test2"}) w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test2"})
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code) t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -102,7 +102,7 @@ func TestDeleteDuplicateLayers(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"}) w := createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
t.Errorf("expected status code 200, actual %d", w.Code) t.Errorf("expected status code 200, actual %d", w.Code)
} }

View File

@@ -84,7 +84,7 @@ func TestGenerateChat(t *testing.T) {
go s.sched.Run(context.TODO()) go s.sched.Run(context.TODO())
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test", Model: "test",
Modelfile: fmt.Sprintf(`FROM %s Modelfile: fmt.Sprintf(`FROM %s
TEMPLATE """ TEMPLATE """
@@ -144,7 +144,7 @@ func TestGenerateChat(t *testing.T) {
}) })
t.Run("missing capabilities chat", func(t *testing.T) { t.Run("missing capabilities chat", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert", Model: "bert",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
"general.architecture": "bert", "general.architecture": "bert",
@@ -270,7 +270,7 @@ func TestGenerateChat(t *testing.T) {
checkChatResponse(t, w.Body, "test", "Hi!") checkChatResponse(t, w.Body, "test", "Hi!")
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-system", Model: "test-system",
Modelfile: "FROM test\nSYSTEM You are a helpful assistant.", Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
}) })
@@ -382,7 +382,7 @@ func TestGenerate(t *testing.T) {
go s.sched.Run(context.TODO()) go s.sched.Run(context.TODO())
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test", Model: "test",
Modelfile: fmt.Sprintf(`FROM %s Modelfile: fmt.Sprintf(`FROM %s
TEMPLATE """ TEMPLATE """
@@ -442,7 +442,7 @@ func TestGenerate(t *testing.T) {
}) })
t.Run("missing capabilities generate", func(t *testing.T) { t.Run("missing capabilities generate", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert", Model: "bert",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
"general.architecture": "bert", "general.architecture": "bert",
@@ -583,7 +583,7 @@ func TestGenerate(t *testing.T) {
checkGenerateResponse(t, w.Body, "test", "Hi!") checkGenerateResponse(t, w.Body, "test", "Hi!")
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-system", Model: "test-system",
Modelfile: "FROM test\nSYSTEM You are a helpful assistant.", Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
}) })
@@ -652,7 +652,7 @@ func TestGenerate(t *testing.T) {
checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!") checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
}) })
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-suffix", Model: "test-suffix",
Modelfile: `FROM test Modelfile: `FROM test
TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID> TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>

View File

@@ -31,13 +31,13 @@ func TestList(t *testing.T) {
var s Server var s Server
for _, n := range expectNames { for _, n := range expectNames {
createRequest(t, s.CreateModelHandler, api.CreateRequest{ createRequest(t, s.CreateHandler, api.CreateRequest{
Name: n, Name: n,
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
}) })
} }
w := createRequest(t, s.ListModelsHandler, nil) w := createRequest(t, s.ListHandler, nil)
if w.Code != http.StatusOK { if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code) t.Fatalf("expected status code 200, actual %d", w.Code)
} }

View File

@@ -318,7 +318,7 @@ func TestCase(t *testing.T) {
var s Server var s Server
for _, tt := range cases { for _, tt := range cases {
t.Run(tt, func(t *testing.T) { t.Run(tt, func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: tt, Name: tt,
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -334,7 +334,7 @@ func TestCase(t *testing.T) {
} }
t.Run("create", func(t *testing.T) { t.Run("create", func(t *testing.T) {
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: strings.ToUpper(tt), Name: strings.ToUpper(tt),
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream, Stream: &stream,
@@ -350,7 +350,7 @@ func TestCase(t *testing.T) {
}) })
t.Run("pull", func(t *testing.T) { t.Run("pull", func(t *testing.T) {
w := createRequest(t, s.PullModelHandler, api.PullRequest{ w := createRequest(t, s.PullHandler, api.PullRequest{
Name: strings.ToUpper(tt), Name: strings.ToUpper(tt),
Stream: &stream, Stream: &stream,
}) })
@@ -365,7 +365,7 @@ func TestCase(t *testing.T) {
}) })
t.Run("copy", func(t *testing.T) { t.Run("copy", func(t *testing.T) {
w := createRequest(t, s.CopyModelHandler, api.CopyRequest{ w := createRequest(t, s.CopyHandler, api.CopyRequest{
Source: tt, Source: tt,
Destination: strings.ToUpper(tt), Destination: strings.ToUpper(tt),
}) })
@@ -387,7 +387,7 @@ func TestShow(t *testing.T) {
var s Server var s Server
createRequest(t, s.CreateModelHandler, api.CreateRequest{ createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "show-model", Name: "show-model",
Modelfile: fmt.Sprintf( Modelfile: fmt.Sprintf(
"FROM %s\nFROM %s", "FROM %s\nFROM %s",
@@ -396,7 +396,7 @@ func TestShow(t *testing.T) {
), ),
}) })
w := createRequest(t, s.ShowModelHandler, api.ShowRequest{ w := createRequest(t, s.ShowHandler, api.ShowRequest{
Name: "show-model", Name: "show-model",
}) })

View File

@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// Embedding models should always be loaded with parallel=1
if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
numParallel = 1
}
// Evaluate if the model will fit in the available system memory, or if we should unload a model first // Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" { if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode // simplifying assumption of defaultParallel when in CPU mode

View File

@@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
require.NoError(t, llm.WriteGGUF(f, llm.KV{ require.NoError(t, llm.WriteGGUF(f, llm.KV{
"general.architecture": "llama", "general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32), "llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096), "llama.embedding_length": uint32(4096),
"llama.block_count": uint32(1), "llama.block_count": uint32(1),