diff --git a/.gitattributes b/.gitattributes
index 4e8ada93..10c21444 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -19,6 +19,8 @@ ml/backend/**/*.comp linguist-vendored
ml/backend/**/*.glsl linguist-vendored
ml/backend/**/CMakeLists.txt linguist-vendored
+app/webview linguist-vendored
+
llama/build-info.cpp linguist-generated
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
diff --git a/.golangci.yaml b/.golangci.yaml
index b211e5de..5a425413 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -1,77 +1,51 @@
version: "2"
linters:
- default: none
enable:
- asasalint
- bidichk
- bodyclose
- containedctx
- - copyloopvar
- - errcheck
- - errorlint
- - exptostd
- gocheckcompilerdirectives
- - gocritic
- - govet
- - ineffassign
- intrange
- makezero
- misspell
- - modernize
- nilerr
- - nilnil
- nolintlint
- nosprintfhostport
- - perfsprint
- - prealloc
- - sloglint
- - staticcheck
- unconvert
- - unused
- - usestdlibvars
- usetesting
- wastedassign
- whitespace
+ disable:
+ - errcheck
+ - usestdlibvars
settings:
- errcheck:
- exclude-functions:
- - fmt.Fprintf
- perfsprint:
- strconcat: false
- concat-loop: false
+ govet:
+ disable:
+ - unusedresult
staticcheck:
checks:
- all
- # Using a deprecated function, variable, constant or field.
- # https://staticcheck.dev/docs/checks/#SA1019
+ - -QF* # disable quick fix suggestions
- -SA1019
- # Incorrect or missing package comment.
- # https://staticcheck.dev/docs/checks/#ST1000
- - -ST1000
- # Poorly chosen identifier.
- # https://staticcheck.dev/docs/checks/#ST1003
- - -ST1003
- # The documentation of an exported function should start with the function's name.
- # https://staticcheck.dev/docs/checks/#ST1020
- - -ST1020
- # The documentation of an exported type should start with type's name.
- # https://staticcheck.dev/docs/checks/#ST1021
- - -ST1021
- # The documentation of an exported variable or constant should start with variable's name.
- # https://staticcheck.dev/docs/checks/#ST1022
- - -ST1022
- usestdlibvars:
- http-method: false
- http-status-code: false
-
+ - -ST1000 # package comment format
+ - -ST1003 # underscores in package names
+ - -ST1005 # error strings should not be capitalized
+ - -ST1012 # error var naming (ErrFoo)
+ - -ST1016 # receiver name consistency
+ - -ST1020 # comment on exported function format
+ - -ST1021 # comment on exported type format
+ - -ST1022 # comment on exported var format
+ - -ST1023 # omit type from declaration
+severity:
+ default: error
+ rules:
+ - linters:
+ - gofmt
+ - goimports
+ - intrange
+ severity: info
formatters:
enable:
- - gci
- gofmt
- gofumpt
- settings:
- gci:
- sections:
- - standard
- - default
- - localmodule
diff --git a/Makefile.sync b/Makefile.sync
index b1fcde45..4991ad84 100644
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
-FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc
+FETCH_HEAD=7f8ef50cce40e3e7e4526a3696cb45658190e69a
.PHONY: help
help:
diff --git a/api/client.go b/api/client.go
index 0d4c97ba..9a8f89e4 100644
--- a/api/client.go
+++ b/api/client.go
@@ -226,7 +226,14 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
bts := scanner.Bytes()
if err := json.Unmarshal(bts, &errorResponse); err != nil {
- return fmt.Errorf("unmarshal: %w", err)
+ if response.StatusCode >= http.StatusBadRequest {
+ return StatusError{
+ StatusCode: response.StatusCode,
+ Status: response.Status,
+ ErrorMessage: string(bts),
+ }
+ }
+ return errors.New(string(bts))
}
if response.StatusCode == http.StatusUnauthorized {
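With this change, a non-JSON error body is surfaced either as an `api.StatusError` (when the HTTP status signals a failure) or as a plain error built from the raw bytes. A minimal sketch of how a caller might tell the two apart, assuming a streaming chat request; the model name and prompt are arbitrary:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.ChatRequest{
		Model:    "qwen3",
		Messages: []api.Message{{Role: "user", Content: "hello"}},
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		fmt.Print(resp.Message.Content)
		return nil
	})

	var statusErr api.StatusError
	switch {
	case errors.As(err, &statusErr):
		// non-2xx response; ErrorMessage now carries the raw body even when it is not JSON
		log.Fatalf("server returned %d: %s", statusErr.StatusCode, statusErr.ErrorMessage)
	case err != nil:
		log.Fatal(err)
	}
}
```

Because `StatusError` keeps the original status code, a caller can, for example, retry on 5xx responses while treating 4xx as permanent.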
diff --git a/api/client_test.go b/api/client_test.go
index f0034e02..827e41d9 100644
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -55,6 +55,7 @@ func TestClientFromEnvironment(t *testing.T) {
type testError struct {
message string
statusCode int
+ raw bool // if true, write message as-is instead of JSON encoding
}
func (e testError) Error() string {
@@ -111,6 +112,20 @@ func TestClientStream(t *testing.T) {
},
},
},
+ {
+ name: "plain text error response",
+ responses: []any{
+ "internal server error",
+ },
+ wantErr: "internal server error",
+ },
+ {
+ name: "HTML error page",
+ responses: []any{
+				"404 Not Found",
+ },
+ wantErr: "404 Not Found",
+ },
}
for _, tc := range testCases {
@@ -135,6 +150,12 @@ func TestClientStream(t *testing.T) {
return
}
+ if str, ok := resp.(string); ok {
+ fmt.Fprintln(w, str)
+ flusher.Flush()
+ continue
+ }
+
if err := json.NewEncoder(w).Encode(resp); err != nil {
t.Fatalf("failed to encode response: %v", err)
}
@@ -173,9 +194,10 @@ func TestClientStream(t *testing.T) {
func TestClientDo(t *testing.T) {
testCases := []struct {
- name string
- response any
- wantErr string
+ name string
+ response any
+ wantErr string
+ wantStatusCode int
}{
{
name: "immediate error response",
@@ -183,7 +205,8 @@ func TestClientDo(t *testing.T) {
message: "test error message",
statusCode: http.StatusBadRequest,
},
- wantErr: "test error message",
+ wantErr: "test error message",
+ wantStatusCode: http.StatusBadRequest,
},
{
name: "server error response",
@@ -191,7 +214,8 @@ func TestClientDo(t *testing.T) {
message: "internal error",
statusCode: http.StatusInternalServerError,
},
- wantErr: "internal error",
+ wantErr: "internal error",
+ wantStatusCode: http.StatusInternalServerError,
},
{
name: "successful response",
@@ -203,6 +227,26 @@ func TestClientDo(t *testing.T) {
Success: true,
},
},
+ {
+ name: "plain text error response",
+ response: testError{
+ message: "internal server error",
+ statusCode: http.StatusInternalServerError,
+ raw: true,
+ },
+ wantErr: "internal server error",
+ wantStatusCode: http.StatusInternalServerError,
+ },
+ {
+ name: "HTML error page",
+ response: testError{
+ message: "404 Not Found",
+ statusCode: http.StatusNotFound,
+ raw: true,
+ },
+ wantErr: "404 Not Found",
+ wantStatusCode: http.StatusNotFound,
+ },
}
for _, tc := range testCases {
@@ -210,11 +254,16 @@ func TestClientDo(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if errResp, ok := tc.response.(testError); ok {
w.WriteHeader(errResp.statusCode)
- err := json.NewEncoder(w).Encode(map[string]string{
- "error": errResp.message,
- })
- if err != nil {
- t.Fatal("failed to encode error response:", err)
+ if !errResp.raw {
+ err := json.NewEncoder(w).Encode(map[string]string{
+ "error": errResp.message,
+ })
+ if err != nil {
+ t.Fatal("failed to encode error response:", err)
+ }
+ } else {
+ // Write raw message (simulates non-JSON error responses)
+ fmt.Fprint(w, errResp.message)
}
return
}
@@ -241,6 +290,15 @@ func TestClientDo(t *testing.T) {
if err.Error() != tc.wantErr {
t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
}
+ if tc.wantStatusCode != 0 {
+ if statusErr, ok := err.(StatusError); ok {
+ if statusErr.StatusCode != tc.wantStatusCode {
+ t.Errorf("status code mismatch: got %d, want %d", statusErr.StatusCode, tc.wantStatusCode)
+ }
+ } else {
+ t.Errorf("expected StatusError, got %T", err)
+ }
+ }
return
}
diff --git a/api/examples/chat/main.go b/api/examples/chat/main.go
index 07430305..b44a1ec9 100644
--- a/api/examples/chat/main.go
+++ b/api/examples/chat/main.go
@@ -15,19 +15,19 @@ func main() {
}
messages := []api.Message{
- api.Message{
+ {
Role: "system",
Content: "Provide very brief, concise responses",
},
- api.Message{
+ {
Role: "user",
Content: "Name some unusual animals",
},
- api.Message{
+ {
Role: "assistant",
Content: "Monotreme, platypus, echidna",
},
- api.Message{
+ {
Role: "user",
Content: "which of these is the most dangerous?",
},
diff --git a/app/ui/app/src/api.ts b/app/ui/app/src/api.ts
index a06e4e00..a701a30a 100644
--- a/app/ui/app/src/api.ts
+++ b/app/ui/app/src/api.ts
@@ -204,12 +204,10 @@ export async function* sendMessage(
data: uint8ArrayToBase64(att.data),
}));
- // Only send think parameter when actually requesting thinking
- // Don't send false as it causes issues with some providers
+ // Send think parameter when it's explicitly set (true, false, or a non-empty string).
const shouldSendThink =
think !== undefined &&
- ((typeof think === "boolean" && think) ||
- (typeof think === "string" && think !== ""));
+ (typeof think === "boolean" || (typeof think === "string" && think !== ""));
const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
method: "POST",
diff --git a/cmd/bench/bench.go b/cmd/bench/bench.go
index 25df1817..53721f87 100644
--- a/cmd/bench/bench.go
+++ b/cmd/bench/bench.go
@@ -48,8 +48,8 @@ func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool)
case "benchstat":
if verbose {
printHeader := func() {
- fmt.Printf("sysname: %s\n", runtime.GOOS)
- fmt.Printf("machine: %s\n", runtime.GOARCH)
+ fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
+ fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
}
once.Do(printHeader)
}
@@ -147,6 +147,17 @@ func BenchmarkChat(fOpt flagOptions) error {
return err
}
+ var out io.Writer = os.Stdout
+ if fOpt.outputFile != nil && *fOpt.outputFile != "" {
+ f, err := os.OpenFile(*fOpt.outputFile, os.O_CREATE|os.O_WRONLY, 0o644)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "ERROR: cannot open output file %s: %v\n", *fOpt.outputFile, err)
+ return err
+ }
+ defer f.Close()
+ out = f
+ }
+
for _, model := range models {
for range *fOpt.epochs {
options := make(map[string]interface{})
@@ -241,13 +252,14 @@ func BenchmarkChat(fOpt flagOptions) error {
},
}
- OutputMetrics(os.Stdout, *fOpt.format, metrics, *fOpt.verbose)
+ OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)
if *fOpt.keepAlive > 0 {
time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
}
}
}
+
return nil
}
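The benchmark now writes metrics through an `io.Writer` that defaults to stdout and switches to a file when an output path is supplied. A minimal sketch of that pattern outside the benchmark harness; the `-o` flag name here is illustrative, the real option lives in `flagOptions`:

```go
package main

import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
)

func main() {
	outputFile := flag.String("o", "", "optional file to write results to")
	flag.Parse()

	var out io.Writer = os.Stdout
	if *outputFile != "" {
		// mirrors the flags used above: create the file if needed, write from the start
		f, err := os.OpenFile(*outputFile, os.O_CREATE|os.O_WRONLY, 0o644)
		if err != nil {
			log.Fatalf("cannot open output file %s: %v", *outputFile, err)
		}
		defer f.Close()
		out = f
	}

	fmt.Fprintln(out, "sysname: linux") // stands in for OutputMetrics(out, ...)
}
```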
diff --git a/cmd/cmd.go b/cmd/cmd.go
index a6729940..d77bb2c5 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1430,7 +1430,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
latest.Summary()
}
- return &api.Message{Role: role, Content: fullResponse.String()}, nil
+ return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
}
func generate(cmd *cobra.Command, opts runOptions) error {
diff --git a/convert/convert.go b/convert/convert.go
index f6afd8a3..15e31bf2 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -208,6 +208,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
conv = &gptossModel{}
case "DeepseekOCRForCausalLM":
conv = &deepseekocr{}
+ case "DeepseekV3ForCausalLM":
+ conv = &deepseek2Model{}
default:
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
}
diff --git a/convert/convert_deepseek2.go b/convert/convert_deepseek2.go
new file mode 100644
index 00000000..aa620327
--- /dev/null
+++ b/convert/convert_deepseek2.go
@@ -0,0 +1,173 @@
+package convert
+
+import (
+ "cmp"
+ "fmt"
+ "log/slog"
+ "regexp"
+ "strconv"
+
+ "github.com/ollama/ollama/fs/ggml"
+)
+
+type deepseek2Model struct {
+ ModelParameters // architectures, vocab_size
+ MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
+ HiddenSize uint32 `json:"hidden_size"`
+ HiddenLayers uint32 `json:"num_hidden_layers"`
+ IntermediateSize uint32 `json:"intermediate_size"`
+ NumAttentionHeads uint32 `json:"num_attention_heads"`
+ NumKeyValueHeads uint32 `json:"num_key_value_heads"`
+ RMSNormEPS float32 `json:"rms_norm_eps"`
+
+ RopeTheta float32 `json:"rope_theta"`
+ QKNopeHeadDim uint32 `json:"qk_nope_head_dim"`
+ QKRopeHeadDim uint32 `json:"qk_rope_head_dim"`
+ KVLoraRank uint32 `json:"kv_lora_rank"`
+ QLoraRank uint32 `json:"q_lora_rank"`
+ VHeadDim uint32 `json:"v_head_dim"`
+
+ ExpertCount uint32 `json:"n_routed_experts"`
+ ExpertSharedCount uint32 `json:"n_shared_experts"`
+ ExpertIntermediateSize uint32 `json:"moe_intermediate_size"`
+ ExpertUsedCount uint32 `json:"num_experts_per_tok"`
+ ExpertWeightsNorm bool `json:"norm_topk_prob"`
+ ExpertWeightsScale float32 `json:"routed_scaling_factor"`
+
+ ScoringFunc string `json:"scoring_func"`
+ LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
+
+ RopeScaling struct {
+ Factor float32 `json:"factor"`
+ OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
+ Type string `json:"type"`
+ MScaleAllDim float32 `json:"mscale_all_dim"`
+ } `json:"rope_scaling"`
+
+ Architecture string
+}
+
+func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
+ kv := p.ModelParameters.KV(t)
+ kv["general.architecture"] = "deepseek2"
+ kv["general.type"] = "model"
+ kv["deepseek2.block_count"] = p.HiddenLayers
+
+ numHeads := p.NumAttentionHeads
+ numKVHeads := p.NumKeyValueHeads
+
+ kv["deepseek2.attention.head_count"] = numHeads
+ kv["deepseek2.attention.head_count_kv"] = numKVHeads
+ kv["deepseek2.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
+ kv["deepseek2.attention.kv_lora_rank"] = p.KVLoraRank
+ kv["deepseek2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+ kv["deepseek2.attention.q_lora_rank"] = p.QLoraRank
+ kv["deepseek2.attention.value_length"] = p.VHeadDim
+ kv["deepseek2.context_length"] = p.MaxPositionEmbeddings
+ kv["deepseek2.embedding_length"] = p.HiddenSize
+ kv["deepseek2.expert_count"] = p.ExpertCount
+ kv["deepseek2.expert_feed_forward_length"] = p.ExpertIntermediateSize
+ kv["deepseek2.expert_shared_count"] = p.ExpertSharedCount
+
+ var scoringFunc uint32
+ switch p.ScoringFunc {
+ case "softmax":
+ // not currently supported in the model, but needed for Deepseek-OCR
+ scoringFunc = 1
+ case "sigmoid":
+ scoringFunc = 2
+ }
+ kv["deepseek2.expert_gating_func"] = scoringFunc
+ kv["deepseek2.expert_used_count"] = p.ExpertUsedCount
+ kv["deepseek2.expert_weights_norm"] = p.ExpertWeightsNorm
+ kv["deepseek2.expert_weights_scale"] = p.ExpertWeightsScale
+ kv["deepseek2.feed_forward_length"] = p.IntermediateSize
+ kv["deepseek2.leading_dense_block_count"] = p.LeadingDenseBlockCount
+
+ kv["deepseek2.rope.dimension_count"] = p.QKRopeHeadDim
+ kv["deepseek2.rope.freq_base"] = cmp.Or(p.RopeTheta, 10000.0)
+ kv["deepseek2.rope.scaling.factor"] = p.RopeScaling.Factor
+ kv["deepseek2.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
+ kv["deepseek2.rope.scaling.type"] = p.RopeScaling.Type
+ kv["deepseek2.rope.scaling.yarn_log_multiplier"] = 0.1 * p.RopeScaling.MScaleAllDim
+
+ kv["tokenizer.ggml.pre"] = "deepseek-v3"
+
+ return kv
+}
+
+func (p *deepseek2Model) Replacements() []string {
+ return []string{
+ "lm_head", "output",
+ "model.embed_tokens", "token_embd",
+ "model.norm", "output_norm",
+ "language_model.", "",
+ "model.layers", "blk",
+ "input_layernorm", "attn_norm",
+ "self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
+ "self_attn.kv_a_layernorm", "attn_kv_a_norm",
+ "self_attn.kv_b_proj", "attn_kv_b",
+ "self_attn.q_a_proj", "attn_q_a",
+ "self_attn.q_a_layernorm", "attn_q_a_norm",
+ "self_attn.q_b_proj", "attn_q_b",
+ "self_attn.o_proj", "attn_output",
+ "post_attention_layernorm", "ffn_norm",
+ "mlp.shared_experts.down_proj", "ffn_down_shexp",
+ "mlp.shared_experts.gate_proj", "ffn_gate_shexp",
+ "mlp.shared_experts.up_proj", "ffn_up_shexp",
+ "mlp.gate_proj", "ffn_gate",
+ "mlp.down_proj", "ffn_down",
+ "mlp.up_proj", "ffn_up",
+ "mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
+ "mlp.gate", "ffn_gate_inp",
+ }
+}
+
+func (p *deepseek2Model) Tensors(s []Tensor) (out []*ggml.Tensor) {
+ merges := make([]merge, p.HiddenLayers*3)
+ for i := range p.HiddenLayers {
+ merges[i*3+0] = merge{
+ fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
+ fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+ }
+ merges[i*3+1] = merge{
+ fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
+ fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+ }
+ merges[i*3+2] = merge{
+ fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
+ fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+ }
+ }
+
+ skipLayer := func(n string, minValue uint32) bool {
+ re := regexp.MustCompile(`^blk\.(\d+)`)
+ matches := re.FindStringSubmatch(n)
+ if matches == nil {
+ return false
+ }
+
+ blkNum, err := strconv.Atoi(matches[1])
+ if err != nil {
+ return false
+ }
+
+ return uint32(blkNum) >= minValue
+ }
+
+ out, s = mergeTensors(s, merges...)
+ for _, t := range s {
+ // skip any additional layers (such as the Multi-Token Prediction layer)
+ if skipLayer(t.Name(), p.HiddenLayers) {
+ slog.Debug("skipping layer", "name", t.Name())
+ continue
+ }
+ out = append(out, &ggml.Tensor{
+ Name: t.Name(),
+ Kind: t.Kind(),
+ Shape: t.Shape(),
+ WriterTo: t,
+ })
+ }
+ return out
+}
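`Tensors` merges the per-expert projection weights into single `ffn_*_exps` tensors and drops any block whose index is at or beyond `num_hidden_layers`, which is how extra blocks such as the Multi-Token Prediction layer are skipped. A standalone sketch of that block-index check, with a hypothetical layer count:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

var blkRE = regexp.MustCompile(`^blk\.(\d+)`)

// skipLayer reports whether a tensor belongs to a block at or beyond minValue,
// e.g. a Multi-Token Prediction layer appended after the regular hidden layers.
func skipLayer(name string, minValue uint32) bool {
	matches := blkRE.FindStringSubmatch(name)
	if matches == nil {
		return false
	}
	blkNum, err := strconv.Atoi(matches[1])
	if err != nil {
		return false
	}
	return uint32(blkNum) >= minValue
}

func main() {
	const hiddenLayers = 61 // hypothetical num_hidden_layers
	fmt.Println(skipLayer("blk.60.ffn_up_exps.weight", hiddenLayers)) // false: regular layer
	fmt.Println(skipLayer("blk.61.attn_norm.weight", hiddenLayers))   // true: extra layer, skipped
	fmt.Println(skipLayer("output_norm.weight", hiddenLayers))        // false: not a block tensor
}
```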
diff --git a/convert/convert_mistral.go b/convert/convert_mistral.go
index a6fd4c41..81774853 100644
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -29,6 +29,15 @@ type mistral3Model struct {
SlidingWindow *uint32 `json:"sliding_window"`
HiddenAct string `json:"hidden_act"`
VocabSize uint32 `json:"vocab_size"`
+ RopeParameters struct {
+ BetaFast float32 `json:"beta_fast"`
+ BetaSlow float32 `json:"beta_slow"`
+ Factor float32 `json:"factor"`
+ ScalingBeta float32 `json:"llama_4_scaling_beta"`
+ OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
+ RopeType string `json:"rope_type"`
+ RopeTheta float32 `json:"rope_theta"`
+ } `json:"rope_parameters"`
} `json:"text_config"`
VisionModel struct {
NumAttentionHeads uint32 `json:"num_attention_heads"`
@@ -61,8 +70,13 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
- kv["mistral3.rope.dimension_count"] = p.TextModel.HiddenSize / p.TextModel.NumHiddenLayers
- kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta
+ kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
+ kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+
+ if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+ kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
+ kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
+ }
// Vision configuration
kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
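The RoPE fixes above rely on `cmp.Or`, which returns its first non-zero argument, so values missing from `config.json` (and therefore zero after unmarshalling) fall back to a computed or nested default. A small sketch with hypothetical config values:

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	// head_dim absent from config.json, so it is zero after unmarshalling
	var headDim, hiddenSize, numAttentionHeads uint32 = 0, 4096, 32

	// falls back to hidden_size / num_attention_heads only when head_dim is missing
	fmt.Println(cmp.Or(headDim, hiddenSize/numAttentionHeads)) // 128

	// same idea for rope_theta: prefer the top-level value, else rope_parameters.rope_theta
	var ropeTheta, nestedRopeTheta float32 = 0, 1e6
	fmt.Println(cmp.Or(ropeTheta, nestedRopeTheta)) // 1e+06
}
```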
diff --git a/discover/runner.go b/discover/runner.go
index 0915594d..c963de6f 100644
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -65,6 +65,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
}
slog.Info("discovering available GPUs...")
+ detectIncompatibleLibraries()
// Warn if any user-overrides are set which could lead to incorrect GPU discovery
overrideWarnings()
@@ -98,6 +99,9 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
continue
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
continue
+ } else if jetpack == "" && strings.Contains(filepath.Base(dir), "cuda_jetpack") {
+ slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
+ continue
} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
slog.Info("experimental Vulkan support disabled. To enable, set OLLAMA_VULKAN=1")
continue
@@ -143,7 +147,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
wg.Add(1)
go func(i int) {
defer wg.Done()
- extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
+			extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1], true)
devices[i].AddInitValidation(extraEnvs)
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
@@ -329,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
defer cancel()
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
- devFilter := ml.GetVisibleDevicesEnv(devices)
+ // We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
+ devFilter := ml.GetVisibleDevicesEnv(devices, false)
for dir := range libDirs {
updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
@@ -484,3 +489,16 @@ func overrideWarnings() {
slog.Warn("if GPUs are not correctly discovered, unset and try again")
}
}
+
+func detectIncompatibleLibraries() {
+ if runtime.GOOS != "windows" {
+ return
+ }
+ basePath, err := exec.LookPath("ggml-base.dll")
+ if err != nil || basePath == "" {
+ return
+ }
+ if !strings.HasPrefix(basePath, ml.LibOllamaPath) {
+ slog.Warn("potentially incompatible library detected in PATH", "location", basePath)
+ }
+}
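`detectIncompatibleLibraries` warns when a `ggml-base.dll` resolved from `PATH` lives outside Ollama's own library directory, since a stray copy can shadow the bundled backends. A minimal sketch of the same check, with a hypothetical trusted directory:

```go
package main

import (
	"log/slog"
	"os/exec"
	"strings"
)

// warnIfShadowed logs a warning when libName resolves on PATH to a location
// outside trustedDir. On systems where the library is not on PATH it does nothing.
func warnIfShadowed(libName, trustedDir string) {
	p, err := exec.LookPath(libName)
	if err != nil || p == "" {
		return
	}
	if !strings.HasPrefix(p, trustedDir) {
		slog.Warn("potentially incompatible library detected in PATH", "location", p)
	}
}

func main() {
	// the trusted directory here is illustrative; the real code uses ml.LibOllamaPath
	warnIfShadowed("ggml-base.dll", `C:\Program Files\Ollama\lib\ollama`)
}
```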
diff --git a/docs/capabilities/tool-calling.mdx b/docs/capabilities/tool-calling.mdx
index ae1ff959..30c994d9 100644
--- a/docs/capabilities/tool-calling.mdx
+++ b/docs/capabilities/tool-calling.mdx
@@ -15,7 +15,7 @@ Also known as "single-shot" tool calling.
```shell
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
"model": "qwen3",
- "messages": [{"role": "user", "content": "What's the temperature in New York?"}],
+ "messages": [{"role": "user", "content": "What is the temperature in New York?"}],
"stream": false,
"tools": [
{
@@ -41,7 +41,7 @@ Also known as "single-shot" tool calling.
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
"model": "qwen3",
"messages": [
- {"role": "user", "content": "What's the temperature in New York?"},
+ {"role": "user", "content": "What is the temperature in New York?"},
{
"role": "assistant",
"tool_calls": [
@@ -90,7 +90,7 @@ Also known as "single-shot" tool calling.
}
return temperatures.get(city, "Unknown")
- messages = [{"role": "user", "content": "What's the temperature in New York?"}]
+ messages = [{"role": "user", "content": "What is the temperature in New York?"}]
# pass functions directly as tools in the tools list or as a JSON schema
response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
@@ -146,7 +146,7 @@ Also known as "single-shot" tool calling.
},
]
- const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
+ const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
const response = await ollama.chat({
model: 'qwen3',
@@ -609,7 +609,7 @@ def get_temperature(city: str) -> str:
return temperatures.get(city, 'Unknown')
-messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]
+messages = [{'role': 'user', 'content': "What is the temperature in New York?"}]
while True:
stream = chat(
@@ -684,7 +684,7 @@ const getTemperatureTool = {
}
async function agentLoop() {
- const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
+ const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
while (true) {
const stream = await ollama.chat({
diff --git a/docs/development.md b/docs/development.md
index ff07b5fb..d0120a19 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -49,6 +49,8 @@ Install prerequisites:
- [Ninja](https://github.com/ninja-build/ninja/releases)
- (Optional) NVIDIA GPU support
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
+- (Optional) Vulkan GPU support
+  - [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
Then, configure and build the project:
@@ -57,6 +59,17 @@ cmake -B build
cmake --build build --config Release
```
+> Building for Vulkan requires the VULKAN_SDK environment variable to be set:
+>
+> PowerShell
+> ```powershell
+> $env:VULKAN_SDK="C:\VulkanSDK\"
+> ```
+> CMD
+> ```cmd
+> set VULKAN_SDK=C:\VulkanSDK\
+> ```
+
> [!IMPORTANT]
> Building for ROCm requires additional flags:
> ```
@@ -65,6 +78,7 @@ cmake --build build --config Release
> ```
+
Lastly, run Ollama:
```shell
@@ -84,7 +98,9 @@ Install prerequisites:
- [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
- (Optional) NVIDIA GPU support
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
-
+- (Optional) Vulkan GPU support
+  - [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
+  - Or install via a package manager: `sudo apt install vulkan-sdk` (Ubuntu/Debian) or `sudo dnf install vulkan-sdk` (Fedora/CentOS)
> [!IMPORTANT]
> Ensure prerequisites are in `PATH` before running CMake.
diff --git a/docs/faq.mdx b/docs/faq.mdx
index d9398e9d..13ef8a25 100644
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -57,8 +57,13 @@ ollama ps
```
- **Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
- 100% GPU 4 minutes from now ```
+
+**Output**:
+
+```
+NAME ID SIZE PROCESSOR UNTIL
+llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
+```
The `Processor` column will show which memory the model was loaded in to:
@@ -385,4 +390,4 @@ Ollama for Windows and macOS register as a login item during installation. You
- In `Task Manager` go to the `Startup apps` tab, search for `ollama` then click `Disable`
**MacOS**
-- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
\ No newline at end of file
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
diff --git a/docs/modelfile.mdx b/docs/modelfile.mdx
index c91d7310..a3eca207 100644
--- a/docs/modelfile.mdx
+++ b/docs/modelfile.mdx
@@ -149,9 +149,6 @@ PARAMETER
| Parameter | Description | Value Type | Example Usage |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
-| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
-| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 6ce9724f..4004bbfd 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -831,6 +831,7 @@ func (f GGML) FlashAttention() bool {
return slices.Contains([]string{
"gemma3",
"gptoss", "gpt-oss",
+ "mistral3",
"qwen3", "qwen3moe",
"qwen3vl", "qwen3vlmoe",
}, f.KV().String("general.architecture"))
diff --git a/integration/llm_image_test.go b/integration/llm_image_test.go
index e1c16baf..1a99ddd2 100644
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -33,6 +33,9 @@ func TestVisionModels(t *testing.T) {
// Qwen 3 VL mixture of experts
model: "qwen3-vl:30b",
},
+ {
+ model: "ministral-3",
+ },
}
for _, v := range testCases {
diff --git a/integration/tools_test.go b/integration/tools_test.go
index d6b8dfa5..fa37d8f3 100644
--- a/integration/tools_test.go
+++ b/integration/tools_test.go
@@ -30,6 +30,7 @@ func TestAPIToolCalling(t *testing.T) {
"mistral": 6,
"qwen2.5": 6,
"qwen2": 6,
+ "ministral-3": 20,
"mistral-nemo": 9,
"mistral-small": 16,
"mixtral:8x22b": 80,
diff --git a/integration/utils_test.go b/integration/utils_test.go
index 8a362408..2dd39ecb 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -38,6 +38,7 @@ var (
// Note: add newer models at the top of the list to test them first
ollamaEngineChatModels = []string{
+ "ministral-3",
"qwen3-coder:30b",
"gpt-oss:20b",
"gemma3n:e2b",
@@ -167,6 +168,7 @@ var (
"medllama2",
"megadolphin",
"minicpm-v",
+ "ministral-3",
"mistral-large",
"mistral-nemo",
"mistral-openorca",
@@ -270,6 +272,7 @@ var (
"mistral",
"qwen2.5",
"qwen2",
+ "ministral-3",
"mistral-nemo",
"mistral-small",
"mixtral:8x22b",
diff --git a/llama/build-info.cpp b/llama/build-info.cpp
index 7f5e28c7..0122c7ed 100644
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
+char const *LLAMA_COMMIT = "7f8ef50cce40e3e7e4526a3696cb45658190e69a";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
diff --git a/llama/llama.cpp/.rsync-filter b/llama/llama.cpp/.rsync-filter
index 650d9463..df75ca65 100644
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@@ -22,6 +22,9 @@ include /src/llama.*
include /src/llama-*.*
include /src/unicode-data.*
include /src/unicode.*
+include /src/models/
+include /src/models/*.h
+include /src/models/*.cpp
include /vendor/
include /vendor/miniaudio/
include /vendor/miniaudio/*.h
diff --git a/llama/llama.cpp/common/common.cpp b/llama/llama.cpp/common/common.cpp
index b0591e84..0d7fd9a9 100644
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@@ -8,6 +8,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
+#include "sampling.h"
#include
#include
@@ -26,7 +27,6 @@
#include
#include
#include
-#include
#include
#include
@@ -60,6 +60,14 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+ if (t_start_us >= 0) {
+ t_acc += ggml_time_us() - t_start_us;
+ }
+}
+
//
// CPU utils
//
@@ -355,11 +363,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
}
void common_init() {
- llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
- if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
- common_log_add(common_log_main(), level, "%s", text);
- }
- }, NULL);
+ llama_log_set(common_log_default_callback, NULL);
#ifdef NDEBUG
const char * build_type = "";
@@ -908,11 +912,96 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
+std::vector<common_file_info> fs_list_files(const std::string & path) {
+    std::vector<common_file_info> files;
+ if (path.empty()) return files;
+
+ std::filesystem::path dir(path);
+ if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
+ return files;
+ }
+
+ for (const auto & entry : std::filesystem::directory_iterator(dir)) {
+ try {
+ // Only include regular files (skip directories)
+ const auto & p = entry.path();
+ if (std::filesystem::is_regular_file(p)) {
+ common_file_info info;
+ info.path = p.string();
+ info.name = p.filename().string();
+ try {
+                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
+ } catch (const std::filesystem::filesystem_error &) {
+ info.size = 0;
+ }
+ files.push_back(std::move(info));
+ }
+ } catch (const std::filesystem::filesystem_error &) {
+ // skip entries we cannot inspect
+ continue;
+ }
+ }
+
+ return files;
+}
+
//
// Model utils
//
+static inline void common_init_sampler_from_model(
+ const llama_model * model,
+ common_params_sampling & sparams) {
+
+ const uint64_t config = sparams.user_sampling_config;
+
+ auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+ if (config & user_config) return;
+
+ char buf[64] = {0};
+ if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+ char * end = nullptr;
+ int32_t v = strtol(buf, &end, 10);
+ if (end && end != buf) dst = v;
+ }
+ };
+
+ auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+ if (config & user_config) return;
+
+ char buf[128] = {0};
+ if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+ char * end = nullptr;
+ float v = strtof(buf, &end);
+ if (end && end != buf) dst = v;
+ }
+ };
+
+ // Sampling sequence
+ if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+ char buf[512] = {0};
+ if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+ if (!sampler_names.empty()) {
+ sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+ }
+ }
+ }
+
+ get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+ get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+ get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+ get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
@@ -924,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
+ common_init_sampler_from_model(model, params.sampling);
+
const llama_vocab * vocab = llama_model_get_vocab(model);
auto cparams = common_context_params_to_llama(params);
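`common_init_sampler_from_model` only fills in sampling parameters the user did not set explicitly: a bitmask in `user_sampling_config` records the user's overrides, and everything else is read from the model's metadata. A rough Go sketch of the same pattern, with illustrative names and a plain map standing in for the metadata lookup:

```go
package main

import (
	"fmt"
	"strconv"
)

// bit flags recording which options the user set explicitly (illustrative names)
const (
	userSetTopK = 1 << iota
	userSetTemp
)

type sampling struct {
	userConfig uint64
	topK       int32
	temp       float32
}

// initFromModelMeta fills in any option the user did not pin, using model metadata.
func initFromModelMeta(s *sampling, meta map[string]string) {
	if s.userConfig&userSetTopK == 0 {
		if v, err := strconv.ParseInt(meta["sampling.top_k"], 10, 32); err == nil {
			s.topK = int32(v)
		}
	}
	if s.userConfig&userSetTemp == 0 {
		if v, err := strconv.ParseFloat(meta["sampling.temp"], 32); err == nil {
			s.temp = float32(v)
		}
	}
}

func main() {
	s := sampling{userConfig: userSetTemp, topK: 40, temp: 0.2} // user pinned temp only
	initFromModelMeta(&s, map[string]string{"sampling.top_k": "64", "sampling.temp": "0.8"})
	fmt.Println(s.topK, s.temp) // 64 0.2: top_k came from metadata, temp kept the user's value
}
```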
diff --git a/llama/llama.cpp/common/common.h b/llama/llama.cpp/common/common.h
index a8cb630e..2f23d0ba 100644
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@@ -2,17 +2,15 @@
#pragma once
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
#include
#include
#include
#include
#include
#include