embeddings: modified batch size (#13429)

This PR detects embedding models and sets batch_size = context_size so the full input fits in a single batch.
Previously, if the batch size was smaller than the input, tokens could be split across batches, causing a SIGTRAP crash.
This change ensures all tokens stay in one batch, preventing that crash.
Fixes #12938 and #13054

Co-authored-by: Jesse Gross <jesse@ollama.com>
This commit is contained in:
nicole pardal
2025-12-11 15:36:31 -08:00
committed by GitHub
parent 48e78e9be1
commit 3475d915cb
5 changed files with 78 additions and 7 deletions

View File

@@ -1203,16 +1203,22 @@ func (s *Server) allocModel(
return errors.New("loras are not yet implemented")
}
if s.model.Config().Cache == nil {
if parallel > 1 {
parallel = 1
slog.Warn("model does not support caching, disabling parallel processing")
}
if s.batchSize < kvSize {
s.batchSize = kvSize
slog.Warn("model does not support caching, setting batch size to context length", "batch_size", kvSize)
}
}
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
if err != nil {
return err
}
if !s.cache.enabled && parallel > 1 {
parallel = 1
slog.Warn("model does not support caching, disabling parallel processing")
}
s.parallel = parallel
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))