embeddings: modified batch size (#13429)
This PR detects embedding models and sets batch_size = context_size so that the full input fits in a single batch. Previously, if the batch size was smaller than the input, tokens could be split across batches, causing a SIGTRAP crash. This change keeps all of an input's tokens in one batch and prevents the crash.

Fixes: #12938 #13054

Co-authored-by: Jesse Gross <jesse@ollama.com>
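As a rough illustration of the failure mode described above (the default batch size of 512, the token counts, and the splitIntoBatches helper are made up for this sketch and are not the runner's actual batching code): an input longer than the batch size is chunked into several batches, while raising the batch size to the context size guarantees that any input that fits in the context also fits in a single batch.

package main

import "fmt"

// splitIntoBatches models how an input sequence is chunked when it exceeds
// the batch size. Hypothetical helper for illustration only.
func splitIntoBatches(numTokens, batchSize int) []int {
	var batches []int
	for numTokens > 0 {
		n := batchSize
		if n > numTokens {
			n = numTokens
		}
		batches = append(batches, n)
		numTokens -= n
	}
	return batches
}

func main() {
	contextSize := 2048 // kvSize: inputs are truncated to fit the context
	inputTokens := 1500

	// Old behaviour: a batch size smaller than the input splits the tokens
	// across several batches, which embedding models cannot handle.
	fmt.Println(splitIntoBatches(inputTokens, 512)) // [512 512 476]

	// New behaviour: with batchSize raised to the context size, any input
	// that fits in the context stays in one batch.
	fmt.Println(splitIntoBatches(inputTokens, contextSize)) // [1500]
}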
@@ -1203,16 +1203,22 @@ func (s *Server) allocModel(
 		return errors.New("loras are not yet implemented")
 	}
 
+	if s.model.Config().Cache == nil {
+		if parallel > 1 {
+			parallel = 1
+			slog.Warn("model does not support caching, disabling parallel processing")
+		}
+		if s.batchSize < kvSize {
+			s.batchSize = kvSize
+			slog.Warn("model does not support caching, setting batch size to context length", "batch_size", kvSize)
+		}
+	}
+
 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
 		return err
 	}
 
-	if !s.cache.enabled && parallel > 1 {
-		parallel = 1
-		slog.Warn("model does not support caching, disabling parallel processing")
-	}
-
 	s.parallel = parallel
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
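One way to exercise the changed path is to embed an input well past the old default batch size against a locally running server. The sketch below assumes a local Ollama instance on the default port with an embedding model such as nomic-embed-text already pulled; the model name and endpoint details are assumptions, not part of this change.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Build an input long enough that it would previously have been split
	// across multiple batches.
	body, _ := json.Marshal(map[string]any{
		"model": "nomic-embed-text",
		"input": strings.Repeat("the quick brown fox jumps over the lazy dog ", 200),
	})

	resp, err := http.Post("http://localhost:11434/api/embed", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Embeddings [][]float32 `json:"embeddings"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println("embeddings returned:", len(out.Embeddings))
}

With the fix in place, the request should return an embedding instead of crashing the runner.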