ollamarunner: measure only active time
commit 967a82f52f
parent bbbc73d637
committed by Michael Yang
@@ -91,10 +91,11 @@ type Sequence struct {
 	doneReason llm.DoneReason
 
 	// Metrics
-	startProcessingTime time.Time
-	startGenerationTime time.Time
-	numPredicted        int
-	numPromptInputs     int
+	startedAt, lastUpdatedAt time.Time
+	processingDuration       time.Duration
+	samplingDuration         time.Duration
+	numPredicted             int
+	numPromptInputs          int
 }
 
 type NewSequenceParams struct {
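The field rename above replaces absolute wall-clock anchors (startProcessingTime, startGenerationTime) with active-time accounting. A minimal sketch, standard library only and with invented values rather than the runner's actual types, of how the four new fields are meant to combine:

package main

import (
	"fmt"
	"time"
)

// metrics mirrors the new Sequence fields: processingDuration holds prompt
// evaluation time, and generation time is the startedAt..lastUpdatedAt span
// with sampling time subtracted out.
type metrics struct {
	startedAt, lastUpdatedAt time.Time
	processingDuration       time.Duration
	samplingDuration         time.Duration
}

func (m metrics) evalDuration() time.Duration {
	return m.lastUpdatedAt.Sub(m.startedAt) - m.samplingDuration
}

func main() {
	now := time.Now()
	m := metrics{
		startedAt:          now.Add(-500 * time.Millisecond),
		lastUpdatedAt:      now,
		processingDuration: 120 * time.Millisecond,
		samplingDuration:   30 * time.Millisecond,
	}
	fmt.Println(m.processingDuration, m.evalDuration()) // 120ms 470ms
}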
@@ -108,8 +109,6 @@ type NewSequenceParams struct {
 func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
 	s.ready.Wait()
 
-	startTime := time.Now()
-
 	inputs, ctxs, mmStore, err := s.inputs(prompt, images)
 	if err != nil {
 		return nil, fmt.Errorf("failed to process inputs: %w", err)
@@ -164,20 +163,19 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 	// TODO(jessegross): Ingest cached history for grammar
 
 	return &Sequence{
 		ctxs:             ctxs,
 		mmStore:          mmStore,
 		inputs:           inputs,
 		numPromptInputs:  len(inputs),
-		startProcessingTime: startTime,
 		numPredict:       params.numPredict,
 		pendingResponses: make([]string, 0),
 		responses:        make(chan string, 100),
 		quit:             make(chan bool, 1),
 		embedding:        make(chan []float32, 1),
 		sampler:          params.sampler,
 		embeddingOnly:    params.embedding,
 		stop:             params.stop,
 		numKeep:          params.numKeep,
 	}, nil
 }
 
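Both hunks above drop the clock that used to start at sequence creation; startedAt is instead stamped in forwardBatch when the sequence first enters a batch (see the @@ -562 hunk below), so time spent queued before scheduling is no longer counted. A toy illustration of the difference, with sleeps standing in for queue wait and prompt processing (the durations are invented):

package main

import (
	"fmt"
	"time"
)

func main() {
	createdAt := time.Now() // old startProcessingTime: set in NewSequence

	time.Sleep(50 * time.Millisecond) // stand-in for waiting to be scheduled

	startedAt := time.Now() // new startedAt: set when the batch first runs

	time.Sleep(20 * time.Millisecond) // stand-in for prompt processing

	done := time.Now()
	fmt.Println("old measure (includes queue wait):", done.Sub(createdAt))
	fmt.Println("new measure (active time only):", done.Sub(startedAt))
}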
@@ -408,7 +406,7 @@ func (s *Server) run(ctx context.Context) {
 
 	supportsAsync := pooling.Type(s.model.Backend().Config().Uint("pooling_type")) == pooling.TypeNone
 
-	var activeBatch batchState
+	var previousBatch batchState
 	for {
 		select {
 		case <-ctx.Done():
@@ -417,16 +415,18 @@ func (s *Server) run(ctx context.Context) {
 			panic(err)
 		default:
 			var err error
-			activeBatch, err = s.forwardBatch(activeBatch)
+			nextBatch, err := s.forwardBatch(previousBatch)
 			if err != nil {
 				panic(err)
 			}
 
 			if supportsAsync {
-				go s.computeBatch(activeBatch)
+				go s.computeBatch(nextBatch)
 			} else {
-				s.computeBatch(activeBatch)
+				s.computeBatch(nextBatch)
 			}
+
+			previousBatch = nextBatch
 		}
 	}
 }
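Renaming the single activeBatch into previousBatch/nextBatch makes the pipelining in this loop explicit: when the backend supports async compute, the next batch is forwarded while the previous one may still be computing, and previousBatch = nextBatch rolls the window forward. A stripped-down sketch of that loop shape; forward and compute here are hypothetical stand-ins for s.forwardBatch and s.computeBatch, not the runner's API:

package main

import (
	"fmt"
	"time"
)

type batch struct{ id int }

// forward prepares the next batch; in the runner this can overlap with
// the still-running compute of the batch it was handed.
func forward(prev batch) batch { return batch{id: prev.id + 1} }

func compute(b batch, done chan<- int) {
	time.Sleep(2 * time.Millisecond) // stand-in for model compute
	done <- b.id
}

func main() {
	supportsAsync := true
	done := make(chan int, 3)

	var previousBatch batch
	for i := 0; i < 3; i++ {
		nextBatch := forward(previousBatch)
		if supportsAsync {
			go compute(nextBatch, done)
		} else {
			compute(nextBatch, done)
		}
		previousBatch = nextBatch
	}

	for i := 0; i < 3; i++ {
		fmt.Println("computed batch", <-done)
	}
}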
@@ -562,6 +562,13 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 			seq.inputs = seq.inputs[len(seq.pendingInputs):]
 		}
 
+		startedAt := time.Now()
+		for i := range nextBatch.seqs {
+			if nextBatch.seqs[i] != nil && nextBatch.seqs[i].startedAt.IsZero() {
+				nextBatch.seqs[i].startedAt = startedAt
+			}
+		}
+
 		if resumeSeq != -1 {
 			s.nextSeq = resumeSeq
 		} else {
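The IsZero guard makes the stamp idempotent: a sequence that spans several batches keeps the startedAt from the first batch it entered, so later batches do not reset the clock. A small self-contained sketch of the pattern:

package main

import (
	"fmt"
	"time"
)

func main() {
	var startedAt time.Time

	// The sequence appears in three consecutive batches; only the first
	// pass stamps the clock, exactly like the IsZero guard above.
	for i := 0; i < 3; i++ {
		if startedAt.IsZero() {
			startedAt = time.Now()
		}
		time.Sleep(time.Millisecond) // stand-in for one batch of work
	}
	fmt.Println("active for", time.Since(startedAt)) // ~3ms, not ~1ms
}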
@@ -682,6 +689,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 		activeBatch.modelOutput)
 
 	outputs := activeBatch.modelOutput.Floats()
+	t := time.Now()
 
 	logutil.Trace("computeBatch: logits ready", "batchID", activeBatch.id)
 
@@ -694,8 +702,10 @@ func (s *Server) computeBatch(activeBatch batchState) {
 			continue
 		}
 
+		seq.lastUpdatedAt = t
 		if seq.numPredicted == 1 {
-			seq.startGenerationTime = time.Now()
+			seq.processingDuration = seq.lastUpdatedAt.Sub(seq.startedAt)
+			seq.startedAt = seq.lastUpdatedAt
 		}
 
 		// if done processing the prompt, generate an embedding and return
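Taking t once per batch, right after the logits are read back, gives every sequence in the batch the same lastUpdatedAt. At the first predicted token, the span elapsed so far is booked as prompt processing and the clock restarts for generation. A compact sketch of that boundary, with sleeps standing in for decode steps:

package main

import (
	"fmt"
	"time"
)

func main() {
	startedAt := time.Now() // stamped when the sequence first entered a batch

	var lastUpdatedAt time.Time
	var processingDuration time.Duration

	for numPredicted := 1; numPredicted <= 3; numPredicted++ {
		time.Sleep(2 * time.Millisecond) // stand-in for one decode step
		lastUpdatedAt = time.Now()       // shared per-batch stamp

		if numPredicted == 1 {
			// Everything up to the first token was prompt processing;
			// restart the clock so startedAt..lastUpdatedAt measures
			// generation only.
			processingDuration = lastUpdatedAt.Sub(startedAt)
			startedAt = lastUpdatedAt
		}
	}

	fmt.Println("prompt:", processingDuration, "generation:", lastUpdatedAt.Sub(startedAt))
}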
@@ -774,6 +784,13 @@ func (s *Server) computeBatch(activeBatch batchState) {
 			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
 	}
+
+	samplingDuration := time.Since(t)
+	for i, seq := range s.seqs {
+		if seq != nil && nextBatchTokens[i] != nil {
+			s.seqs[i].samplingDuration += samplingDuration
+		}
+	}
 }
 
 func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
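Sampling runs on the CPU after the batch's logits come back, so its cost is measured once per batch with time.Since(t) and added to each sequence that actually sampled a token; the completion handler below subtracts it from the generation span. A sketch of the accumulation with a two-sequence toy setup:

package main

import (
	"fmt"
	"time"
)

func main() {
	var samplingDuration [2]time.Duration // per-sequence accumulators
	sampled := [2]bool{true, false}       // sequence 1 produced no token this batch

	t := time.Now()
	time.Sleep(time.Millisecond) // stand-in for sampling the whole batch
	d := time.Since(t)

	// Charge the batch's sampling cost only to sequences that sampled.
	for i := range samplingDuration {
		if sampled[i] {
			samplingDuration[i] += d
		}
	}
	fmt.Println(samplingDuration)
}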
@@ -887,9 +904,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			Done:               true,
 			DoneReason:         seq.doneReason,
 			PromptEvalCount:    seq.numPromptInputs,
-			PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
+			PromptEvalDuration: seq.processingDuration,
 			EvalCount:          seq.numPredicted,
-			EvalDuration:       time.Since(seq.startGenerationTime),
+			EvalDuration:       seq.lastUpdatedAt.Sub(seq.startedAt) - seq.samplingDuration,
 		}); err != nil {
 			http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
 		}
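The final response now derives both durations from recorded stamps instead of time.Since, so idle time between a sequence's last token and the moment the response is encoded no longer inflates EvalDuration. Worked through with hypothetical numbers:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical stamps for one finished sequence.
	base := time.Now()
	startedAt := base                                 // reset at the first predicted token
	lastUpdatedAt := base.Add(800 * time.Millisecond) // when the last token was sampled
	processingDuration := 150 * time.Millisecond
	samplingDuration := 40 * time.Millisecond

	promptEvalDuration := processingDuration
	evalDuration := lastUpdatedAt.Sub(startedAt) - samplingDuration

	fmt.Println(promptEvalDuration, evalDuration) // 150ms 760ms
}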