Merge branch 'ollama:main' into main

likelovewant authored 2024-05-16 22:24:44 +08:00, committed by GitHub
18 changed files with 285 additions and 98 deletions

View File

@@ -165,7 +165,7 @@ if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
@@ -227,7 +227,7 @@ if [ -z "${CLBlast_DIR}" ]; then
fi
fi
if [ -d "${ROCM_PATH}" ]; then
if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)

View File

@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
opts.NumCtx = max(opts.NumCtx, 2048)
}
+layers := ggml.Tensors().Layers()
+// add one layer worth of memory as a buffer
+if blk0, ok := layers["blk.0"]; ok {
+memoryMinimum += blk0.size()
+}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
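As a worked example of the fp16 KV-cache estimate above (2 tensors for k and v, 2 bytes per float16 value, n_ctx tokens, n_layer blocks, and n_embd / n_head * n_head_kv values per token per block), here is a minimal, self-contained sketch; the concrete model numbers are illustrative assumptions, not taken from this diff:

package main

import "fmt"

// kvCacheBytes mirrors the estimate used above:
// 2 (k and v) * 2 bytes (float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv.
func kvCacheBytes(numCtx, blockCount, embeddingLength, headCount, headCountKV uint64) uint64 {
	return 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV
}

func main() {
	// Assumed Llama-7B-like metadata: 2048 context, 32 layers, 4096 embedding
	// width, 32 attention heads, 32 KV heads (no grouped-query attention).
	fmt.Println(kvCacheBytes(2048, 32, 4096, 32, 32)) // 1073741824 bytes = 1 GiB
}

With grouped-query attention (n_head_kv < n_head) the same formula shrinks proportionally, which is why the head counts are read from the model metadata rather than assumed equal.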
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
graphPartialOffload = graphFullOffload
}
-layers := ggml.Tensors().Layers()
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+memoryRequiredPartial := memoryMinimum + graphPartialOffload
var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok {
@@ -100,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
var layerCount int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+memoryLayer := blk.size()
-// KV is proportional to the number of layers
-memoryLayer += kv / ggml.KV().BlockCount()
+// KV is proportional to the number of layers
+memoryLayer += kv / ggml.KV().BlockCount()
-memoryRequiredTotal += memoryLayer
-if memoryAvailable > memoryRequiredPartial+memoryLayer {
-memoryRequiredPartial += memoryLayer
-layerCount++
+memoryRequiredTotal += memoryLayer
+if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+memoryRequiredPartial += memoryLayer
+layerCount++
+}
}
}
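The rewritten condition in the loop above changes the offload policy: an explicit layer request (opts.NumGPU >= 0) now wins outright, and the memory check only applies when no request was made. A hypothetical helper that isolates the same predicate (the name is illustrative, not part of the diff; no imports needed):

// shouldOffload reports whether the next layer should be counted for partial
// offload: honor an explicit request for numGPU layers, otherwise fall back
// to the memory-based estimate.
func shouldOffload(numGPU, layerCount int, memoryAvailable, memoryRequiredPartial, memoryLayer uint64) bool {
	if numGPU >= 0 {
		// the user asked for a specific layer count; offload up to that many
		return layerCount+1 <= numGPU
	}
	// no explicit request: offload only while the layer still fits in free memory
	return memoryAvailable > memoryRequiredPartial+memoryLayer
}

In other words, a user-supplied num_gpu is no longer clamped by the estimate; the estimate only decides when num_gpu is negative (the "auto" default).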
@@ -117,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
memoryRequiredTotal += memoryLayerOutput
}
-if memoryAvailable > memoryRequiredTotal {
+if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
}
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
"offload to gpu",
slog.Group(
"layers",
-// actual number of layers offloaded
-"real", opts.NumGPU,
+// requested number of layers to offload
+"requested", opts.NumGPU,
// estimated number of layers that can be offloaded
-"estimate", layerCount,
+"real", layerCount,
),
slog.Group(
"memory",

View File

@@ -38,6 +38,7 @@ type LlamaServer interface {
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
EstimatedVRAM() uint64
+EstimatedTotal() uint64
}
// llmServer is an instance of the llama.cpp server
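A caller-side sketch of how the new EstimatedTotal accessor could be used together with EstimatedVRAM, assuming it sits next to the LlamaServer interface in the same package and uses log/slog; the helper name and log message are illustrative only:

// reportOffload logs what fraction of the estimated total model memory is
// expected to end up in VRAM. Hypothetical helper, not part of this change.
func reportOffload(s LlamaServer) {
	vram, total := s.EstimatedVRAM(), s.EstimatedTotal()
	if total == 0 {
		return // nothing estimated yet
	}
	slog.Info("memory estimate",
		"vram", vram,
		"total", total,
		"gpu_fraction", float64(vram)/float64(total))
}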
@@ -88,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu()
gpuCount = 0
+_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
@@ -316,8 +318,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
slog.Info("starting llama server", "cmd", s.cmd.String())
-// Log at debug as the environment is inherited and might contain sensitive information
-slog.Debug("subprocess", "environment", s.cmd.Env)
+if envconfig.Debug {
+filteredEnv := []string{}
+for _, ev := range s.cmd.Env {
+if strings.HasPrefix(ev, "CUDA_") ||
+strings.HasPrefix(ev, "ROCM_") ||
+strings.HasPrefix(ev, "HIP_") ||
+strings.HasPrefix(ev, "HSA_") ||
+strings.HasPrefix(ev, "GGML_") ||
+strings.HasPrefix(ev, "PATH=") ||
+strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+filteredEnv = append(filteredEnv, ev)
+}
+}
+// Log at debug as the environment is inherited and might contain sensitive information
+slog.Debug("subprocess", "environment", filteredEnv)
+}
if err = s.cmd.Start(); err != nil {
// Detect permission denied and augment the message about noexec
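The prefix filter above keeps only environment variables relevant to GPU runtimes (CUDA, ROCm/HIP, HSA, GGML) and to library/binary resolution (PATH, LD_LIBRARY_PATH), so the debug log stays useful without echoing unrelated and possibly sensitive variables. A standalone version of the same idea, with a hypothetical helper name (requires the standard "strings" package):

// filterEnv returns only the entries that start with one of the given prefixes.
func filterEnv(env, prefixes []string) []string {
	out := []string{}
	for _, ev := range env {
		for _, p := range prefixes {
			if strings.HasPrefix(ev, p) {
				out = append(out, ev)
				break
			}
		}
	}
	return out
}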
@@ -955,6 +971,10 @@ func (s *llmServer) EstimatedVRAM() uint64 {
return s.estimatedVRAM
}
+func (s *llmServer) EstimatedTotal() uint64 {
+return s.estimatedTotal
+}
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {