Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 15:08:27 +00:00)
Merge branch 'ollama:main' into main
@@ -165,7 +165,7 @@ if [ -z "${CUDART_LIB_DIR}" ]; then
     CUDART_LIB_DIR="${CUDA_LIB_DIR}"
 fi
 
-if [ -d "${CUDA_LIB_DIR}" ]; then
+if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
@@ -227,7 +227,7 @@ if [ -z "${CLBlast_DIR}" ]; then
     fi
 fi
 
-if [ -d "${ROCM_PATH}" ]; then
+if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
     echo "ROCm libraries detected - building dynamic ROCm library"
     if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         opts.NumCtx = max(opts.NumCtx, 2048)
     }
 
+    layers := ggml.Tensors().Layers()
+    // add one layer worth of memory as a buffer
+    if blk0, ok := layers["blk.0"]; ok {
+        memoryMinimum += blk0.size()
+    }
+
     // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
     var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
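The kv expression in the hunk above sizes the fp16 KV cache: two tensors (K and V), two bytes each, scaled by context length, layer count, and the embedding width adjusted by the grouped-query ratio (n_head_kv / n_head). As a rough illustration only, the sketch below plugs assumed llama-7B-like dimensions (not taken from this patch) into the same formula; a 32-layer, 4096-wide model with 32 heads, 32 KV heads and a 2048-token context works out to exactly 1 GiB.

package main

import "fmt"

// Illustrative only: applies the fp16 KV-cache formula from EstimateGPULayers
// to assumed model dimensions.
func main() {
    var (
        numCtx    uint64 = 2048 // opts.NumCtx (assumed)
        blocks    uint64 = 32   // ggml.KV().BlockCount() (assumed)
        embedding uint64 = 4096 // ggml.KV().EmbeddingLength() (assumed)
        heads     uint64 = 32   // ggml.KV().HeadCount() (assumed)
        headsKV   uint64 = 32   // ggml.KV().HeadCountKV() (assumed)
    )

    // 2 (k and v) * 2 bytes (float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    kv := 2 * 2 * numCtx * blocks * embedding / heads * headsKV

    fmt.Printf("estimated KV cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))
    // With these numbers: 1073741824 bytes, i.e. 1.00 GiB.
}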
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         graphPartialOffload = graphFullOffload
     }
 
-    layers := ggml.Tensors().Layers()
-
     // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-    memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+    memoryRequiredTotal := memoryMinimum + graphFullOffload
 
     // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-    memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+    memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
     var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
@@ -100,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
     var layerCount int
     for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-        memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+        if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+            memoryLayer := blk.size()
 
-        // KV is proportional to the number of layers
-        memoryLayer += kv / ggml.KV().BlockCount()
+            // KV is proportional to the number of layers
+            memoryLayer += kv / ggml.KV().BlockCount()
 
-        memoryRequiredTotal += memoryLayer
-        if memoryAvailable > memoryRequiredPartial+memoryLayer {
-            memoryRequiredPartial += memoryLayer
-            layerCount++
+            memoryRequiredTotal += memoryLayer
+            if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+                memoryRequiredPartial += memoryLayer
+                layerCount++
+            }
         }
     }
 
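The reworked loop makes opts.NumGPU authoritative when it is non-negative: layers are counted up to the requested number regardless of free memory, while a negative value keeps the previous behaviour of fitting as many layers as the available memory allows. The following is a minimal standalone sketch of that decision rule with simplified types and made-up sizes; nothing in it beyond the condition itself is taken from the patch.

package main

import "fmt"

// offloadLayers mirrors the per-layer decision from EstimateGPULayers in a
// simplified form: numGPU >= 0 means "offload exactly the requested number of
// layers", numGPU < 0 means "fit whatever the available memory holds".
func offloadLayers(layerSizes []uint64, available, minimum uint64, numGPU int) int {
    required := minimum
    count := 0
    for _, size := range layerSizes {
        if (numGPU >= 0 && count+1 <= numGPU) || (numGPU < 0 && available > required+size) {
            required += size
            count++
        }
    }
    return count
}

func main() {
    layers := []uint64{512 << 20, 512 << 20, 512 << 20, 512 << 20} // four 512 MiB layers (made up)
    fmt.Println(offloadLayers(layers, 1<<30, 256<<20, -1))         // memory-limited: prints 1
    fmt.Println(offloadLayers(layers, 1<<30, 256<<20, 3))          // user-requested: prints 3
}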
@@ -117,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         memoryRequiredTotal += memoryLayerOutput
     }
 
-    if memoryAvailable > memoryRequiredTotal {
+    if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
         layerCount = int(ggml.KV().BlockCount()) + 1
         memoryRequiredPartial = memoryRequiredTotal
     }
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         "offload to gpu",
         slog.Group(
             "layers",
-            // actual number of layers offloaded
-            "real", opts.NumGPU,
+            // requested number of layers to offload
+            "requested", opts.NumGPU,
             // estimated number of layers that can be offloaded
-            "estimate", layerCount,
+            "real", layerCount,
         ),
         slog.Group(
             "memory",
@@ -38,6 +38,7 @@ type LlamaServer interface {
     Detokenize(ctx context.Context, tokens []int) (string, error)
     Close() error
     EstimatedVRAM() uint64
+    EstimatedTotal() uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -88,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
         cpuRunner = serverForCpu()
         gpuCount = 0
+        _, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
@@ -316,8 +318,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }
 
     slog.Info("starting llama server", "cmd", s.cmd.String())
-    // Log at debug as the environment is inherited and might contain sensitive information
-    slog.Debug("subprocess", "environment", s.cmd.Env)
+    if envconfig.Debug {
+        filteredEnv := []string{}
+        for _, ev := range s.cmd.Env {
+            if strings.HasPrefix(ev, "CUDA_") ||
+                strings.HasPrefix(ev, "ROCM_") ||
+                strings.HasPrefix(ev, "HIP_") ||
+                strings.HasPrefix(ev, "HSA_") ||
+                strings.HasPrefix(ev, "GGML_") ||
+                strings.HasPrefix(ev, "PATH=") ||
+                strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+                filteredEnv = append(filteredEnv, ev)
+            }
+        }
+        // Log at debug as the environment is inherited and might contain sensitive information
+        slog.Debug("subprocess", "environment", filteredEnv)
+    }
 
     if err = s.cmd.Start(); err != nil {
         // Detect permission denied and augment the message about noexec
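Instead of dumping the whole inherited environment, the debug log is now built from an allow-list of prefixes (CUDA_, ROCM_, HIP_, HSA_, GGML_, PATH=, LD_LIBRARY_PATH=) and only emitted when debug logging is enabled. Below is a standalone sketch of the same prefix filter, outside the server struct; the sample environment values are invented for illustration.

package main

import (
    "fmt"
    "strings"
)

// filterEnv keeps only the variables that match one of the allowed prefixes,
// mirroring the allow-list applied before logging the subprocess environment.
func filterEnv(env []string) []string {
    prefixes := []string{"CUDA_", "ROCM_", "HIP_", "HSA_", "GGML_", "PATH=", "LD_LIBRARY_PATH="}
    filtered := []string{}
    for _, ev := range env {
        for _, p := range prefixes {
            if strings.HasPrefix(ev, p) {
                filtered = append(filtered, ev)
                break
            }
        }
    }
    return filtered
}

func main() {
    env := []string{
        "CUDA_VISIBLE_DEVICES=0",          // kept
        "HSA_OVERRIDE_GFX_VERSION=10.3.0", // kept
        "OPENAI_API_KEY=secret",           // dropped: not on the allow-list
        "PATH=/usr/local/bin:/usr/bin",    // kept
    }
    fmt.Println(filterEnv(env))
}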
@@ -955,6 +971,10 @@ func (s *llmServer) EstimatedVRAM() uint64 {
     return s.estimatedVRAM
 }
 
+func (s *llmServer) EstimatedTotal() uint64 {
+    return s.estimatedTotal
+}
+
 func parseDurationMs(ms float64) time.Duration {
     dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
     if err != nil {