mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-23 23:18:26 +00:00
Merge upstream/main and resolve conflicts
This commit is contained in:
31
llm/ext_server/server.cpp
vendored
31
llm/ext_server/server.cpp
vendored
@@ -56,7 +56,6 @@ struct server_params {
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::vector<std::string> api_keys;
|
||||
std::string public_path = "examples/server/public";
|
||||
std::string chat_template = "";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
@@ -427,16 +426,6 @@ struct llama_server_context
|
||||
return true;
|
||||
}
|
||||
|
||||
void validate_model_chat_template(server_params & sparams) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
sparams.chat_template = "chatml";
|
||||
}
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
// create slots
|
||||
all_slots_are_idle = true;
|
||||
@@ -2335,9 +2324,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUDA
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUDA
|
||||
}
|
||||
else if (arg == "--tensor-split" || arg == "-ts")
|
||||
{
|
||||
@@ -2346,7 +2335,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
@@ -2367,8 +2356,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
}
|
||||
}
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
|
||||
#endif // GGML_USE_CUDA
|
||||
}
|
||||
else if (arg == "--main-gpu" || arg == "-mg")
|
||||
{
|
||||
@@ -2377,7 +2366,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
|
||||
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.chat_template = argv[i];
|
||||
}
|
||||
else if (arg == "--override-kv")
|
||||
{
|
||||
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
const auto model_meta = llama.model_meta();
|
||||
|
||||
if (sparams.chat_template.empty()) { // custom chat template is not supplied
|
||||
// check if the template comes with the model is supported by us
|
||||
llama.validate_model_chat_template(sparams);
|
||||
}
|
||||
|
||||
// Middleware for API key validation
|
||||
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
|
||||
// If API key is not set, skip validation
|
||||
|
||||
@@ -18,7 +18,7 @@ sign() {
|
||||
fi
|
||||
}
|
||||
|
||||
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
|
||||
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
|
||||
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
@@ -27,7 +27,7 @@ case "${GOARCH}" in
|
||||
# Static build for linking into the Go binary
|
||||
init_vars
|
||||
CMAKE_TARGETS="--target llama --target ggml"
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}_static"
|
||||
echo "Building static library"
|
||||
build
|
||||
@@ -37,7 +37,7 @@ case "${GOARCH}" in
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
@@ -49,7 +49,7 @@ case "${GOARCH}" in
|
||||
# Approximately 400% faster than LCD on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
@@ -61,7 +61,7 @@ case "${GOARCH}" in
|
||||
# Approximately 10% faster than AVX on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
|
||||
echo "Building AVX2 CPU"
|
||||
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
|
||||
@@ -75,7 +75,7 @@ case "${GOARCH}" in
|
||||
# Static build for linking into the Go binary
|
||||
init_vars
|
||||
CMAKE_TARGETS="--target llama --target ggml"
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}_static"
|
||||
echo "Building static library"
|
||||
build
|
||||
|
||||
@@ -60,7 +60,7 @@ if [ -z "${CUDACXX}" ]; then
|
||||
export CUDACXX=$(command -v nvcc)
|
||||
fi
|
||||
fi
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
git_module_setup
|
||||
@@ -73,7 +73,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
|
||||
# Static build for linking into the Go binary
|
||||
init_vars
|
||||
CMAKE_TARGETS="--target llama --target ggml"
|
||||
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}_static"
|
||||
echo "Building static library"
|
||||
build
|
||||
@@ -102,7 +102,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
|
||||
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
|
||||
|
||||
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
|
||||
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
|
||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
|
||||
#
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
@@ -187,7 +187,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
|
||||
echo "Building custom CUDA GPU"
|
||||
else
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
fi
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
|
||||
@@ -51,7 +51,8 @@ function init_vars {
|
||||
}
|
||||
$script:cmakeDefs = @(
|
||||
"-DBUILD_SHARED_LIBS=on",
|
||||
"-DLLAMA_NATIVE=off"
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_OPENMP=off"
|
||||
)
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
@@ -134,8 +135,13 @@ function build {
|
||||
& cmake --version
|
||||
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
|
||||
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
|
||||
if ($cmakeDefs -contains "-G") {
|
||||
$extra=@("-j8")
|
||||
} else {
|
||||
$extra= @("--", "/p:CL_MPcount=8")
|
||||
}
|
||||
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
|
||||
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
# Rearrange output to be consistent between different generators
|
||||
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
|
||||
@@ -215,7 +221,8 @@ function build_static() {
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
"-DLLAMA_FMA=off",
|
||||
"-DLLAMA_OPENMP=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
@@ -282,7 +289,15 @@ function build_cuda() {
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
$script:cmakeDefs += @(
|
||||
"-A", "x64",
|
||||
"-DLLAMA_CUDA=ON",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
|
||||
"-DCMAKE_CUDA_FLAGS=-t8",
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
|
||||
)
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
||||
@@ -292,10 +307,12 @@ function build_cuda() {
|
||||
sign
|
||||
install
|
||||
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
} else {
|
||||
write-host "Skipping CUDA generation step"
|
||||
}
|
||||
@@ -329,16 +346,18 @@ function build_oneapi() {
|
||||
sign
|
||||
install
|
||||
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
} else {
|
||||
Write-Host "Skipping oneAPI generation step"
|
||||
}
|
||||
|
||||
46
llm/ggml.go
46
llm/ggml.go
@@ -69,6 +69,30 @@ func (kv KV) HeadCountKV() uint64 {
|
||||
return 1
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCount() uint64 {
|
||||
if heads := kv.HeadCount(); heads > 0 {
|
||||
return kv.EmbeddingLength() / kv.HeadCount()
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountK() uint64 {
|
||||
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
|
||||
return k
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountV() uint64 {
|
||||
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
|
||||
return v
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
}
|
||||
|
||||
func (kv KV) GQA() uint64 {
|
||||
return kv.HeadCount() / kv.HeadCountKV()
|
||||
}
|
||||
@@ -299,6 +323,9 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
headsKV := llm.KV().HeadCountKV()
|
||||
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
|
||||
|
||||
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
||||
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
||||
|
||||
layers := llm.Tensors().Layers()
|
||||
|
||||
switch llm.KV().Architecture() {
|
||||
@@ -307,7 +334,8 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
|
||||
partialOffload = 4 * batch * embedding
|
||||
partialOffload += max(
|
||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
|
||||
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
|
||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
|
||||
@@ -315,15 +343,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||
)
|
||||
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
|
||||
// mixtral 8x7b
|
||||
ffnGateWeight1 := ffnGateWeight.Shape[1]
|
||||
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
|
||||
partialOffload = max(
|
||||
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
|
||||
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
|
||||
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
||||
)
|
||||
}
|
||||
@@ -366,6 +394,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
4*batch*(vocab+2*embedding),
|
||||
fullOffload,
|
||||
)
|
||||
case "deepseek2":
|
||||
fullOffload = max(
|
||||
4*batch*(3*embedding+vocab),
|
||||
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
|
||||
)
|
||||
|
||||
partialOffload = max(
|
||||
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
|
||||
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
|
||||
)
|
||||
}
|
||||
|
||||
return
|
||||
|
||||
Submodule llm/llama.cpp updated: 5921b8f089...7c26775adb
315
llm/memory.go
315
llm/memory.go
@@ -3,9 +3,10 @@ package llm
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
|
||||
var estimatedVRAM uint64
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
|
||||
if opts.NumGPU < 0 {
|
||||
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
|
||||
return true, estimatedVRAM
|
||||
@@ -30,24 +32,76 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
|
||||
return false, estimatedVRAM
|
||||
}
|
||||
|
||||
type MemoryEstimate struct {
|
||||
// How many layers we predict we can load
|
||||
Layers int
|
||||
|
||||
// The size of the graph which occupies the main GPU
|
||||
Graph uint64
|
||||
|
||||
// How much VRAM will be allocated given the number of layers we predict
|
||||
VRAMSize uint64
|
||||
|
||||
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
|
||||
TotalSize uint64
|
||||
|
||||
// For multi-GPU scenarios, this provides the tensor split parameter
|
||||
TensorSplit string
|
||||
|
||||
// For multi-GPU scenarios, this is the size in bytes per GPU
|
||||
GPUSizes []uint64
|
||||
|
||||
// internal fields for logging purposes
|
||||
inferenceLibrary string
|
||||
layersRequested int
|
||||
layersModel int
|
||||
availableList []string
|
||||
kv uint64
|
||||
allocationsList []string
|
||||
memoryWeights uint64
|
||||
memoryLayerOutput uint64
|
||||
graphFullOffload uint64
|
||||
graphPartialOffload uint64
|
||||
}
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
// The GPUs provided must all be the same Library
|
||||
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
|
||||
var memoryAvailable uint64
|
||||
for _, info := range gpus {
|
||||
memoryAvailable += info.FreeMemory
|
||||
}
|
||||
if envconfig.MaxVRAM > 0 {
|
||||
memoryAvailable = envconfig.MaxVRAM
|
||||
}
|
||||
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
|
||||
// Graph size for a partial offload, applies to all GPUs
|
||||
var graphPartialOffload uint64
|
||||
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
||||
// Graph size when all layers are offloaded, applies to all GPUs
|
||||
var graphFullOffload uint64
|
||||
|
||||
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
|
||||
memoryMinimum := gpus[0].MinimumMemory
|
||||
// Final graph offload once we know full or partial
|
||||
var graphOffload uint64
|
||||
|
||||
// Projectors loaded into GPU0 only
|
||||
var projectorSize uint64
|
||||
|
||||
// Conditional output size on GPU 0
|
||||
var memoryLayerOutput uint64
|
||||
|
||||
// The sizes of a layer
|
||||
var layerSize uint64
|
||||
|
||||
// The sum of all the layer sizes (just for logging)
|
||||
var memoryWeights uint64
|
||||
|
||||
// True if all the layers are loaded
|
||||
var fullyLoaded bool
|
||||
|
||||
// Overflow that didn't fit into the GPU
|
||||
var overflow uint64
|
||||
|
||||
availableList := make([]string, len(gpus))
|
||||
for i, gpu := range gpus {
|
||||
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
|
||||
}
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||
|
||||
for _, projector := range projectors {
|
||||
memoryMinimum += projectorMemoryRequirements(projector)
|
||||
projectorSize += projectorMemoryRequirements(projector)
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
@@ -56,127 +110,246 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
layers := ggml.Tensors().Layers()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
memoryMinimum += blk0.size()
|
||||
layerSize = blk0.size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
|
||||
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
|
||||
|
||||
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
// KV is proportional to the number of layers
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
|
||||
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
graphFullOffload *= uint64(len(gpus))
|
||||
graphPartialOffload *= uint64(len(gpus))
|
||||
|
||||
// on metal there's no partial offload overhead
|
||||
if gpus[0].Library == "metal" {
|
||||
graphPartialOffload = graphFullOffload
|
||||
} else if len(gpus) > 1 {
|
||||
// multigpu should always use the partial graph size
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if gpus[0].Library == "metal" && opts.UseMMap {
|
||||
// memory is preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
memoryRequiredPartial += memoryLayerOutput
|
||||
// Output layer handled at the end if we have space
|
||||
gpuZeroOverhead := projectorSize
|
||||
|
||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||
var layerCount int
|
||||
layerCounts := make([]int, len(gpus))
|
||||
gpuAllocations := make([]uint64, len(gpus))
|
||||
type gs struct {
|
||||
i int
|
||||
g *gpu.GpuInfo
|
||||
}
|
||||
gpusWithSpace := []gs{}
|
||||
for i := range gpus {
|
||||
var gzo uint64
|
||||
if len(gpusWithSpace) == 0 {
|
||||
gzo = gpuZeroOverhead
|
||||
}
|
||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
||||
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
||||
continue
|
||||
}
|
||||
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
||||
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
var gpuZeroID int
|
||||
if len(gpusWithSpace) > 0 {
|
||||
gpuZeroID = gpusWithSpace[0].i
|
||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||
}
|
||||
|
||||
// For all the layers, find where they can fit on the GPU(s)
|
||||
for i := range int(ggml.KV().BlockCount()) {
|
||||
// Some models have inconsistent layer sizes
|
||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
||||
memoryLayer := blk.size()
|
||||
layerSize = blk.size()
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
}
|
||||
memoryWeights += layerSize
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
memoryLayer += kv / ggml.KV().BlockCount()
|
||||
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
||||
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
||||
continue
|
||||
}
|
||||
|
||||
memoryRequiredTotal += memoryLayer
|
||||
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
|
||||
memoryRequiredPartial += memoryLayer
|
||||
// distribute the layers across the GPU(s) that have space
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[i%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > used+layerSize {
|
||||
gpuAllocations[g.i] += layerSize
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
} else {
|
||||
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if gpus[0].Library != "metal" || !opts.UseMMap {
|
||||
// memory was not preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
if layerCount >= int(ggml.KV().BlockCount()) {
|
||||
fullyLoaded = true
|
||||
} else {
|
||||
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
|
||||
overflow += layerSize
|
||||
}
|
||||
}
|
||||
|
||||
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
// Determine if we need to consider output then find where it fits
|
||||
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[layerCount%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > used+memoryLayerOutput {
|
||||
gpuAllocations[g.i] += memoryLayerOutput
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if layerCount < int(ggml.KV().BlockCount())+1 {
|
||||
fullyLoaded = false
|
||||
overflow += memoryLayerOutput
|
||||
}
|
||||
}
|
||||
|
||||
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
|
||||
// Add the applicable (full or partial) graph allocations
|
||||
for i := range gpus {
|
||||
if layerCounts[i] <= 0 {
|
||||
continue
|
||||
}
|
||||
if fullyLoaded {
|
||||
gpuAllocations[i] += graphFullOffload
|
||||
} else {
|
||||
gpuAllocations[i] += graphPartialOffload
|
||||
}
|
||||
}
|
||||
if fullyLoaded {
|
||||
graphOffload = graphFullOffload
|
||||
} else {
|
||||
graphOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
// Summaries for the log
|
||||
var memoryRequiredPartial, memoryRequiredTotal uint64
|
||||
for i := range gpuAllocations {
|
||||
memoryRequiredPartial += gpuAllocations[i]
|
||||
}
|
||||
memoryRequiredTotal = memoryRequiredPartial + overflow
|
||||
|
||||
tensorSplit := ""
|
||||
if len(gpus) > 1 {
|
||||
splits := make([]string, len(gpus))
|
||||
for i, count := range layerCounts {
|
||||
splits[i] = strconv.Itoa(count)
|
||||
}
|
||||
tensorSplit = strings.Join(splits, ",")
|
||||
}
|
||||
allocationsList := []string{}
|
||||
for _, a := range gpuAllocations {
|
||||
allocationsList = append(allocationsList, format.HumanBytes2(a))
|
||||
}
|
||||
|
||||
estimate := MemoryEstimate{
|
||||
TotalSize: memoryRequiredTotal,
|
||||
Layers: 0,
|
||||
Graph: 0,
|
||||
VRAMSize: 0,
|
||||
GPUSizes: []uint64{},
|
||||
|
||||
inferenceLibrary: gpus[0].Library,
|
||||
layersRequested: opts.NumGPU,
|
||||
layersModel: int(ggml.KV().BlockCount()) + 1,
|
||||
availableList: availableList,
|
||||
kv: kv,
|
||||
allocationsList: allocationsList,
|
||||
memoryWeights: memoryWeights,
|
||||
memoryLayerOutput: memoryLayerOutput,
|
||||
graphFullOffload: graphFullOffload,
|
||||
graphPartialOffload: graphPartialOffload,
|
||||
}
|
||||
|
||||
if gpus[0].Library == "cpu" {
|
||||
return estimate
|
||||
}
|
||||
if layerCount == 0 {
|
||||
slog.Debug("insufficient VRAM to load any model layers")
|
||||
return estimate
|
||||
}
|
||||
estimate.Layers = layerCount
|
||||
estimate.Graph = graphOffload
|
||||
estimate.VRAMSize = memoryRequiredPartial
|
||||
estimate.TotalSize = memoryRequiredTotal
|
||||
estimate.TensorSplit = tensorSplit
|
||||
estimate.GPUSizes = gpuAllocations
|
||||
return estimate
|
||||
}
|
||||
|
||||
func (m MemoryEstimate) log() {
|
||||
slog.Info(
|
||||
"offload to gpu",
|
||||
"offload to "+m.inferenceLibrary,
|
||||
slog.Group(
|
||||
"layers",
|
||||
// requested number of layers to offload
|
||||
"requested", opts.NumGPU,
|
||||
"requested", m.layersRequested,
|
||||
// The number of layers the model has (including output)
|
||||
"model", m.layersModel,
|
||||
// estimated number of layers that can be offloaded
|
||||
"real", layerCount,
|
||||
"offload", m.Layers,
|
||||
// multi-gpu split for tensors
|
||||
"split", m.TensorSplit,
|
||||
),
|
||||
slog.Group(
|
||||
"memory",
|
||||
// memory available for offloading
|
||||
"available", format.HumanBytes2(memoryAvailable),
|
||||
// memory available by GPU for offloading
|
||||
"available", m.availableList,
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
"full", format.HumanBytes2(memoryRequiredTotal),
|
||||
"full", format.HumanBytes2(m.TotalSize),
|
||||
// memory required to offload layers.estimate layers
|
||||
"partial", format.HumanBytes2(memoryRequiredPartial),
|
||||
"partial", format.HumanBytes2(m.VRAMSize),
|
||||
// memory of KV cache
|
||||
"kv", format.HumanBytes2(kv),
|
||||
"kv", format.HumanBytes2(m.kv),
|
||||
// Allocations across the GPUs
|
||||
"allocations", m.allocationsList,
|
||||
),
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(memoryWeights),
|
||||
"total", format.HumanBytes2(m.memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
|
||||
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
|
||||
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
|
||||
),
|
||||
slog.Group(
|
||||
"graph",
|
||||
// memory of graph when fully offloaded
|
||||
"full", format.HumanBytes2(graphFullOffload),
|
||||
"full", format.HumanBytes2(m.graphFullOffload),
|
||||
// memory of graph when not fully offloaded
|
||||
"partial", format.HumanBytes2(graphPartialOffload),
|
||||
"partial", format.HumanBytes2(m.graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
if gpus[0].Library == "cpu" {
|
||||
return 0, 0, memoryRequiredTotal
|
||||
}
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
slog.Debug("insufficient VRAM to load any model layers")
|
||||
return 0, 0, memoryRequiredTotal
|
||||
}
|
||||
|
||||
return layerCount, memoryRequiredPartial, memoryRequiredTotal
|
||||
}
|
||||
|
||||
127
llm/memory_test.go
Normal file
127
llm/memory_test.go
Normal file
@@ -0,0 +1,127 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestEstimateGPULayers(t *testing.T) {
|
||||
envconfig.Debug = true
|
||||
modelName := "dummy"
|
||||
f, err := os.CreateTemp(t.TempDir(), modelName)
|
||||
require.NoError(t, err)
|
||||
defer f.Close()
|
||||
gguf := NewGGUFV3(binary.LittleEndian)
|
||||
inputLayerCount := 5
|
||||
tensors := []Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
}
|
||||
assert.Len(t, tensors, inputLayerCount+1)
|
||||
err = gguf.Encode(f, KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": "name",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.block_count": uint32(inputLayerCount),
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(32),
|
||||
"tokenizer.ggml.tokens": []string{" "},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, tensors)
|
||||
require.NoError(t, err)
|
||||
|
||||
ggml, err := LoadModel(f.Name())
|
||||
require.NoError(t, err)
|
||||
|
||||
// Simple CPU scenario
|
||||
gpus := []gpu.GpuInfo{
|
||||
{
|
||||
Library: "cpu",
|
||||
},
|
||||
}
|
||||
projectors := []string{}
|
||||
opts := api.DefaultOptions()
|
||||
t.Run("cpu", func(t *testing.T) {
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
assert.Equal(t, 0, estimate.Layers)
|
||||
assert.Equal(t, uint64(0), estimate.Graph)
|
||||
})
|
||||
|
||||
// derived from the dummy ggml file above
|
||||
graphPartialOffload := uint64(202377216)
|
||||
graphFullOffload := uint64(171968512)
|
||||
layerSize := uint64(33554436)
|
||||
projectorSize := uint64(0)
|
||||
memoryLayerOutput := uint64(4)
|
||||
|
||||
// Dual CUDA scenario with assymetry
|
||||
gpuMinimumMemory := uint64(2048)
|
||||
gpus = []gpu.GpuInfo{
|
||||
{
|
||||
Library: "cuda",
|
||||
MinimumMemory: gpuMinimumMemory,
|
||||
},
|
||||
{
|
||||
Library: "cuda",
|
||||
MinimumMemory: gpuMinimumMemory,
|
||||
},
|
||||
}
|
||||
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
|
||||
for i, s := range []struct {
|
||||
layer0, layer1 uint64
|
||||
expect0, expect1 uint64
|
||||
}{
|
||||
{1, 1, 1, 1},
|
||||
{2, 1, 2, 1},
|
||||
{2, 2, 2, 2},
|
||||
{1, 2, 1, 2},
|
||||
{3, 3, 3, 3},
|
||||
{4, 4, 3, 3},
|
||||
{6, 6, 3, 3},
|
||||
{0, 3, 0, 3},
|
||||
} {
|
||||
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
|
||||
gpus[0].FreeMemory = 0
|
||||
gpus[1].FreeMemory = 0
|
||||
gpus[0].FreeMemory += projectorSize
|
||||
if s.layer0 > 0 {
|
||||
gpus[0].FreeMemory += memoryLayerOutput
|
||||
} else {
|
||||
gpus[1].FreeMemory += memoryLayerOutput
|
||||
}
|
||||
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
|
||||
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
|
||||
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
|
||||
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
|
||||
var layerSums uint64
|
||||
for _, b := range estimate.GPUSizes {
|
||||
layerSums += b
|
||||
}
|
||||
if estimate.Layers < inputLayerCount+1 {
|
||||
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||
} else {
|
||||
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index ba1ecf0e..cead57cc 100644
|
||||
index 73ff0e85..6adb1a92 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
diff --git a/common/common.h b/common/common.h
|
||||
index d80344f2..71e84834 100644
|
||||
index 58ed72f4..0bb2605e 100644
|
||||
--- a/common/common.h
|
||||
+++ b/common/common.h
|
||||
@@ -174,6 +174,13 @@ struct gpt_params {
|
||||
// multimodal models (see examples/llava)
|
||||
@@ -180,6 +180,13 @@ struct gpt_params {
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
+
|
||||
|
||||
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
+ // If the provided progress_callback returns true, model loading continues.
|
||||
+ // If it returns false, model loading is immediately aborted.
|
||||
+ llama_progress_callback progress_callback = NULL;
|
||||
+ // context pointer passed to the progress callback
|
||||
+ void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
+
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 40d2ec2c..74f3ee9c 100644
|
||||
index 61948751..4b72a293 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
|
||||
@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
|
||||
|
||||
// for now, only BPE models have pre-tokenizers
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
|
||||
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
- } else if (
|
||||
+ if (
|
||||
tokenizer_pre == "default") {
|
||||
- } else if (tokenizer_pre == "default") {
|
||||
+ if (tokenizer_pre == "default") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "smaug-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "poro-chat") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
|
||||
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
|
||||
}
|
||||
|
||||
// glob payloadsDir for files that start with ollama_
|
||||
pattern := filepath.Join(payloadsDir, "*")
|
||||
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
|
||||
|
||||
files, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
|
||||
servers := make(map[string]string)
|
||||
for _, file := range files {
|
||||
slog.Debug("availableServers : found", "file", file)
|
||||
servers[filepath.Base(file)] = file
|
||||
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
|
||||
}
|
||||
|
||||
return servers
|
||||
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||
// glob workDir for files that start with ollama_
|
||||
availableServers := availableServers()
|
||||
requested := info.Library
|
||||
if info.Variant != "" {
|
||||
requested += "_" + info.Variant
|
||||
if info.Variant != gpu.CPUCapabilityNone {
|
||||
requested += "_" + info.Variant.String()
|
||||
}
|
||||
|
||||
servers := []string{}
|
||||
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||
|
||||
// Load up the best CPU variant if not primary requested
|
||||
if info.Library != "cpu" {
|
||||
variant := gpu.GetCPUVariant()
|
||||
variant := gpu.GetCPUCapability()
|
||||
// If no variant, then we fall back to default
|
||||
// If we have a variant, try that if we find an exact match
|
||||
// Attempting to run the wrong CPU instructions will panic the
|
||||
// process
|
||||
if variant != "" {
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
servers = append(servers, cmp)
|
||||
break
|
||||
}
|
||||
@@ -146,11 +146,11 @@ func serverForCpu() string {
|
||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||
return "metal"
|
||||
}
|
||||
variant := gpu.GetCPUVariant()
|
||||
variant := gpu.GetCPUCapability()
|
||||
availableServers := availableServers()
|
||||
if variant != "" {
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
return cmp
|
||||
}
|
||||
}
|
||||
|
||||
133
llm/server.go
133
llm/server.go
@@ -37,8 +37,9 @@ type LlamaServer interface {
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
EstimatedVRAM() uint64
|
||||
EstimatedVRAM() uint64 // Total VRAM across all GPUs
|
||||
EstimatedTotal() uint64
|
||||
EstimatedVRAMByGPU(gpuID string) uint64
|
||||
}
|
||||
|
||||
// llmServer is an instance of the llama.cpp server
|
||||
@@ -49,13 +50,12 @@ type llmServer struct {
|
||||
status *StatusWriter
|
||||
options api.Options
|
||||
|
||||
// TODO - this should be broken down by GPU
|
||||
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
|
||||
estimatedTotal uint64 // Total size of model
|
||||
totalLayers uint64
|
||||
gpuCount int
|
||||
loadDuration time.Duration // Record how long it took the model to load
|
||||
loadProgress float32
|
||||
estimate MemoryEstimate
|
||||
totalLayers uint64
|
||||
// gpuCount int
|
||||
gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
|
||||
loadDuration time.Duration // Record how long it took the model to load
|
||||
loadProgress float32
|
||||
|
||||
sem *semaphore.Weighted
|
||||
}
|
||||
@@ -80,43 +80,45 @@ func LoadModel(model string) (*GGML, error) {
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
|
||||
var err error
|
||||
var cpuRunner string
|
||||
var estimatedVRAM uint64
|
||||
var estimatedTotal uint64
|
||||
var systemMemory uint64
|
||||
gpuCount := len(gpus)
|
||||
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
|
||||
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
|
||||
var estimate MemoryEstimate
|
||||
var systemTotalMemory uint64
|
||||
var systemFreeMemory uint64
|
||||
|
||||
cpuRunner = serverForCpu()
|
||||
gpuCount = 0
|
||||
_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
systemMemInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
if gpus[0].Library == "metal" {
|
||||
memInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemMemory = memInfo.TotalMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
|
||||
}
|
||||
}
|
||||
var layers int
|
||||
layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
systemTotalMemory = systemMemInfo.TotalMemory
|
||||
systemFreeMemory = systemMemInfo.FreeMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
|
||||
}
|
||||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
if opts.NumGPU == 0 {
|
||||
gpus = gpu.GetCPUInfo()
|
||||
}
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
cpuRunner = serverForCpu()
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
} else {
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
||||
switch {
|
||||
case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
|
||||
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
||||
// disable partial offloading when model is greater than total system memory as this
|
||||
// can lead to locking up the system
|
||||
opts.NumGPU = 0
|
||||
case gpus[0].Library != "metal" && layers == 0:
|
||||
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
||||
// Don't bother loading into the GPU if no layers can fit
|
||||
cpuRunner = serverForCpu()
|
||||
gpuCount = 0
|
||||
case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
|
||||
opts.NumGPU = layers
|
||||
gpus = gpu.GetCPUInfo()
|
||||
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||
opts.NumGPU = estimate.Layers
|
||||
}
|
||||
}
|
||||
|
||||
estimate.log()
|
||||
|
||||
// Loop through potential servers
|
||||
finalErr := errors.New("no suitable llama servers found")
|
||||
|
||||
@@ -201,7 +203,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
if g.Library == "metal" &&
|
||||
uint64(opts.NumGPU) > 0 &&
|
||||
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
||||
opts.UseMMap = false
|
||||
opts.UseMMap = api.TriStateFalse
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,7 +211,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
params = append(params, "--flash-attn")
|
||||
}
|
||||
|
||||
if !opts.UseMMap {
|
||||
// Windows CUDA should not use mmap for best performance
|
||||
// Linux with a model larger than free space, mmap leads to thrashing
|
||||
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
|
||||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
|
||||
opts.UseMMap == api.TriStateFalse {
|
||||
params = append(params, "--no-mmap")
|
||||
}
|
||||
|
||||
@@ -232,6 +238,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
|
||||
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
|
||||
|
||||
if estimate.TensorSplit != "" {
|
||||
params = append(params, "--tensor-split", estimate.TensorSplit)
|
||||
}
|
||||
|
||||
if estimate.TensorSplit != "" {
|
||||
params = append(params, "--tensor-split", estimate.TensorSplit)
|
||||
}
|
||||
|
||||
for i := range len(servers) {
|
||||
dir := availableServers[servers[i]]
|
||||
if dir == "" {
|
||||
@@ -242,8 +256,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
}
|
||||
|
||||
if strings.HasPrefix(servers[i], "cpu") {
|
||||
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
|
||||
gpuCount = 0
|
||||
gpus = gpu.GetCPUInfo()
|
||||
}
|
||||
|
||||
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
|
||||
@@ -265,8 +278,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
if runtime.GOOS == "windows" {
|
||||
pathEnv = "PATH"
|
||||
}
|
||||
// prepend the server directory to LD_LIBRARY_PATH/PATH
|
||||
libraryPaths := []string{dir}
|
||||
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
|
||||
libraryPaths := []string{dir, filepath.Dir(dir)}
|
||||
|
||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||
// Append our runner directory to the path
|
||||
@@ -299,22 +312,25 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
}
|
||||
|
||||
s := &llmServer{
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
estimatedVRAM: estimatedVRAM,
|
||||
estimatedTotal: estimatedTotal,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
totalLayers: ggml.KV().BlockCount() + 1,
|
||||
gpuCount: gpuCount,
|
||||
done: make(chan error, 1),
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
estimate: estimate,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
totalLayers: ggml.KV().BlockCount() + 1,
|
||||
gpus: gpus,
|
||||
done: make(chan error, 1),
|
||||
}
|
||||
|
||||
s.cmd.Env = os.Environ()
|
||||
s.cmd.Stdout = os.Stdout
|
||||
s.cmd.Stderr = s.status
|
||||
|
||||
envWorkarounds := [][2]string{}
|
||||
for _, gpu := range gpus {
|
||||
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
|
||||
}
|
||||
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
|
||||
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
||||
|
||||
@@ -329,6 +345,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
|
||||
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
|
||||
devicesNeeded = false
|
||||
} else if len(envWorkarounds) != 0 {
|
||||
for _, kv := range envWorkarounds {
|
||||
if strings.EqualFold(cmp[0], kv[0]) {
|
||||
s.cmd.Env[i] = kv[0] + "=" + kv[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if pathNeeded {
|
||||
@@ -1004,11 +1026,20 @@ func (s *llmServer) Close() error {
|
||||
}
|
||||
|
||||
func (s *llmServer) EstimatedVRAM() uint64 {
|
||||
return s.estimatedVRAM
|
||||
return s.estimate.VRAMSize
|
||||
}
|
||||
|
||||
func (s *llmServer) EstimatedTotal() uint64 {
|
||||
return s.estimatedTotal
|
||||
return s.estimate.TotalSize
|
||||
}
|
||||
|
||||
func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
|
||||
for i, gpu := range s.gpus {
|
||||
if gpu.ID == gpuID {
|
||||
return s.estimate.GPUSizes[i]
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func parseDurationMs(ms float64) time.Duration {
|
||||
|
||||
Reference in New Issue
Block a user