diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 29adf56f..13d1c957 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -58,6 +58,7 @@ jobs: runs-on: ${{ matrix.os }} env: GOARCH: ${{ matrix.arch }} + CGO_ENABLED: '1' steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 @@ -79,6 +80,7 @@ jobs: - run: go generate -x ./... if: ${{ ! startsWith(matrix.os, 'windows-') }} name: 'Unix Go Generate' + - run: go build . - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries diff --git a/docs/development.md b/docs/development.md index 2a6886a4..cd6c41af 100644 --- a/docs/development.md +++ b/docs/development.md @@ -104,7 +104,7 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H, you might use: ``` -OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./... +OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./... go build . ``` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index bbb77183..484c4b6c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -89,3 +89,8 @@ Sometimes the Ollama can have difficulties initializing the GPU. When you check If none of those resolve the problem, gather additional information and file an issue: - Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` + + +## Windows Terminal Errors + +Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in long strings of control characters like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`. To resolve this problem, please update to Windows 10 22H1 or newer.
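For context on the new troubleshooting entry: `←[?25h` / `←[?25l` are the VT escape sequences for showing and hiding the cursor, printed literally by console hosts that do not interpret virtual-terminal output. The following is not part of the patch — it is a minimal, hypothetical Go sketch (assuming the real `golang.org/x/sys/windows` package) of how a console program opts in to VT processing; on affected Windows 10 builds the `SetConsoleMode` call is the kind of operation that fails with `The parameter is incorrect`.

```go
//go:build windows

// Illustrative only; not taken from the Ollama code base.
package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

func main() {
	// Get the console handle for stdout.
	h, err := windows.GetStdHandle(windows.STD_OUTPUT_HANDLE)
	if err != nil {
		panic(err)
	}
	var mode uint32
	if err := windows.GetConsoleMode(h, &mode); err != nil {
		panic(err)
	}
	// Ask the console host to interpret VT escape sequences instead of
	// printing them verbatim. Consoles without this flag render output
	// such as `←[?25l` as literal text.
	if err := windows.SetConsoleMode(h, mode|windows.ENABLE_VIRTUAL_TERMINAL_PROCESSING); err != nil {
		fmt.Println("VT processing not available:", err)
		return
	}
	fmt.Println("\x1b[32mVT escape sequences are rendered correctly\x1b[0m")
}
```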
diff --git a/docs/windows.md b/docs/windows.md index abc0eb30..69c2aa6d 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -19,7 +19,7 @@ Logs will often be helpful in diagnosing the problem (see ## System Requirements -* Windows 10 or newer, Home or Pro +* Windows 10 22H2 or newer, Home or Pro * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index db7d52dc..c300244f 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -3,11 +3,14 @@ set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) -install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> ) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +install(TARGETS ollama_llama_server ggml llama + RUNTIME DESTINATION "${CMAKE_BINARY_DIR}/bin" + LIBRARY DESTINATION "${CMAKE_BINARY_DIR}/bin" + COMPONENT ollama_llama_server) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 09970599..00a15b4a 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1382,12 +1382,50 @@ struct llama_server_context } } + std::string common_prefix(const std::string& str1, const std::string& str2) { + auto mismatch_pair = std::mismatch(str1.begin(), str1.end(), str2.begin()); + return std::string(str1.begin(), mismatch_pair.first); + } + + // Find the slot that has the greatest common prefix + server_slot *prefix_slot(const json &prompt) { + if (!prompt.is_string()) { + return nullptr; + } + + std::string prompt_str = prompt.get<std::string>(); + server_slot *slot = nullptr; + size_t longest = 0; + + for (server_slot &s : slots) { + if (s.available() && s.prompt.is_string()) { + std::string s_prompt = s.prompt.get<std::string>(); + std::string prefix = common_prefix(s_prompt, prompt_str); + + if (prefix.size() > longest) { + slot = &s; + longest = prefix.size(); + } + } + } + + if (!slot) { + return get_slot(-1); + } + + LOG_INFO("slot with common prefix found", {{ + "slot_id", slot->id, + "characters", longest + }}); + return slot; + } + void process_single_task(task_server& task) { switch (task.type) { case TASK_TYPE_COMPLETION: { - server_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + server_slot *slot = prefix_slot(task.data["prompt"]); if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index da1b0688..23feaf99 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -81,6 +81,7 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 + cmake --install ${BUILD_DIR} --component ollama_llama_server } compress() { diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 721a9ae8..02577545 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,16 +18,16 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin
-DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off" +COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" case "${GOARCH}" in "amd64") - COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" + COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off" # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build @@ -37,7 +37,7 @@ case "${GOARCH}" in # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu" echo "Building LCD CPU" build @@ -49,7 +49,7 @@ case "${GOARCH}" in # Approximately 400% faster than LCD on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" echo "Building AVX CPU" build @@ -61,7 +61,7 @@ case "${GOARCH}" in # Approximately 10% faster than AVX on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" @@ -75,14 +75,14 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then init_vars - 
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/metal" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" build diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 43ad5b81..da1bab1b 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -60,7 +60,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off" +COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -73,7 +73,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}_static" echo "Building static library" build @@ -93,22 +93,22 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 - # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer - # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) - # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen - # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver + # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer + # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) + # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen + # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver # Note: the following seem to yield slower results than AVX2 - ymmv - # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT) - # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake - # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake + # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT) + # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake + # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off" + COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off 
${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build @@ -125,7 +125,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # Approximately 400% faster than LCD on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build @@ -138,7 +138,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # Approximately 10% faster than AVX on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build @@ -179,15 +179,15 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then # # CUDA compute < 6.0 lacks proper FP16 support on ARM. # Disabling has minimal performance effect while maintaining compatibility. - ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off" + ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off" fi # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\"" - CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat" fi CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" @@ -225,7 +225,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then init_vars source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI CC=icx - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it @@ -263,7 +263,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. 
|| true) fi init_vars - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\"" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 19a967e4..a4b6c14a 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -51,8 +51,8 @@ function init_vars { } $script:cmakeDefs = @( "-DBUILD_SHARED_LIBS=on", - "-DLLAMA_NATIVE=off", - "-DLLAMA_OPENMP=off" + "-DGGML_NATIVE=off", + "-DGGML_OPENMP=off" ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() @@ -194,9 +194,9 @@ function cleanup { } -# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer -# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen -# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver +# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer +# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen +# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver function build_static() { @@ -216,13 +216,13 @@ function build_static() { "-DCMAKE_C_COMPILER=gcc.exe", "-DCMAKE_CXX_COMPILER=g++.exe", "-DBUILD_SHARED_LIBS=off", - "-DLLAMA_NATIVE=off", - "-DLLAMA_AVX=off", - "-DLLAMA_AVX2=off", - "-DLLAMA_AVX512=off", - "-DLLAMA_F16C=off", - "-DLLAMA_FMA=off", - "-DLLAMA_OPENMP=off") + "-DGGML_NATIVE=off", + "-DGGML_AVX=off", + "-DGGML_AVX2=off", + "-DGGML_AVX512=off", + "-DGGML_F16C=off", + "-DGGML_FMA=off", + "-DGGML_OPENMP=off") $script:buildDir="../build/windows/${script:ARCH}_static" write-host "Building static library" build @@ -236,7 +236,7 @@ function build_cpu($gen_arch) { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { # remaining llama.cpp builds use MSVC init_vars - $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs + $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs $script:buildDir="../build/windows/${script:ARCH}/cpu" $script:distDir="$script:DIST_BASE\cpu" write-host "Building LCD CPU" @@ -251,7 +251,7 @@ function build_cpu($gen_arch) { function build_cpu_avx() { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) { init_vars - $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs + $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs $script:buildDir="../build/windows/${script:ARCH}/cpu_avx" $script:distDir="$script:DIST_BASE\cpu_avx" 
write-host "Building AVX CPU" @@ -266,7 +266,7 @@ function build_cpu_avx() { function build_cpu_avx2() { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) { init_vars - $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs + $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2" $script:distDir="$script:DIST_BASE\cpu_avx2" write-host "Building AVX2 CPU" @@ -291,9 +291,9 @@ function build_cuda() { $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT" $script:cmakeDefs += @( "-A", "x64", - "-DLLAMA_CUDA=ON", - "-DLLAMA_AVX=on", - "-DLLAMA_AVX2=off", + "-DGGML_CUDA=ON", + "-DGGML_AVX=on", + "-DGGML_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_FLAGS=-t8", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" @@ -331,7 +331,7 @@ function build_oneapi() { $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT" $script:cmakeDefs += @( "-G", "MinGW Makefiles", - "-DLLAMA_SYCL=ON", + "-DGGML_SYCL=ON", "-DCMAKE_C_COMPILER=icx", "-DCMAKE_CXX_COMPILER=icx", "-DCMAKE_BUILD_TYPE=Release" @@ -377,10 +377,10 @@ function build_rocm() { "-G", "Ninja", "-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_CXX_COMPILER=clang++.exe", - "-DLLAMA_HIPBLAS=on", + "-DGGML_HIPBLAS=on", "-DHIP_PLATFORM=amd", - "-DLLAMA_AVX=on", - "-DLLAMA_AVX2=off", + "-DGGML_AVX=on", + "-DGGML_AVX2=off", "-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DAMDGPU_TARGETS=$(amdGPUs)", "-DGPU_TARGETS=$(amdGPUs)" diff --git a/llm/llama.cpp b/llm/llama.cpp index 7c26775a..d7fd29ff 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c +Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff diff --git a/llm/llm.go b/llm/llm.go index 2a0c4b91..98fe7f09 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,12 +1,13 @@ package llm -// #cgo CFLAGS: -Illama.cpp -// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++ -// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++ -// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++ -// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++ -// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++ -// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++ +// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include +// #cgo LDFLAGS: -lllama -lggml -lstdc++ +// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal +// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src +// #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src +// #cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src 
-L${SRCDIR}/build/windows/arm64_static/ggml/src +// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src +// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src // #include // #include "llama.h" import "C" diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff index be528609..a053c1c2 100644 --- a/llm/patches/01-load-progress.diff +++ b/llm/patches/01-load-progress.diff @@ -1,8 +1,8 @@ diff --git a/common/common.cpp b/common/common.cpp -index 73ff0e85..6adb1a92 100644 +index 2c05a4d4..927f0e3d 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & +@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; @@ -12,10 +12,10 @@ index 73ff0e85..6adb1a92 100644 mparams.kv_overrides = NULL; } else { diff --git a/common/common.h b/common/common.h -index 58ed72f4..0bb2605e 100644 +index 65c0ef81..ebca2c77 100644 --- a/common/common.h +++ b/common/common.h -@@ -180,6 +180,13 @@ struct gpt_params { +@@ -184,6 +184,13 @@ struct gpt_params { std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) @@ -26,6 +26,6 @@ index 58ed72f4..0bb2605e 100644 + // context pointer passed to the progress callback + void * progress_callback_user_data; + - // server params - int32_t port = 8080; // server listens on this network port - int32_t timeout_read = 600; // http read timeout in seconds + // embedding + bool embedding = false; // get only sentence embedding + int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) diff --git a/llm/patches/03-load_exception.diff b/llm/patches/03-load_exception.diff index eb245c2a..02666196 100644 --- a/llm/patches/03-load_exception.diff +++ b/llm/patches/03-load_exception.diff @@ -1,17 +1,8 @@ -From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Thu, 23 May 2024 11:18:45 -0700 -Subject: [PATCH] throw exception on load errors - ---- - llama.cpp | 25 ++++++++++++++++--------- - 1 file changed, 16 insertions(+), 9 deletions(-) - -diff --git a/llama.cpp b/llama.cpp -index 15c66077..8ba90b6a 100644 ---- a/llama.cpp -+++ b/llama.cpp -@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam +diff --git a/src/llama.cpp b/src/llama.cpp +index 73f52435..58a00fb1 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); @@ -20,7 +11,7 @@ index 15c66077..8ba90b6a 100644 } return 0; -@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file( +@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file( } model->rpc_servers.push_back(servers); } @@ -52,6 +43,3 @@ index 15c66077..8ba90b6a 100644 } return model; --- -2.45.1 - diff --git a/llm/patches/04-metal.diff b/llm/patches/04-metal.diff index f8fa7db7..e63732e7 100644 --- a/llm/patches/04-metal.diff +++ 
b/llm/patches/04-metal.diff @@ -1,7 +1,7 @@ -diff --git a/ggml-metal.m b/ggml-metal.m +diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 0207b787..b5e9884b 100644 ---- a/ggml-metal.m -+++ b/ggml-metal.m +--- a/ggml/src/ggml-metal.m ++++ b/ggml/src/ggml-metal.m @@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute( // to the matrix-vector kernel int ne11_mm_min = 1; diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 2a2e7306..f4eaced7 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ -diff --git a/llama.cpp b/llama.cpp -index 61948751..4b72a293 100644 ---- a/llama.cpp -+++ b/llama.cpp -@@ -4824,16 +4824,7 @@ static void llm_load_vocab( +diff --git a/src/llama.cpp b/src/llama.cpp +index 73f52435..2b81b4bd 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -5092,16 +5092,7 @@ static void llm_load_vocab( // for now, only BPE models have pre-tokenizers if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { @@ -20,13 +20,13 @@ index 61948751..4b72a293 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -4888,7 +4879,8 @@ static void llm_load_vocab( - tokenizer_pre == "poro-chat") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; +@@ -5164,7 +5155,8 @@ static void llm_load_vocab( + tokenizer_pre == "jais") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } - } else { + } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff index d7b0c155..1c7109f6 100644 --- a/llm/patches/06-qwen2.diff +++ b/llm/patches/06-qwen2.diff @@ -1,7 +1,7 @@ -diff --git a/llama.cpp b/llama.cpp +diff --git a/src/llama.cpp b/src/llama.cpp index 40d2ec2c..f34eb79a 100644 ---- a/llama.cpp -+++ b/llama.cpp +--- a/src/llama.cpp ++++ b/src/llama.cpp @@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/07-embeddings.diff new file mode 100644 index 00000000..a84e3b06 --- /dev/null +++ b/llm/patches/07-embeddings.diff @@ -0,0 +1,45 @@ +diff --git a/src/llama.cpp b/src/llama.cpp +index 1fe2b9f7..a43312a7 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead +- const bool has_logits = !cparams.embeddings; ++ const bool has_logits = cparams.causal_attn; + const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)); + + const size_t logits_size = has_logits ? 
n_vocab*n_outputs_max : 0; +@@ -13959,17 +13959,25 @@ static int llama_decode_internal( + // no output + res = nullptr; + embd = nullptr; +- } else if (cparams.embeddings) { +- res = nullptr; // do not extract logits for embedding case +- embd = gf->nodes[gf->n_nodes - 1]; +- if (strcmp(embd->name, "result_embd_pooled") != 0) { +- embd = gf->nodes[gf->n_nodes - 2]; ++ } ++ ++ if (cparams.embeddings) { ++ for (int i = gf->n_nodes - 1; i >= 0; --i) { ++ embd = gf->nodes[i]; ++ if (strcmp(embd->name, "result_embd_pooled") == 0) { ++ break; ++ } + } + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); +- } else { ++ } else { + embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); + } ++ ++ if (!cparams.causal_attn) { ++ res = nullptr; // do not extract logits when not needed ++ } ++ + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched, gf); diff --git a/llm/patches/07-gemma.diff b/llm/patches/07-gemma.diff deleted file mode 100644 index 86eac3d1..00000000 --- a/llm/patches/07-gemma.diff +++ /dev/null @@ -1,305 +0,0 @@ -From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001 -From: Ollama maintainers -Date: Wed, 26 Jun 2024 16:18:09 -0700 -Subject: [PATCH] Architecture support - ---- - llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++- - 1 file changed, 193 insertions(+), 1 deletion(-) - -diff --git a/llama.cpp b/llama.cpp -index 61948751..3b4196f5 100644 ---- a/llama.cpp -+++ b/llama.cpp -@@ -217,6 +217,7 @@ enum llm_arch { - LLM_ARCH_INTERNLM2, - LLM_ARCH_MINICPM, - LLM_ARCH_GEMMA, -+ LLM_ARCH_GEMMA2, - LLM_ARCH_STARCODER2, - LLM_ARCH_MAMBA, - LLM_ARCH_XVERSE, -@@ -255,6 +256,7 @@ static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_GEMMA, "gemma" }, -+ { LLM_ARCH_GEMMA2, "gemma2" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, -@@ -464,10 +466,12 @@ enum llm_tensor { - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_NORM_2, - LLM_TENSOR_ATTN_OUT_NORM, -+ LLM_TENSOR_ATTN_POST_NORM, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_GATE_INP_SHEXP, - LLM_TENSOR_FFN_NORM, -+ LLM_TENSOR_FFN_POST_NORM, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, -@@ -960,6 +964,24 @@ static const std::map> LLM_TENSOR_NA - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, -+ { -+ LLM_ARCH_GEMMA2, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, -+ }, -+ }, - { - LLM_ARCH_STARCODER2, - { -@@ -1941,6 +1963,8 @@ enum e_model { - MODEL_8x22B, - MODEL_16x12B, - MODEL_10B_128x3_66B, -+ MODEL_9B, -+ MODEL_27B, - }; - - static const size_t kiB = 1024; -@@ -2114,6 +2138,7 @@ struct 
llama_layer { - struct ggml_tensor * attn_out_norm_b; - struct ggml_tensor * attn_q_a_norm; - struct ggml_tensor * attn_kv_a_norm; -+ struct ggml_tensor * attn_post_norm; - - // attention - struct ggml_tensor * wq; -@@ -2136,6 +2161,7 @@ struct llama_layer { - // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; -+ struct ggml_tensor * ffn_post_norm; - struct ggml_tensor * layer_out_norm; - struct ggml_tensor * layer_out_norm_b; - struct ggml_tensor * ffn_norm_exps; -@@ -4529,6 +4555,16 @@ static void llm_load_hparams( - } - } break; - case LLM_ARCH_GEMMA: -+ { -+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -+ -+ switch (hparams.n_layer) { -+ case 18: model.type = e_model::MODEL_9B; break; -+ case 28: model.type = e_model::MODEL_27B; break; -+ default: model.type = e_model::MODEL_UNKNOWN; -+ } -+ } break; -+ case LLM_ARCH_GEMMA2: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - -@@ -6305,6 +6341,40 @@ static bool llm_load_tensors( - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - } - } break; -+ case LLM_ARCH_GEMMA2: -+ { -+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); -+ -+ // output -+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); -+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading -+ -+ const int64_t n_ff = hparams.n_ff; -+ const int64_t n_embd_head_k = hparams.n_embd_head_k; -+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); -+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); -+ -+ for (uint32_t i = 0; i < n_layer; ++i) { -+ ggml_context * ctx_layer = ctx_for_layer(i); -+ ggml_context * ctx_split = ctx_for_layer_split(i); -+ -+ auto & layer = model.layers[i]; -+ -+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); -+ -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); -+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}); -+ -+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); -+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); -+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); -+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); -+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}); -+ } -+ } break; - case LLM_ARCH_STARCODER2: - { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); -@@ -10614,6 +10684,123 @@ struct llm_build_context { - return gf; - } - -+ struct ggml_cgraph * build_gemma2() { -+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); -+ -+ const int64_t 
n_embd_head_k = hparams.n_embd_head_k; -+ -+ struct ggml_tensor * cur; -+ struct ggml_tensor * inpL; -+ -+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); -+ -+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); -+ cb(inpL, "inp_scaled", -1); -+ -+ // inp_pos - contains the positions -+ struct ggml_tensor * inp_pos = build_inp_pos(); -+ -+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) -+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); -+ -+ for (int il = 0; il < n_layer; ++il) { -+ // norm -+ cur = llm_build_norm(ctx0, inpL, hparams, -+ model.layers[il].attn_norm, NULL, -+ LLM_NORM_RMS, cb, il); -+ cb(cur, "attn_norm", il); -+ -+ // self-attention -+ { -+ // compute Q and K and RoPE them -+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); -+ cb(Qcur, "Qcur", il); -+ -+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); -+ cb(Kcur, "Kcur", il); -+ -+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); -+ cb(Vcur, "Vcur", il); -+ -+ Qcur = ggml_rope_ext( -+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, -+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); -+ cb(Qcur, "Qcur_scaled", il); -+ -+ Kcur = ggml_rope_ext( -+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, -+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow); -+ cb(Kcur, "Kcur", il); -+ -+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, -+ model.layers[il].wo, NULL, -+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); -+ } -+ -+ if (il == n_layer - 1) { -+ // skip computing output for unused tokens -+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); -+ cur = ggml_get_rows(ctx0, cur, inp_out_ids); -+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); -+ } -+ -+ cur = llm_build_norm(ctx0, cur, hparams, -+ model.layers[il].attn_post_norm, NULL, -+ LLM_NORM_RMS, cb, il); -+ cb(cur, "attn_post_norm", il); -+ -+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); -+ cb(sa_out, "sa_out", il); -+ -+ cur = llm_build_norm(ctx0, sa_out, hparams, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, cb, il); -+ cb(cur, "ffn_norm", il); -+ -+ // feed-forward network -+ { -+ cur = llm_build_ffn(ctx0, cur, -+ model.layers[il].ffn_up, NULL, -+ model.layers[il].ffn_gate, NULL, -+ model.layers[il].ffn_down, NULL, -+ NULL, -+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il); -+ cb(cur, "ffn_out", il); -+ } -+ -+ cur = llm_build_norm(ctx0, cur, hparams, -+ model.layers[il].ffn_post_norm, NULL, -+ LLM_NORM_RMS, cb, -1); -+ cb(cur, "ffn_post_norm", -1); -+ -+ cur = ggml_add(ctx0, cur, sa_out); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } -+ -+ cur = inpL; -+ -+ cur = llm_build_norm(ctx0, cur, hparams, -+ model.output_norm, NULL, -+ LLM_NORM_RMS, cb, -1); -+ cb(cur, "result_norm", -1); -+ -+ // lm_head -+ cur = ggml_mul_mat(ctx0, model.output, cur); -+ cb(cur, "result_output", -1); -+ -+ ggml_build_forward_expand(gf, cur); -+ -+ return gf; -+ } -+ - struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - -@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph( - { - result = llm.build_gemma(); - } break; -+ 
case LLM_ARCH_GEMMA2: -+ { -+ result = llm.build_gemma2(); -+ } break; - case LLM_ARCH_STARCODER2: - { - result = llm.build_starcoder2(); -@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { - case LLM_ARCH_PHI2: - case LLM_ARCH_PHI3: - case LLM_ARCH_GEMMA: -+ case LLM_ARCH_GEMMA2: - case LLM_ARCH_STARCODER2: - case LLM_ARCH_GPTNEOX: - return LLAMA_ROPE_TYPE_NEOX; -@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal( - if (add_ass) { - ss << "assistant\n"; - } -- } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) { -+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { - // google/gemma-7b-it - std::string system_prompt = ""; - for (auto message : chat) { --- -2.45.2 - diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/08-clip-unicode.diff new file mode 100644 index 00000000..53e5ee11 --- /dev/null +++ b/llm/patches/08-clip-unicode.diff @@ -0,0 +1,42 @@ +diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp +index 95fbe3d0..5a02a6ec 100644 +--- a/examples/llava/clip.cpp ++++ b/examples/llava/clip.cpp +@@ -32,6 +33,14 @@ + #include + #include + ++#if defined(_WIN32) ++#define WIN32_LEAN_AND_MEAN ++#ifndef NOMINMAX ++ #define NOMINMAX ++#endif ++#include ++#endif ++ + //#define CLIP_DEBUG_FUNCTIONS + + // RGB uint8 image +@@ -1055,7 +1064,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + return nullptr; + } + ++#ifdef _WIN32 ++ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); ++ if (!wlen) { ++ return NULL; ++ } ++ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); ++ wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); ++ if (!wlen) { ++ free(wbuf); ++ return NULL; ++ } ++ auto fin = std::ifstream(wbuf, std::ios::binary); ++ free(wbuf); ++#else + auto fin = std::ifstream(fname, std::ios::binary); ++#endif + if (!fin) { + LOG_TEE("cannot open model file for loading tensors\n"); + clip_free(new_clip); diff --git a/llm/patches/09-pooling.diff b/llm/patches/09-pooling.diff new file mode 100644 index 00000000..2e4fe11e --- /dev/null +++ b/llm/patches/09-pooling.diff @@ -0,0 +1,60 @@ +diff --git a/src/llama.cpp b/src/llama.cpp +index 721b8f4e..cfe7ac40 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -8420,14 +8420,14 @@ struct llm_build_context { + } + + struct ggml_tensor * build_inp_mean() { +- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); ++ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); + cb(lctx.inp_mean, "inp_mean", -1); + ggml_set_input(lctx.inp_mean); + return lctx.inp_mean; + } + + struct ggml_tensor * build_inp_cls() { +- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ++ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); + cb(lctx.inp_cls, "inp_cls", -1); + ggml_set_input(lctx.inp_cls); + return lctx.inp_cls; +@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + + float * data = (float *) lctx.inp_mean->data; +- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); ++ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); + + std::vector sum(n_tokens, 0); + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; +- +- GGML_ASSERT(seq_id < 
n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); +- + sum[seq_id] += 1; + } + +- std::vector div(n_tokens, 0.0f); +- for (int i = 0; i < n_tokens; ++i) { ++ std::vector div(cparams.n_seq_max, 0.0f); ++ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); +@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + + uint32_t * data = (uint32_t *) lctx.inp_cls->data; +- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); ++ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + const llama_pos pos = batch.pos[i]; +- +- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); +- + if (pos == 0) { + data[seq_id] = i; + } diff --git a/server/sched.go b/server/sched.go index dc492cfb..8c054c6b 100644 --- a/server/sched.go +++ b/server/sched.go @@ -139,6 +139,11 @@ func (s *Scheduler) processPending(ctx context.Context) { } for { + cpus := s.getCpuFn() + var systemMem gpu.GpuInfo + if len(cpus) > 0 { + systemMem = cpus[0] + } var runnerToExpire *runnerRef s.loadedMu.Lock() runner := s.loaded[pending.model.ModelPath] @@ -192,6 +197,27 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Block attempting to load a model larger than system memory + GPU memory + estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) + maxSize := systemMem.FreeMemory + for _, gpu := range gpus { + if gpu.Library == "cpu" { + continue + } + if loadedCount == 0 { + // If no other models are loaded, set the limit based on what's available + maxSize += gpu.FreeMemory + } else { + // Other models could be unloaded, favor total memory for limit + maxSize += gpu.TotalMemory + } + } + if estimate.TotalSize > maxSize { + slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) + pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) + break + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode diff --git a/server/sched_test.go b/server/sched_test.go index d957927e..3fbd188a 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -199,6 +199,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario1a.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario1a.req.errCh) + case err := <-scenario1a.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } @@ -212,6 +214,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario1a.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario1b.req.errCh) + case err := <-scenario1b.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } @@ -230,6 +234,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario2a.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario2a.req.errCh) + case err := <-scenario2a.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } @@ 
-246,6 +252,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario3a.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3a.req.errCh) + case err := <-scenario3a.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } @@ -262,6 +270,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario3b.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3b.req.errCh) + case err := <-scenario3b.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } @@ -278,6 +288,8 @@ func TestRequests(t *testing.T) { require.Equal(t, resp.llama, scenario3c.srv) require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3c.req.errCh) + case err := <-scenario3c.req.errCh: + t.Fatal(err.Error()) case <-ctx.Done(): t.Fatal("timeout") } diff --git a/types/model/name.go b/types/model/name.go index e645a844..5e475687 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -91,7 +91,6 @@ type Name struct { Namespace string Model string Tag string - RawDigest string } // ParseName parses and assembles a Name from a name string. The @@ -143,11 +142,6 @@ func ParseNameBare(s string) Name { var n Name var promised bool - s, n.RawDigest, promised = cutLast(s, "@") - if promised && n.RawDigest == "" { - n.RawDigest = MissingPart - } - // "/" is an illegal tag character, so we can use it to split the host if strings.LastIndex(s, ":") > strings.LastIndex(s, "/") { s, n.Tag, _ = cutPromised(s, ":") @@ -222,10 +216,6 @@ func (n Name) String() string { b.WriteByte(':') b.WriteString(n.Tag) } - if n.RawDigest != "" { - b.WriteByte('@') - b.WriteString(n.RawDigest) - } return b.String() } @@ -250,16 +240,18 @@ func (n Name) DisplayShortest() string { return sb.String() } -func IsValidNamespace(namespace string) bool { - return isValidPart(kindNamespace, namespace) +// IsValidNamespace reports whether the provided string is a valid +// namespace. +func IsValidNamespace(s string) bool { + return isValidPart(kindNamespace, s) } // IsValid reports whether all parts of the name are present and valid. The // digest is a special case, and is checked for validity only if present. +// +// Note: The digest check has been removed as is planned to be added back in +// at a later time. 
func (n Name) IsValid() bool { - if n.RawDigest != "" && !isValidPart(kindDigest, n.RawDigest) { - return false - } return n.IsFullyQualified() } diff --git a/types/model/name_test.go b/types/model/name_test.go index 008dd586..794d14d7 100644 --- a/types/model/name_test.go +++ b/types/model/name_test.go @@ -122,21 +122,6 @@ func TestParseNameParts(t *testing.T) { }, wantFilepath: filepath.Join(part350, part80, part80, part80), }, - { - in: "@digest", - want: Name{ - RawDigest: "digest", - }, - wantValidDigest: false, - }, - { - in: "model@sha256:123", - want: Name{ - Model: "model", - RawDigest: "sha256:123", - }, - wantValidDigest: true, - }, } for _, tt := range cases { @@ -160,22 +145,18 @@ var testCases = map[string]bool{ // name -> valid "_why/_the/_lucky:_stiff": true, // minimal - "h/n/m:t@d": true, + "h/n/m:t": true, "host/namespace/model:tag": true, "host/namespace/model": false, "namespace/model": false, "model": false, - "@sha256-1000000000000000000000000000000000000000000000000000000000000000": false, - "model@sha256-1000000000000000000000000000000000000000000000000000000000000000": false, - "model@sha256:1000000000000000000000000000000000000000000000000000000000000000": false, // long (but valid) part80 + "/" + part80 + "/" + part80 + ":" + part80: true, part350 + "/" + part80 + "/" + part80 + ":" + part80: true, - "h/nn/mm:t@sha256-1000000000000000000000000000000000000000000000000000000000000000": true, // bare minimum part sizes - "h/nn/mm:t@sha256:1000000000000000000000000000000000000000000000000000000000000000": true, // bare minimum part sizes + "h/nn/mm:t": true, // bare minimum part sizes // unqualified "m": false, @@ -196,11 +177,10 @@ var testCases = map[string]bool{ // name -> valid "@": false, // not starting with alphanum - "-hh/nn/mm:tt@dd": false, - "hh/-nn/mm:tt@dd": false, - "hh/nn/-mm:tt@dd": false, - "hh/nn/mm:-tt@dd": false, - "hh/nn/mm:tt@-dd": false, + "-hh/nn/mm:tt": false, + "hh/-nn/mm:tt": false, + "hh/nn/-mm:tt": false, + "hh/nn/mm:-tt": false, // hosts "host:https/namespace/model:tag": true, @@ -334,7 +314,7 @@ func FuzzName(f *testing.F) { f.Fuzz(func(t *testing.T, s string) { n := ParseNameBare(s) if n.IsValid() { - parts := [...]string{n.Host, n.Namespace, n.Model, n.Tag, n.RawDigest} + parts := [...]string{n.Host, n.Namespace, n.Model, n.Tag} for _, part := range parts { if part == ".." { t.Errorf("unexpected .. as valid part")