Merge branch 'ollama:main' into main

likelovewant
2024-09-27 11:44:48 +08:00
committed by GitHub
39 changed files with 454 additions and 160 deletions

View File

@@ -274,7 +274,134 @@ jobs:
path: dist/deps/*
-# Import the prior generation steps and build the final windows assets
+# windows arm64 generate, go build, and zip file (no installer)
# Output of this build is aggregated into the final x86 build
# for a unified windows installer
windows-arm64:
runs-on: windows-arm64
environment: release
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
# The current Windows arm64 beta image has effectively zero dev tools installed...
- name: Install git and gzip
run: |
Set-ExecutionPolicy Bypass -Scope Process -Force
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install Visual Studio 2022
run: |
$components = @(
"Microsoft.VisualStudio.Component.CoreEditor",
"Microsoft.VisualStudio.Workload.CoreEditor",
"Microsoft.VisualStudio.Component.Roslyn.Compiler",
"Microsoft.Component.MSBuild",
"Microsoft.VisualStudio.Component.TextTemplating",
"Microsoft.VisualStudio.Component.Debugger.JustInTime",
"Microsoft.VisualStudio.Component.VC.CoreIde",
"Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
"Microsoft.VisualStudio.Component.Windows11SDK.22621",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64EC",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64",
"Microsoft.VisualStudio.Component.VC.ATL",
"Microsoft.VisualStudio.Component.VC.ATL.ARM64",
"Microsoft.VisualStudio.Component.Graphics",
"Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
"Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
"Microsoft.VisualStudio.Component.Windows11Sdk.WindowsPerformanceToolkit",
"Microsoft.VisualStudio.Component.CppBuildInsights",
"Microsoft.VisualStudio.Component.VC.DiagnosticTools",
"Microsoft.VisualStudio.ComponentGroup.WebToolsExtensions.CMake",
"Microsoft.VisualStudio.Component.VC.CMake.Project",
"Microsoft.VisualStudio.Component.VC.ASAN",
"Microsoft.VisualStudio.Component.Vcpkg",
"Microsoft.VisualStudio.Workload.NativeDesktop"
)
$config = @{
"version" = "1.0"
"components" = $components
"extensions" = @()
}
$configPath = "${env:RUNNER_TEMP}\vsconfig"
$config | ConvertTo-Json | Out-File -FilePath $configPath
$bootstrapperFilePath = "${env:RUNNER_TEMP}\vs_community.exe"
write-host "Downloading Visual Studio 2022"
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_community.exe" -outfile $bootstrapperFilePath
$bootstrapperArgumentList = ('/c', $bootstrapperFilePath, '--config', $configPath, '--quiet', '--wait' )
write-host "Installing Visual Studio 2022"
$process = Start-Process -FilePath cmd.exe -ArgumentList $bootstrapperArgumentList -Wait -PassThru
$exitCode = $process.ExitCode
write-host $exitCode
# pacman in mingw/msys2 is ~broken on windows arm right now - hangs consistently during attempts to install
# so we'll use this alternative GCC binary
- name: Install llvm-mingw GCC
run: |
$gcc_url="https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip"
write-host "Downloading llvm-mingw"
Invoke-WebRequest -Uri "${gcc_url}" -OutFile "${env:RUNNER_TEMP}\gcc.zip"
write-host "Unpacking llvm-mingw"
expand-archive -path "${env:RUNNER_TEMP}\gcc.zip" -destinationpath "c:\"
mv c:\llvm-mingw-* c:\llvm-mingw
echo "c:\llvm-mingw\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Verify GCC
run: |
echo $env:PATH
gcc --version
- uses: actions/checkout@v4
- name: Set Version
run: |
$ver=${env:GITHUB_REF_NAME}.trim("v")
echo VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
echo $env:PATH
$env:ARCH="arm64"
.\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
name: 'Windows Build'
- uses: actions/upload-artifact@v4
with:
name: windows-arm64
path: |
dist/windows-arm64/**
dist/windows-arm64-app.exe
dist/ollama-windows-arm64.zip
# Import the prior generation steps plus the full arm64 build, and build the final windows assets
build-windows:
environment: release
runs-on: windows
@@ -282,6 +409,7 @@ jobs:
- generate-windows-cuda
- generate-windows-rocm
- generate-windows-cpu
- windows-arm64
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
@@ -339,6 +467,10 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
- uses: actions/download-artifact@v4
with:
name: windows-arm64
path: dist
- run: dir build
- run: |
$gopath=(get-command go).source | split-path -parent
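For contributors who want to reproduce the new `windows-arm64` job outside of CI, the 'Windows Build' step above boils down to a few environment tweaks before calling the build script. The following is a minimal sketch, not the workflow itself: it assumes Visual Studio 2022 Community and llvm-mingw are installed at the same locations the job uses, and the checkout path must be adjusted to your machine.

```powershell
# Sketch of a local arm64 build, mirroring the 'Windows Build' step above.
# Paths are assumptions based on the workflow's install locations; adjust as needed.
& "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
Set-Location "C:\path\to\ollama"                  # your local checkout
$env:CMAKE_SYSTEM_VERSION = "10.0.22621.0"        # same Windows 11 SDK the job targets
$env:PATH = "c:\llvm-mingw\bin;$env:PATH"         # llvm-mingw gcc, as installed above
$env:ARCH = "arm64"
.\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
```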

View File

@@ -53,10 +53,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
## Quickstart
-To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
+To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
```
-ollama run llama3.1
+ollama run llama3.2
```
## Model library
@@ -67,6 +67,8 @@ Here are some example models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
@@ -117,16 +119,16 @@ See the [guide](docs/import.md) on importing models for more information.
### Customize a prompt
-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.2` model:
```
-ollama pull llama3.1
+ollama pull llama3.2
```
Create a `Modelfile`:
```
-FROM llama3.1
+FROM llama3.2
# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
@@ -161,7 +163,7 @@ ollama create mymodel -f ./Modelfile
### Pull a model
```
-ollama pull llama3.1
+ollama pull llama3.2
```
> This command can also be used to update a local model. Only the diff will be pulled.
@@ -169,13 +171,13 @@ ollama pull llama3.1
### Remove a model
```
-ollama rm llama3.1
+ollama rm llama3.2
```
### Copy a model
```
-ollama cp llama3.1 my-model
+ollama cp llama3.2 my-model
```
### Multiline input
@@ -199,14 +201,14 @@ The image features a yellow smiley face, which is likely the central focus of th
### Pass the prompt as an argument
```
-$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
+$ ollama run llama3.2 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
-ollama show llama3.1
+ollama show llama3.2
```
### List models on your computer
@@ -224,7 +226,7 @@ ollama ps
### Stop a model which is currently running
```
-ollama stop llama3.1
+ollama stop llama3.2
```
### Start Ollama
@@ -246,7 +248,7 @@ Next, start the server:
Finally, in a separate shell, run a model:
```
-./ollama run llama3.1
+./ollama run llama3.2
```
## REST API
@@ -257,7 +259,7 @@ Ollama has a REST API for running and managing models.
```
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt":"Why is the sky blue?"
}'
```
@@ -266,7 +268,7 @@ curl http://localhost:11434/api/generate -d '{
```
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{ "role": "user", "content": "why is the sky blue?" }
]
@@ -343,6 +345,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
### Terminal
@@ -395,7 +398,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
-- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
+- [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)

View File

@@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
-ArchitecturesAllowed=x64 arm64
+ArchitecturesAllowed=x64compatible arm64
-ArchitecturesInstallIn64BitMode=x64 arm64
+ArchitecturesInstallIn64BitMode=x64compatible arm64
DefaultDirName={localappdata}\Programs\{#MyAppName}
DefaultGroupName={#MyAppName}
DisableProgramGroupPage=yes
@@ -48,6 +48,7 @@ OutputDir=..\dist\
SetupLogging=yes
CloseApplications=yes
RestartApplications=no
RestartIfNeededByRun=no
; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
WizardSmallImageFile=.\assets\setup.bmp
@@ -86,12 +87,21 @@ Name: "english"; MessagesFile: "compiler:Default.isl"
DialogFontSize=12
[Files]
-Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
+#if DirExists("..\dist\windows-amd64")
+Source: "..\dist\windows-amd64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: not IsArm64(); Flags: ignoreversion 64bit
+Source: "..\dist\windows-amd64\ollama.exe"; DestDir: "{app}"; Check: not IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: not IsArm64(); Flags: ignoreversion 64bit recursesubdirs
#endif
#if DirExists("..\dist\windows-arm64")
Source: "..\dist\windows-arm64\vc_redist.arm64.exe"; DestDir: "{tmp}"; Check: IsArm64() and vc_redist_needed(); Flags: deleteafterinstall
Source: "..\dist\windows-arm64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-arm64\ollama.exe"; DestDir: "{app}"; Check: IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-arm64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: IsArm64(); Flags: ignoreversion 64bit recursesubdirs
#endif
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
[Icons] [Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -99,6 +109,9 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
[Run] [Run]
#if DirExists("..\dist\windows-arm64")
Filename: "{tmp}\vc_redist.arm64.exe"; Parameters: "/install /passive /norestart"; Check: IsArm64() and vc_redist_needed(); StatusMsg: "Installing VC++ Redistributables..."; Flags: waituntilterminated
#endif
Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
[UninstallRun] [UninstallRun]
@@ -129,7 +142,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
;FinishedHeadingLabel=Run your first model ;FinishedHeadingLabel=Run your first model
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1 ;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.2
;ClickFinish=%n ;ClickFinish=%n
[Registry] [Registry]
@@ -154,3 +167,39 @@ begin
{ Pos() returns 0 if not found } { Pos() returns 0 if not found }
Result := Pos(';' + ExpandConstant(Param) + ';', ';' + OrigPath + ';') = 0; Result := Pos(';' + ExpandConstant(Param) + ';', ';' + OrigPath + ';') = 0;
end; end;
{ --- VC Runtime libraries discovery code - Only install vc_redist if it isn't already installed ----- }
const VCRTL_MIN_V1 = 14;
const VCRTL_MIN_V2 = 40;
const VCRTL_MIN_V3 = 33807;
const VCRTL_MIN_V4 = 0;
// check if the minimum required vc redist is installed (by looking at the registry)
function vc_redist_needed (): Boolean;
var
sRegKey: string;
v1: Cardinal;
v2: Cardinal;
v3: Cardinal;
v4: Cardinal;
begin
sRegKey := 'SOFTWARE\WOW6432Node\Microsoft\VisualStudio\14.0\VC\Runtimes\arm64';
if (RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Major', v1) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Minor', v2) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Bld', v3) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'RBld', v4)) then
begin
Log ('VC Redist version: ' + IntToStr (v1) +
'.' + IntToStr (v2) + '.' + IntToStr (v3) +
'.' + IntToStr (v4));
{ Version info was found. Return true if later or equal to our
minimal required version RTL_MIN_Vx }
Result := not (
(v1 > VCRTL_MIN_V1) or ((v1 = VCRTL_MIN_V1) and
((v2 > VCRTL_MIN_V2) or ((v2 = VCRTL_MIN_V2) and
((v3 > VCRTL_MIN_V3) or ((v3 = VCRTL_MIN_V3) and
(v4 >= VCRTL_MIN_V4)))))));
end
else
Result := TRUE;
end;
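For manually checking a machine, the same registry probe that `vc_redist_needed()` performs can be done from PowerShell. This is only a sketch: the registry key and the 14.40.33807.0 minimum come from the constants above, and the property names mirror the `Major`/`Minor`/`Bld`/`RBld` values the installer reads.

```powershell
# Sketch: report whether the installed arm64 VC++ runtime meets the installer's minimum version.
$key = 'HKLM:\SOFTWARE\WOW6432Node\Microsoft\VisualStudio\14.0\VC\Runtimes\arm64'
$minimum = [version]'14.40.33807.0'
try {
    $p = Get-ItemProperty -Path $key -ErrorAction Stop
    $installed = [version]"$($p.Major).$($p.Minor).$($p.Bld).$($p.RBld)"
    "Installed: $installed; vc_redist needed: $($installed -lt $minimum)"
} catch {
    "No arm64 VC runtime registry entry found; vc_redist needed: True"
}
```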

View File

@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
write-host "" write-host ""
write-host "Run your first model:" write-host "Run your first model:"
write-host "" write-host ""
write-host "`tollama run llama3.1" write-host "`tollama run llama3.2"
write-host "" write-host ""

View File

@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "Why is the sky blue?"
}'
```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "Why is the sky blue?",
"stream": false
}'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1"
+"model": "llama3.2"
}'
```
@@ -400,7 +400,7 @@ A single JSON object is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
@@ -415,7 +415,7 @@ If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a m
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"keep_alive": 0
}'
```
@@ -426,7 +426,7 @@ A single JSON object is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2024-09-12T03:54:03.516566Z",
"response": "",
"done": true,
@@ -472,7 +472,7 @@ Send a chat message with a streaming response.
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "user",
@@ -488,7 +488,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -503,7 +503,7 @@ Final response:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 4883583458,
@@ -521,7 +521,7 @@ Final response:
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "user",
@@ -536,7 +536,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -560,7 +560,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "user",
@@ -584,7 +584,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -598,7 +598,7 @@ Final response:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 8113331500,
@@ -656,7 +656,7 @@ curl http://localhost:11434/api/chat -d '{
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "user",
@@ -674,7 +674,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -696,7 +696,7 @@ curl http://localhost:11434/api/chat -d '{
```
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "user",
@@ -735,7 +735,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at": "2024-07-22T20:33:28.123648Z",
"message": {
"role": "assistant",
@@ -771,7 +771,7 @@ If the messages array is empty, the model will be loaded into memory.
```
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": []
}'
```
@@ -779,7 +779,7 @@ curl http://localhost:11434/api/chat -d '{
##### Response
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at":"2024-09-12T21:17:29.110811Z",
"message": {
"role": "assistant",
@@ -798,7 +798,7 @@ If the messages array is empty and the `keep_alive` parameter is set to `0`, a m
```
curl http://localhost:11434/api/chat -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [],
"keep_alive": 0
}'
@@ -810,7 +810,7 @@ A single JSON object is returned:
```json
{
-"model": "llama3.1",
+"model": "llama3.2",
"created_at":"2024-09-12T21:33:17.547535Z",
"message": {
"role": "assistant",
@@ -989,7 +989,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
-"name": "llama3.1"
+"name": "llama3.2"
}'
```
@@ -1050,7 +1050,7 @@ Copy a model. Creates a model with another name from an existing model.
```shell
curl http://localhost:11434/api/copy -d '{
-"source": "llama3.1",
+"source": "llama3.2",
"destination": "llama3-backup"
}'
```
@@ -1105,7 +1105,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
-"name": "llama3.1"
+"name": "llama3.2"
}'
```

View File

@@ -148,3 +148,22 @@ In addition to the common Windows development tools described above, install AMD
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
#### Windows arm64
The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
```powershell
import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
```
You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which are not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
```
pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
```
You will need to ensure your PATH includes go, cmake, gcc, clang, and mingw32-make (typically found in `C:\msys64\clangarm64\bin\`) to build ollama from source.
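As a convenience, that PATH requirement can be handled per shell session. The sketch below is assumption-heavy: the msys2 `clangarm64` directory comes from the instructions above, while the Go and CMake locations are typical defaults that may differ on your machine.

```powershell
# Sketch: prepend the arm64 toolchain to PATH for this session, then sanity-check the tools.
$env:PATH = "C:\msys64\clangarm64\bin;C:\Program Files\Go\bin;C:\Program Files\CMake\bin;$env:PATH"
go version
cmake --version
gcc --version        # provided by mingw-w64-clang-aarch64-gcc-compat
clang --version
mingw32-make --version
```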

View File

@@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
Now you can run a model:
```
-docker exec -it ollama ollama run llama3.1
+docker exec -it ollama ollama run llama3.2
```
### Try different models

View File

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
@@ -232,7 +232,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
To preload a model using the CLI, use the command:
```shell
-ollama run llama3.1 ""
+ollama run llama3.2 ""
```
## How do I keep a model loaded in memory or make it unload immediately?
@@ -240,7 +240,7 @@ ollama run llama3.1 ""
By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command:
```shell
-ollama stop llama3.1
+ollama stop llama3.2
```
If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
@@ -251,12 +251,12 @@ If you're using the API, use the `keep_alive` parameter with the `/api/generate`
For example, to preload a model and leave it in memory use:
```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}'
```
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
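For example, to make every model stay loaded for a day by default, you could start the server with the variable set, shown here for PowerShell as a sketch; the variable accepts the same values as `keep_alive`, including `-1` for indefinitely.

```powershell
# Sketch: start the server with a 24 hour default keep-alive.
$env:OLLAMA_KEEP_ALIVE = "24h"
ollama serve
```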

View File

@@ -50,7 +50,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:
```modelfile
-FROM llama3.1
+FROM llama3.2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
```bash
-> ollama show --modelfile llama3.1
+> ollama show --modelfile llama3.2
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama3.1:latest
+# FROM llama3.2:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
@@ -103,7 +103,7 @@ FROM <model name>:<tag>
#### Build from existing model
```modelfile
-FROM llama3.1
+FROM llama3.2
```
A list of available base models:

View File

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
-model='llama3.1',
+model='llama3.2',
)
response = client.chat.completions.create(
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
)
completion = client.completions.create(
-model="llama3.1",
+model="llama3.2",
prompt="Say this is a test",
)
list_completion = client.models.list()
-model = client.models.retrieve("llama3.1")
+model = client.models.retrieve("llama3.2")
embeddings = client.embeddings.create(
model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
-model: 'llama3.1',
+model: 'llama3.2',
})
const response = await openai.chat.completions.create({
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
})
const completion = await openai.completions.create({
-model: "llama3.1",
+model: "llama3.2",
prompt: "Say this is a test.",
})
const listCompletion = await openai.models.list()
-const model = await openai.models.retrieve("llama3.1")
+const model = await openai.models.retrieve("llama3.2")
const embedding = await openai.embeddings.create({
model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "llama3.1",
+"model": "llama3.2",
"messages": [
{
"role": "system",
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
curl http://localhost:11434/v1/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "llama3.1",
+"model": "llama3.2",
"prompt": "Say this is a test"
}'
curl http://localhost:11434/v1/models
-curl http://localhost:11434/v1/models/llama3.1
+curl http://localhost:11434/v1/models/llama3.2
curl http://localhost:11434/v1/embeddings \
-H "Content-Type: application/json" \
@@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
Before using a model, pull it locally `ollama pull`:
```shell
-ollama pull llama3.1
+ollama pull llama3.2
```
### Default model names
@@ -282,7 +282,7 @@ ollama pull llama3.1
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
-ollama cp llama3.1 gpt-3.5-turbo
+ollama cp llama3.2 gpt-3.5-turbo
```
Afterwards, this new model name can be specified in the `model` field:

View File

@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
```dockerfile
-FROM llama3.1
+FROM llama3.2
TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

View File

@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
const ollama = new Ollama({
baseUrl: "http://localhost:11434",
-model: "llama3.1",
+model: "llama3.2",
});
const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```
-That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio

View File

@@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
Here's a quick example showing API access from `powershell`
```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting

View File

@@ -35,7 +35,7 @@ func main() {
ctx := context.Background()
req := &api.ChatRequest{
-Model: "llama3.1",
+Model: "llama3.2",
Messages: messages,
}

View File

@@ -4,10 +4,10 @@ This example provides an interface for asking questions to a PDF document.
## Setup
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
```
-ollama pull llama3.1
+ollama pull llama3.2
```
2. Install the Python Requirements.

View File

@@ -51,7 +51,7 @@ while True:
template=template,
)
-llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+llm = Ollama(model="llama3.2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
llm,
retriever=vectorstore.as_retriever(),

View File

@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
## Running the Example
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
```bash
-ollama pull llama3.1
+ollama pull llama3.2
```
2. Install the Python Requirements.

View File

@@ -5,7 +5,7 @@ from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3.2")
chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.invoke(docs)

View File

@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
```bash
-ollama pull llama3.1
+ollama pull llama3.2
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3.2")
res = llm.predict(input)
print (res)

View File

@@ -1,4 +1,4 @@
-FROM llama3.1
+FROM llama3.2
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.

View File

@@ -2,12 +2,12 @@
# Example character: Mario
-This example shows how to create a basic character using Llama3.1 as the base model.
+This example shows how to create a basic character using Llama 3.2 as the base model.
To run this example:
1. Download the Modelfile
-2. `ollama pull llama3.1` to get the base model used in the model file.
+2. `ollama pull llama3.2` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:
```
-FROM llama3.1
+FROM llama3.2
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.

View File

@@ -1,14 +1,14 @@
# RAG Hallucination Checker using Bespoke-Minicheck
-This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retreived from the document and given to `llama3.1` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
+This example allows the user to ask questions related to a document, which can be specified via an article URL. Relevant chunks are retrieved from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
## Running the Example
-1. Ensure `all-minilm` (embedding) `llama3.1` (chat) and `bespoke-minicheck` (check) models installed:
+1. Ensure the `all-minilm` (embedding), `llama3.2` (chat), and `bespoke-minicheck` (check) models are installed:
```bash
ollama pull all-minilm
-ollama pull llama3.1
+ollama pull llama3.2
ollama pull bespoke-minicheck
```

View File

@@ -9,7 +9,7 @@ import nltk
warnings.filterwarnings(
"ignore", category=FutureWarning, module="transformers.tokenization_utils_base"
)
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
def getArticleText(url):
@@ -119,7 +119,7 @@ if __name__ == "__main__":
system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"
ollama_response = ollama.generate(
-model="llama3.1",
+model="llama3.2",
prompt=question,
system=system_prompt,
options={"stream": False},

View File

@@ -2,7 +2,7 @@ import requests
import json
import random
-model = "llama3.1"
+model = "llama3.2"
template = {
"firstName": "",
"lastName": "",

View File

@@ -12,7 +12,7 @@ countries = [
"France", "France",
] ]
country = random.choice(countries) country = random.choice(countries)
model = "llama3.1" model = "llama3.2"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters." prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

View File

@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
```bash
-ollama pull llama3.1
+ollama pull llama3.2
```
2. Install the Python Requirements.

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama3.1" # TODO: update this for whatever model you wish to use
+model = "llama3.2" # TODO: update this for whatever model you wish to use
def chat(messages):

View File

@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
```bash
-ollama pull llama3.1
+ollama pull llama3.2
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
import * as readline from "readline";
-const model = "llama3.1";
+const model = "llama3.2";
type Message = {
role: "assistant" | "user" | "system";
content: string;

View File

@@ -205,13 +205,16 @@ func GetGPUInfo() GpuInfoList {
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
depPath := LibraryDir()
cpus = []CPUInfo{
{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability.String(),
ID: "0",
DependencyPath: depPath,
},
},
}
@@ -224,8 +227,6 @@ func GetGPUInfo() GpuInfoList {
return GpuInfoList{cpus[0].GpuInfo}
}
-depPath := LibraryDir()
// Load ALL libraries
cHandles = initCudaHandles()

View File

@@ -10,5 +10,6 @@ target_compile_definitions(${TARGET} PRIVATE
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -37,6 +37,19 @@ function amdGPUs {
function init_vars {
write-host "Checking for cmake..."
get-command cmake
write-host "Checking for ninja..."
$d=(get-command -ea 'silentlycontinue' ninja).path
if ($null -eq $d) {
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
$matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
if ($matches.count -eq 0) {
throw "Unable to locate ninja"
}
$ninjaDir=($matches[0].FullName | split-path -parent)
$env:PATH="$env:PATH;$ninjaDir"
}
if (!$script:SRC_DIR) {
$script:SRC_DIR = $(resolve-path "..\..\")
}
@@ -163,7 +176,7 @@ function cleanup {
}
# Checkout each file
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
}
git -C "${script:llamacppDir}" checkout CMakeLists.txt
@@ -180,12 +193,12 @@ function build_static() {
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
# GCC build for direct linking into the Go binary
init_vars
-# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
-# as we need this to be compiled by gcc for golang to be able to link with it
-write-host "Checking for MinGW..."
-# error action ensures we exit on failure
+# cmake will silently fallback to msvc compilers if gcc isn't in the path, so detect and fail fast
+# as we need this to be compiled by gcc for golang to be able to link with it
+write-host "Checking for gcc..."
get-command gcc
get-command mingw32-make
$oldTargets = $script:cmakeTargets
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
@@ -209,11 +222,10 @@ function build_static() {
} }
} }
function build_cpu($gen_arch) { function build_cpu_x64 {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu" $script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu" $script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU" write-host "Building LCD CPU"
@@ -225,6 +237,32 @@ function build_cpu($gen_arch) {
     }
 }
+function build_cpu_arm64 {
+    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
+        init_vars
+        write-host "Checking for clang..."
+        get-command clang
+        $env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
+        $env:CXXFLAGS="$env:CFLAGS"
+        $env:LDFLAGS="-static-libstdc++"
+        $script:cmakeDefs = $script:commonCpuDefs + @(
+            "-DCMAKE_VERBOSE_MAKEFILE=on",
+            "-DCMAKE_C_COMPILER=clang.exe",
+            "-DCMAKE_CXX_COMPILER=clang++.exe",
+            "-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
+        ) + $script:cmakeDefs
+        $script:buildDir="../build/windows/${script:ARCH}/cpu"
+        $script:distDir="$script:DIST_BASE\cpu"
+        write-host "Building LCD CPU"
+        build
+        sign
+        install
+    } else {
+        write-host "Skipping CPU generation step as requested"
+    }
+}
 function build_cpu_avx() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
         init_vars
@@ -349,7 +387,7 @@ function build_rocm() {
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT" $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @( $script:cmakeDefs += @(
"-G", "Ninja", "-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe", "-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on", "-DGGML_HIPBLAS=on",
@@ -398,9 +436,9 @@ if ($($args.count) -eq 0) {
     apply_patches
     build_static
     if ($script:ARCH -eq "arm64") {
-        build_cpu("ARM64")
+        build_cpu_arm64
     } else { # amd64
-        build_cpu("x64")
+        build_cpu_x64
         build_cpu_avx
         build_cpu_avx2
         build_cuda
@@ -414,5 +452,5 @@ if ($($args.count) -eq 0) {
     for ( $i = 0; $i -lt $args.count; $i++ ) {
         write-host "performing $($args[$i])"
         & $($args[$i])
     }
 }

View File

@@ -5,7 +5,7 @@ package llm
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
-// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
+// #cgo windows,arm64 LDFLAGS: -lllama -lggml -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
 // #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
 // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
 // #include <stdlib.h>

View File

@@ -4,7 +4,10 @@ import (
"syscall" "syscall"
) )
const CREATE_DEFAULT_ERROR_MODE = 0x04000000 const (
CREATE_DEFAULT_ERROR_MODE = 0x04000000
ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
)
var LlamaServerSysProcAttr = &syscall.SysProcAttr{ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
// Wire up the default error handling logic If for some reason a DLL is // Wire up the default error handling logic If for some reason a DLL is
@@ -12,5 +15,8 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
 	// the user can either fix their PATH, or report a bug. Without this
 	// setting, the process exits immediately with a generic exit status but no
 	// way to (easily) figure out what the actual missing DLL was.
-	CreationFlags: CREATE_DEFAULT_ERROR_MODE,
+	//
+	// Setting Above Normal priority class ensures when running as a "background service"
+	// with "programs" given best priority, we aren't starved of cpu cycles
+	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
 }
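
The combined CreationFlags value above only matters once the SysProcAttr is attached to the subprocess that launches the runner. Below is a minimal, hedged sketch of that wiring for a Windows build target; startRunner is a hypothetical helper name (not part of this patch), and the constant values mirror the ones declared in the hunk.

// Hypothetical sketch (Windows-only): attaching a SysProcAttr with both flags
// to a subprocess started via os/exec.
package main

import (
	"os/exec"
	"syscall"
)

const (
	CREATE_DEFAULT_ERROR_MODE   = 0x04000000 // subprocess gets its own error mode, so missing-DLL dialogs surface
	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000 // scheduled ahead of normal-priority background work
)

func startRunner(path string, args ...string) (*exec.Cmd, error) {
	cmd := exec.Command(path, args...)
	// OR the flags into a single CreationFlags value, as the diff above does.
	cmd.SysProcAttr = &syscall.SysProcAttr{
		CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
	}
	return cmd, cmd.Start()
}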

View File

@@ -19,7 +19,7 @@ export default function () {
   const [step, setStep] = useState<Step>(Step.WELCOME)
   const [commandCopied, setCommandCopied] = useState<boolean>(false)
-  const command = 'ollama run llama3.1'
+  const command = 'ollama run llama3.2'
   return (
     <div className='drag'>

View File

@@ -7,12 +7,22 @@
$ErrorActionPreference = "Stop" $ErrorActionPreference = "Stop"
function checkEnv() { function checkEnv() {
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() if ($null -ne $env:ARCH ) {
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() $script:ARCH = $env:ARCH
} else {
$arch=([System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture)
if ($null -ne $arch) {
$script:ARCH = ($arch.ToString().ToLower()).Replace("x64", "amd64")
} else {
write-host "WARNING: old powershell detected, assuming amd64 architecture - set `$env:ARCH to override"
$script:ARCH="amd64"
}
}
$script:TARGET_ARCH=$script:ARCH
Write-host "Building for ${script:TARGET_ARCH}" Write-host "Building for ${script:TARGET_ARCH}"
write-host "Locating required tools and paths" write-host "Locating required tools and paths"
$script:SRC_DIR=$PWD $script:SRC_DIR=$PWD
if (!$env:VCToolsRedistDir) { if ($null -eq $env:VCToolsRedistDir) {
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
$env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
} }
@@ -28,9 +38,12 @@ function checkEnv() {
         $script:CUDA_DIRS=$cudaList
     }
-    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
-    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
+    $inoSetup=(get-item "C:\Program Files*\Inno Setup*\")
+    if ($inoSetup.length -gt 0) {
+        $script:INNO_SETUP_DIR=$inoSetup[0]
+    }
+    $script:DIST_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
     $env:CGO_ENABLED="1"
     Write-Output "Checking version"
     if (!$env:VERSION) {
@@ -67,7 +80,6 @@ function checkEnv() {
 function buildOllama() {
-    write-host "Building ollama CLI"
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
@@ -75,15 +87,16 @@ function buildOllama() {
         # which targets to build
         # Start by skipping CUDA to build everything else
-        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
+        write-host "Building ollama runners"
+        powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         # Then skip everyhting else and build all the CUDA variants
         foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
-            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
+            write-host "Building CUDA ${env:CUDA_LIB_DIR} runner"
             if ($env:CUDA_LIB_DIR.Contains("v12")) {
-                pwsh -Command {
+                powershell -Command {
                     $env:OLLAMA_SKIP_CUDA_GENERATE=""
                     $env:OLLAMA_SKIP_STATIC_GENERATE="1"
                     $env:OLLAMA_SKIP_CPU_GENERATE="1"
@@ -96,7 +109,7 @@ function buildOllama() {
                     & go generate ./...
                 }
             } else {
-                pwsh -Command {
+                powershell -Command {
                     $env:OLLAMA_SKIP_CUDA_GENERATE=""
                     $env:OLLAMA_SKIP_STATIC_GENERATE="1"
                     $env:OLLAMA_SKIP_CPU_GENERATE="1"
@@ -115,6 +128,7 @@ function buildOllama() {
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
     }
+    write-host "Building ollama CLI"
     & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ("${env:KEY_CONTAINER}") {
@@ -130,34 +144,50 @@ function buildApp() {
write-host "Building Ollama App" write-host "Building Ollama App"
cd "${script:SRC_DIR}\app" cd "${script:SRC_DIR}\app"
& windres -l 0 -o ollama.syso ollama.rc & windres -l 0 -o ollama.syso ollama.rc
& go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" . & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" -o "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ("${env:KEY_CONTAINER}") { if ("${env:KEY_CONTAINER}") {
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} app.exe /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
function gatherDependencies() { function gatherDependencies() {
write-host "Gathering runtime dependencies" if ($null -eq $env:VCToolsRedistDir) {
write-error "Unable to locate VC Install location - please use a Developer shell"
exit 1
}
write-host "Gathering runtime dependencies from $env:VCToolsRedistDir"
cd "${script:SRC_DIR}" cd "${script:SRC_DIR}"
md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null md "${script:DIST_DIR}\lib\ollama" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11 # currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" if ($script:TARGET_ARCH -eq "amd64") {
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" $depArch="x64"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" } else {
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { $depArch=$script:TARGET_ARCH
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" }
if ($depArch -eq "amd64") {
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
$llvmCrtDir="$env:VCToolsRedistDir\..\..\..\Tools\Llvm\${depArch}\bin"
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
write-host "cp ${llvmCrtDir}\api-ms-win-crt-${part}*.dll ${script:DIST_DIR}\lib\ollama\"
cp "${llvmCrtDir}\api-ms-win-crt-${part}*.dll" "${script:DIST_DIR}\lib\ollama\"
}
} else {
# Carying the dll's doesn't seem to work, so use the redist installer
copy-item -path "${env:VCToolsRedistDir}\vc_redist.arm64.exe" -destination "${script:DIST_DIR}" -verbose
} }
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") { if ("${env:KEY_CONTAINER}") {
write-host "about to sign" write-host "about to sign"
foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ foreach ($file in (get-childitem "${script:DIST_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file" write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
@@ -167,6 +197,10 @@ function gatherDependencies() {
 }
 function buildInstaller() {
+    if ($null -eq ${script:INNO_SETUP_DIR}) {
+        write-host "Inno Setup not present, skipping installer build"
+        return
+    }
     write-host "Building Ollama Installer"
     cd "${script:SRC_DIR}\app"
     $env:PKG_VERSION=$script:PKG_VERSION
@@ -183,13 +217,20 @@ function distZip() {
     Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
 }
+checkEnv
 try {
-    checkEnv
+    if ($($args.count) -eq 0) {
         buildOllama
         buildApp
         gatherDependencies
         buildInstaller
         distZip
+    } else {
+        for ( $i = 0; $i -lt $args.count; $i++ ) {
+            write-host "performing $($args[$i])"
+            & $($args[$i])
+        }
+    }
 } catch {
     write-host "Build Failed"
     write-host $_

View File

@@ -1025,6 +1025,8 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 	switch {
 	case resp.StatusCode == http.StatusUnauthorized:
+		resp.Body.Close()
 		// Handle authentication error with one retry
 		challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate"))
 		token, err := getAuthorizationToken(ctx, challenge)
@@ -1040,8 +1042,10 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 			}
 		}
 	case resp.StatusCode == http.StatusNotFound:
+		resp.Body.Close()
 		return nil, os.ErrNotExist
 	case resp.StatusCode >= http.StatusBadRequest:
+		defer resp.Body.Close()
 		responseBody, err := io.ReadAll(resp.Body)
 		if err != nil {
 			return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
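
The Body.Close() additions above follow the standard net/http rule: every branch that returns without consuming the response must close the body immediately, and branches that do read it should defer the close. A minimal, hedged sketch of the same pattern follows; fetchBlob and its URL handling are illustrative only, not ollama's API.

// Hypothetical sketch of per-branch response-body handling.
package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"os"
)

func fetchBlob(ctx context.Context, url string) ([]byte, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	switch {
	case resp.StatusCode == http.StatusNotFound:
		resp.Body.Close() // returning early without reading: close now or the connection leaks
		return nil, os.ErrNotExist
	case resp.StatusCode >= http.StatusBadRequest:
		defer resp.Body.Close() // body is read below, so defer the close
		msg, err := io.ReadAll(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("%d: %w", resp.StatusCode, err)
		}
		return nil, fmt.Errorf("%d: %s", resp.StatusCode, msg)
	}
	defer resp.Body.Close()
	return io.ReadAll(resp.Body)
}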

View File

@@ -354,7 +354,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 }
 func TestGetRunner(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	ctx, done := context.WithTimeout(context.Background(), 200*time.Millisecond)
 	defer done()
 	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
@@ -395,7 +395,7 @@ func TestGetRunner(t *testing.T) {
slog.Info("c") slog.Info("c")
successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration) successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
// Starts in pending channel, then should be quickly processsed to return an error // Starts in pending channel, then should be quickly processsed to return an error
time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
require.Empty(t, successCh1c) require.Empty(t, successCh1c)
s.loadedMu.Lock() s.loadedMu.Lock()
require.Empty(t, s.loaded) require.Empty(t, s.loaded)